http://blog.chinaunix.net/uid-26859697-id-4872933.html



前面已经分析了linux内存管理算法(伙伴管理算法)的准备工作。

具体的算法初始化则回到start_kernel()函数接着往下走,下一个函数是mm_init()



1. 【file:/init/main.c】
2. /*
3.  * Set up kernel memory allocators
4.  */
5. static void __init mm_init(void)
6. {
7.     /*
8.      * page_cgroup requires contiguous pages,
9.      * bigger than MAX_ORDER unless SPARSEMEM.
10.      */
11.     page_cgroup_init_flatmem();
12.     mem_init();
13.     kmem_cache_init();
14.     percpu_init_late();
15.     pgtable_init();
16.     vmalloc_init();
17. }
    乍看仅仅是几个函数的调用,实际上这里的事情远远没这么简单。其中page_cgroup_init_flatmem()与cgroup相关,而mem_init()则是伙伴管理算法的初始化,此外kmem_cache_init()是用于内核slub内存分配体系的初始化,而vmalloc_init()则是用于vmalloc的初始化。

    当前主要分析伙伴管理算法,则仅对<span style="-ms-word-wrap: break-word;">mem_init()</span>做专门的分析,其余的暂且后面再分析。

    伙伴管理算法的初始化函数入口是<span style="-ms-word-wrap: break-word;">mem_init()</span>,其实现:

<div class="codeText" id="codeText" style="background: rgb(255, 255, 255); font: 12px/normal Consolas, monospace; margin: 0px 0px 1.1em; padding: 0px; border: 1px solid rgb(221, 221, 221); border-image: none; width: 1252.19px; letter-spacing: 0.1px; overflow: auto; -ms-word-break: break-all; -ms-word-wrap: break-word; font-size-adjust: none; font-stretch: normal;">
  1. 【file:/arch/x86/mm/init_32.c】
  2. void __init mem_init(void)
  3. {
  4.     pci_iommu_alloc();
  5.  
  6. #ifdef CONFIG_FLATMEM
  7.     BUG_ON(!mem_map);
  8. #endif
  9.     /*
  10.      * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
  11.      * be done before free_all_bootmem(). Memblock use free low memory for
  12.      * temporary data (see find_range_array()) and for this purpose can use
  13.      * pages that was already passed to the buddy allocator, hence marked as
  14.      * not accessible in the page tables when compiled with
  15.      * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
  16.      * important here.
  17.      */
  18.     set_highmem_pages_init();
  19.  
  20.     /* this will put all low memory onto the freelists */
  21.     free_all_bootmem();
  22.  
  23.     after_bootmem = 1;
  24.  
  25.     mem_init_print_info(NULL);
  26.     printk(KERN_INFO "virtual kernel memory layout:\n"
  27.         " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
  28. #ifdef CONFIG_HIGHMEM
  29.         " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
  30. #endif
  31.         " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
  32.         " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
  33.         " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
  34.         " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
  35.         " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
  36.         FIXADDR_START, FIXADDR_TOP,
  37.         (FIXADDR_TOP - FIXADDR_START) >> 10,
  38.  
  39. #ifdef CONFIG_HIGHMEM
  40.         PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
  41.         (LAST_PKMAP*PAGE_SIZE) >> 10,
  42. #endif
  43.  
  44.         VMALLOC_START, VMALLOC_END,
  45.         (VMALLOC_END - VMALLOC_START) >> 20,
  46.  
  47.         (unsigned long)__va(0), (unsigned long)high_memory,
  48.         ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
  49.  
  50.         (unsigned long)&__init_begin, (unsigned long)&__init_end,
  51.         ((unsigned long)&__init_end -
  52.          (unsigned long)&__init_begin) >> 10,
  53.  
  54.         (unsigned long)&_etext, (unsigned long)&_edata,
  55.         ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
  56.  
  57.         (unsigned long)&_text, (unsigned long)&_etext,
  58.         ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
  59.  
  60.     /*
  61.      * Check boundaries twice: Some fundamental inconsistencies can
  62.      * be detected at build time already.
  63.      */
  64. #define __FIXADDR_TOP (-PAGE_SIZE)
  65. #ifdef CONFIG_HIGHMEM
  66.     BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
  67.     BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
  68. #endif
  69. #define high_memory (-128UL << 20)
  70.     BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
  71. #undef high_memory
  72. #undef __FIXADDR_TOP
  73. #ifdef CONFIG_RANDOMIZE_BASE
  74.     BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
  75. #endif
  76.  
  77. #ifdef CONFIG_HIGHMEM
  78.     BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
  79.     BUG_ON(VMALLOC_END > PKMAP_BASE);
  80. #endif
  81.     BUG_ON(VMALLOC_START >= VMALLOC_END);
  82.     BUG_ON((unsigned long)high_memory > VMALLOC_START);
  83.  
  84.     if (boot_cpu_data.wp_works_ok < 0)
  85.         test_wp_bit();
  86. }

其中pci_iommu_alloc()不是伙伴算法重点相关的函数,不过还是稍微记录一下:

  • 【file:/arch/x86/kernel/pci-dma.c】

  • void __init pci_iommu_alloc(void)
  • {
  •     struct iommu_table_entry *p;
  •  
  •     sort_iommu_table(iommu_table, iommu_table_end);
  •     check_iommu_entries(iommu_table, iommu_table_end);
  •  
  •     for (p = iommu_table; p < iommu_table_end; p++) {
  •         if (p && p->detect && p->detect() > 0) {
  •             p->flags |= IOMMU_DETECTED;
  •             if (p->early_init)
  •                 p->early_init();
  •             if (p->flags & IOMMU_FINISH_IF_DETECTED)
  •                 break;
  •         }
  •     }
  • }

  • 该函数主要是将iommu table先行排序检查,然后调用各个表项注册的函数进行初始化。

    而接着的set_highmem_pages_init()则是伙伴算法的开始:

  • 【file:/arch/x86/mm/highmem_32.c】

  • void __init set_highmem_pages_init(void)
  • {
  •     struct zone *zone;
  •     int nid;
  •  
  •     /*
  •      * Explicitly reset zone->managed_pages because set_highmem_pages_init()
  •      * is invoked before free_all_bootmem()
  •      */
  •     reset_all_zones_managed_pages();
  •     for_each_zone(zone) {
  •         unsigned long zone_start_pfn, zone_end_pfn;
  •  
  •         if (!is_highmem(zone))
  •             continue;
  •  
  •         zone_start_pfn = zone->zone_start_pfn;
  •         zone_end_pfn = zone_start_pfn + zone->spanned_pages;
  •  
  •         nid = zone_to_nid(zone);
  •         printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
  •                 zone->name, nid, zone_start_pfn, zone_end_pfn);
  •  
  •         add_highpages_with_active_regions(nid, zone_start_pfn,
  •                  zone_end_pfn);
  •     }
  • }

  • 该函数中reset_all_zones_managed_pages()主要是将所有的内存管理区zone的页面管理数据进行清0重置。而接下来的for_each_zone(zone)循环体结合is_highmem(zone)判断则是用于遍历查找出高端内存的管理区,对查找到的高端内存管理区则调用add_highpages_with_active_regions()将其释放添加至伙伴管理算法中。

    add_highpages_with_active_regions()具体实现:

  • 【file:/arch/x86/mm/init_32.c】

  • void __init add_highpages_with_active_regions(int nid,
  •              unsigned long start_pfn, unsigned long end_pfn)
  • {
  •     phys_addr_t start, end;
  •     u64 i;
  •  
  •     for_each_free_mem_range(i, nid, &start, &end, NULL) {
  •         unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
  •                         start_pfn, end_pfn);
  •         unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
  •                           start_pfn, end_pfn);
  •         for ( ; pfn < e_pfn; pfn++)
  •             if (pfn_valid(pfn))
  •                 free_highmem_page(pfn_to_page(pfn));
  •     }
  • }

  • 其中for_each_free_mem_range(i, nid, &start, &end, NULL)用于遍历查找memblock算法中空闲的空间区域,然后通过clamp_t()将该区域的起止页框号限定在[start_pfn, end_pfn]范围内(即只保留落在该管理区内的部分)。里面的for ( ; pfn < e_pfn; pfn++)则用于将空间区域的各页面通过free_highmem_page()进行释放处理,其中if (pfn_valid(pfn))用于判断页面的有效性,而pfn_to_page(pfn)则是将页框号转换为页面管理结构。

    进一步分析free_highmem_page()实现:

  • 【file:/mm/page_alloc.c】

  • void free_highmem_page(struct page *page)
  • {
  •     __free_reserved_page(page);
  •     totalram_pages++;
  •     page_zone(page)->managed_pages++;
  •     totalhigh_pages++;
  • }

  • 其中totalram_pages用于记录内存的总页面数,page_zone(page)->managed_pages则是记录管理区的管理页面数,totalhigh_pages则是记录高端内存的页面总数。
    
    具体看一下<span style="-ms-word-wrap: break-word;">__free_reserved_page()</span>:
    
  • 【file:/include/linux/mm.h】

  • /* Free the reserved page into the buddy system, so it gets managed. */
  • static inline void __free_reserved_page(struct page *page)
  • {
  •     ClearPageReserved(page);
  •     init_page_count(page);
  •     __free_page(page);
  • }

  • 其中<span style="-ms-word-wrap: break-word;">ClearPageReserved</span>定义在<span style="-ms-word-wrap: break-word;">/include/linux/page-flags.h</span>中:
    
    #define CLEARPAGEFLAG(uname, lname)                \
    static inline void ClearPage##uname(struct page *page)        \
            { clear_bit(PG_##lname, &page->flags); }
    
    用于清除页面的<span style="-ms-word-wrap: break-word;">flag</span>中的<span style="-ms-word-wrap: break-word;">reserved</span>标志位,表示页面属于动态内存。
    
    接着的init_page_count()则是将页面的_count引用计数设置为1,用于为__free_page()释放页面到内存管理算法中做准备。最后的__free_page(),该函数既用于伙伴管理算法的初始化,同时也是伙伴管理算法释放页面的操作函数。暂且搁置分析__free_page()的实现,后面再详细深入。
    
    接着回到<span style="-ms-word-wrap: break-word;">mem_init ()</span>里面下一个调用<span style="-ms-word-wrap: break-word;">free_all_bootmem()</span>:
    
  • 【file:/mm/nobootmem.c】

  • unsigned long __init free_all_bootmem(void)
  • {
  •     unsigned long pages;
  •  
  •     reset_all_zones_managed_pages();
  •  
  •     /*
  •      * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
  •      * because in some case like Node0 doesn't have RAM installed
  •      * low ram will be on Node1
  •      */
  •     pages = free_low_memory_core_early();
  •     totalram_pages += pages;
  •  
  •     return pages;
  • }

  •     其中reset_all_zones_managed_pages()是用于重置管理区zone结构中的managed_pages成员数据,着重分析一下free_low_memory_core_early()实现:

  • 【file:/mm/nobootmem.c】

  • static unsigned long __init free_low_memory_core_early(void)
  • {
  •     unsigned long count = 0;
  •     phys_addr_t start, end;
  •     u64 i;
  •  
  •     for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
  •         count += __free_memory_core(start, end);
  •  
  • #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
  •     {
  •         phys_addr_t size;
  •  
  •         /* Free memblock.reserved array if it was allocated */
  •         size = get_allocated_memblock_reserved_regions_info(&start);
  •         if (size)
  •             count += __free_memory_core(start, start + size);
  •  
  •         /* Free memblock.memory array if it was allocated */
  •         size = get_allocated_memblock_memory_regions_info(&start);
  •         if (size)
  •             count += __free_memory_core(start, start + size);
  •     }
  • #endif
  •  
  •     return count;
  • }

  • 该函数通过for_each_free_mem_range()遍历memblock算法中的空闲内存空间,并调用__free_memory_core()来释放;而后面的get_allocated_memblock_reserved_regions_info()和get_allocated_memblock_memory_regions_info()用于获取通过申请而得的memblock管理算法空间,然后释放,其中如果其算法管理空间是系统定义的memblock_reserved_init_regions和memblock_memory_init_regions则仍保留不予以释放。

    最后着重分析一下__free_memory_core()中用于释放页面的__free_pages_memory()的实现:

  • 【file:/mm/nobootmem.c】

  • static void __init __free_pages_memory(unsigned long start, unsigned long end)
  • {
  •     int order;
  •  
  •     while (start < end) {
  •         order = min(MAX_ORDER - 1UL, __ffs(start));
  •  
  •         while (start + (1UL << order) > end)
  •             order--;
  •  
  •         __free_pages_bootmem(pfn_to_page(start), order);
  •  
  •         start += (1UL << order);
  •     }
  • }

  •     其里面的__free_pages_bootmem()则:

  • 【file:/mm/page_alloc.c】

  • void __init __free_pages_bootmem(struct page *page, unsigned int order)
  • {
  •     unsigned int nr_pages = 1 << order;
  •     struct page *p = page;
  •     unsigned int loop;
  •  
  •     prefetchw(p);
  •     for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
  •         prefetchw(p + 1);
  •         __ClearPageReserved(p);
  •         set_page_count(p, 0);
  •     }
  •     __ClearPageReserved(p);
  •     set_page_count(p, 0);
  •  
  •     page_zone(page)->managed_pages += nr_pages;
  •     set_page_refcounted(page);
  •     __free_pages(page, order);
  • }

  •    由此可以看到,其最终调用的还是__free_pages()将页面予以释放。该函数在后面集中进行分析。

       至此,伙伴管理算法初始化完毕。