http://blog.chinaunix.net/uid-26859697-id-4894349.html

The previous post analyzed how the buddy allocator frees memory; this one looks at how the buddy allocator allocates it.

  As with the free path, there is no single, sharply defined entry point for buddy-allocator allocation; no clear boundary says "this function is the entry and that one is not". So, as usual, the analysis starts from a point slightly higher up the call chain, and the alloc_pages() macro is chosen as the way in:


【file:/include/linux/gfp.h】
#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)
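
For context, here is a minimal caller-side sketch of how this macro is typically used; the GFP flags and the order chosen here are just an example, not something taken from this article:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Example only: allocate a block of 2^2 = 4 contiguous pages from a
 * context that is allowed to sleep, then free it again. */
static void alloc_pages_example(void)
{
    struct page *page;

    page = alloc_pages(GFP_KERNEL, 2);   /* order-2 allocation */
    if (!page)
        return;                          /* out of memory */

    /* ... use the pages, e.g. via page_address(page) ... */

    __free_pages(page, 2);               /* order must match the allocation */
}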
The implementation of alloc_pages_node():


【file:/include/linux/gfp.h】
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                        unsigned int order)
{
    /* Unknown node is current node */
    if (nid < 0)
        nid = numa_node_id();

    return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
When the caller does not specify a NUMA node for the allocation (nid < 0), the current node is chosen by default. The work then continues in __alloc_pages(), whose third argument, produced by node_zonelist(), is the zone list (zonelist) of the selected node; a rough sketch of node_zonelist() is shown below. After that comes the implementation of __alloc_pages():
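
node_zonelist() itself is tiny; roughly, in this kernel generation it looks like the snippet below (shown here only for reference, check include/linux/gfp.h of the exact version if precision matters). With CONFIG_NUMA and __GFP_THISNODE it selects the node-local zonelist, otherwise the node's ordinary fallback zonelist:

/* Reference sketch of gfp_zonelist()/node_zonelist(); consult the
 * kernel's include/linux/gfp.h for the authoritative version. */
static inline int gfp_zonelist(gfp_t flags)
{
    /* zonelist 1 contains only the node's own zones (__GFP_THISNODE) */
    if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
        return 1;

    /* zonelist 0 is the normal fallback list covering all nodes */
    return 0;
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
    return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}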


【file:/include/linux/gfp.h】
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist)
{
    return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
It is just a thin wrapper around __alloc_pages_nodemask(). The implementation of __alloc_pages_nodemask():


【file:/mm/page_alloc.c】
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zone *preferred_zone;
    struct page *page = NULL;
    int migratetype = allocflags_to_migratetype(gfp_mask);
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
    struct mem_cgroup *memcg = NULL;

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    /*
     * Will only have any effect when __GFP_KMEMCG is set. This is
     * verified in the (always inline) callee
     */
    if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
        return NULL;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx,
                nodemask ? : &cpuset_current_mems_allowed,
                &preferred_zone);
    if (!preferred_zone)
        goto out;

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
retry:
    /* First allocation attempt */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * The first pass makes sure allocations are spread
         * fairly within the local node. However, the local
         * node might have free pages left after the fairness
         * batches are exhausted, and remote zones haven't
         * even been considered yet. Try once more without
         * fairness, and include remote zones now, before
         * entering the slowpath and waking kswapd: prefer
         * spilling to a remote zone over swapping locally.
         */
        if (alloc_flags & ALLOC_FAIR) {
            reset_alloc_batches(zonelist, high_zoneidx,
                        preferred_zone);
            alloc_flags &= ~ALLOC_FAIR;
            goto retry;
        }
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);

    return page;
}
This is the heart of the buddy allocator; after all the wrappers, we have finally arrived.

A few notes on the helpers used here. lockdep_trace_alloc() only does real work when both CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are configured; otherwise it is an empty function. If the caller's gfp_mask carries __GFP_WAIT, meaning the allocation is allowed to sleep, might_sleep_if() checks whether the task should sleep and be rescheduled. With CONFIG_FAIL_PAGE_ALLOC not set, should_fail_alloc_page() always returns false. The check if (unlikely(!zonelist->_zonerefs->zone)) verifies that the zonelist being allocated from contains at least one valid zone. memcg_kmem_newpage_charge() and memcg_kmem_commit_charge() are related to control groups (cgroups). get_mems_allowed() wraps read_seqcount_begin() to take a snapshot of the sequence counter protecting the shared cpuset state, so that a read racing with a concurrent update can be detected and retried instead of failing; its counterpart is put_mems_allowed(), and a small sketch of this seqcount read-retry pattern follows below. first_zones_zonelist() uses the nodemask to find a suitable preferred_zone whose zone index does not exceed high_zoneidx. Finally, allocflags_to_migratetype() converts the GFP flags into the corresponding migrate type.
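
To make the get_mems_allowed()/put_mems_allowed() pairing concrete, here is a minimal sketch of the underlying seqcount read-retry pattern. Only read_seqcount_begin() and read_seqcount_retry() are real primitives from <linux/seqlock.h>; the function name and its parameter are made up for illustration:

#include <linux/seqlock.h>

/* Hypothetical reader of some seqcount-protected shared data,
 * illustrating the pattern behind get_mems_allowed()/put_mems_allowed(). */
static void read_shared_state(seqcount_t *sc)
{
    unsigned int start;

    do {
        /* take a snapshot of the sequence counter before reading */
        start = read_seqcount_begin(sc);

        /* ... read the protected shared data here ... */

        /* if a writer updated the data in the meantime, the counter
         * has changed and the read is retried */
    } while (read_seqcount_retry(sc, start));
}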

The key functions with which __alloc_pages_nodemask() actually allocates pages are get_page_from_freelist() and __alloc_pages_slowpath(). get_page_from_freelist() is tried first; only if that attempt fails is __alloc_pages_slowpath() called. The slow path is allowed to wait and to trigger memory reclaim. Since __alloc_pages_slowpath() involves other memory-management mechanisms, it is not analyzed in depth here.

So the last function to examine is get_page_from_freelist():


【file:/mm/page_alloc.c】
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;     /* set if using zonelist_cache */
    int did_zlc_setup = 0;  /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {
        unsigned long mark;

        if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;
        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))
                continue;
        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
            goto try_this_zone;
        /*
         * Distribute pages in proportion to the individual
         * zone size to ensure fair page aging. The zone a
         * page was allocated in should have no effect on the
         * time the page has in memory before being reclaimed.
         */
        if (alloc_flags & ALLOC_FAIR) {
            if (!zone_local(preferred_zone, zone))
                continue;
            if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                continue;
        }
        /*
         * When allocating a page cache page for writing, we
         * want to get it from a zone that is within its dirty
         * limit, such that no single zone holds more than its
         * proportional share of globally allowed dirty pages.
         * The dirty limits take into account the zone's
         * lowmem reserves and high watermark so that kswapd
         * should be able to balance it without having to
         * write pages from its LRU list.
         *
         * This may look like it could increase pressure on
         * lower zones by failing allocations in higher zones
         * before they are full. But the pages that do spill
         * over are limited as the lower zones are protected
         * by this very same mechanism. It should not become
         * a practical burden to them.
         *
         * XXX: For now, allow allocations to potentially
         * exceed the per-zone dirty limit in the slowpath
         * (ALLOC_WMARK_LOW unset) before going into reclaim,
         * which is important when on a NUMA setup the allowed
         * zones are together not big enough to reach the
         * global limit. The proper fix for these situations
         * will require awareness of zones in the
         * dirty-throttling and the flusher threads.
         */
        if ((alloc_flags & ALLOC_WMARK_LOW) &&
            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
            goto this_zone_full;

        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        if (!zone_watermark_ok(zone, order, mark,
                       classzone_idx, alloc_flags)) {
            int ret;

            if (IS_ENABLED(CONFIG_NUMA) &&
                    !did_zlc_setup && nr_online_nodes > 1) {
                /*
                 * we do zlc_setup if there are multiple nodes
                 * and before considering the first zone allowed
                 * by the cpuset.
                 */
                allowednodes = zlc_setup(zonelist, alloc_flags);
                zlc_active = 1;
                did_zlc_setup = 1;
            }

            if (zone_reclaim_mode == 0 ||
                !zone_allows_reclaim(preferred_zone, zone))
                goto this_zone_full;

            /*
             * As we may have just activated ZLC, check if the first
             * eligible zone has failed zone_reclaim recently.
             */
            if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

try_this_zone:
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA))
            zlc_mark_zone_full(zonelist, z);
    }

    if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }

    if (page)
        /*
         * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
         * necessary to allocate the page. The expectation is
         * that the caller is taking steps that will free more
         * memory. The caller should avoid the page being used
         * for !PFMEMALLOC purposes.
         */
        page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

    return page;
}
This function walks the zonelist and tries each zone in turn. for_each_zone_zonelist_nodemask() iterates over the zones in the zonelist; before each zone is tried, the code checks whether the zone still has allocatable memory, whether alloc_flags allow the current CPU (cpuset) to allocate from this zone, and whether the zone passes the watermark check, i.e. whether its free memory is sufficient (a simplified sketch of that check follows below). These checks will be covered in detail later; for now the focus stays on the buddy algorithm itself.
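
To give a rough idea of what the watermark check is about, below is a simplified sketch of the logic behind zone_watermark_ok(). It is deliberately not a copy of the kernel function (which also handles ALLOC_HIGH/ALLOC_HARDER and other details): the zone must keep at least the watermark plus the lowmem reserve free, and the free memory must not consist only of blocks smaller than the requested order.

#include <linux/mmzone.h>
#include <linux/vmstat.h>

/* Simplified, illustrative version of the watermark test; the real
 * check lives in __zone_watermark_ok() in mm/page_alloc.c. */
static bool watermark_ok_sketch(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx)
{
    long free = zone_page_state(z, NR_FREE_PAGES);
    unsigned int o;

    /* keep 'mark' pages plus the lowmem reserve untouched */
    if (free <= (long)(mark + z->lowmem_reserve[classzone_idx]))
        return false;

    /* drop the free pages of lower orders from the count and demand
     * progressively less headroom as the order rises */
    for (o = 0; o < order; o++) {
        free -= z->free_area[o].nr_free << o;
        mark >>= 1;
        if (free <= (long)mark)
            return false;
    }
    return true;
}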

It is not hard to see that the function that really hands out the pages is buffered_rmqueue(); its implementation:


【file:/mm/page_alloc.c】
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD);

again:
    if (likely(order == 0)) {
        struct per_cpu_pages *pcp;
        struct list_head *list;

        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];
        if (list_empty(list)) {
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_pageblock_migratetype(page));
    }

    __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON_PAGE(bad_range(zone, page), page);
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}
If likely(order == 0), i.e. only a single page is requested, the allocation first tries the per-CPU hot/cold page lists; if the list for the requested migrate type is empty, rmqueue_bulk() refills it from the buddy lists and the page is then taken from the per-CPU list (the layout of these per-CPU lists is sketched below). Requests for more than one page bypass the per-CPU cache and go straight to the buddy system through __rmqueue().
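
For reference, the per-CPU cache that the order-0 path draws from is described by struct per_cpu_pages; in this kernel generation it looks roughly like the snippet below (see include/linux/mmzone.h for the authoritative definition):

/* Rough shape of the per-CPU page lists used by the order-0 fast path;
 * check include/linux/mmzone.h of the exact kernel version. */
struct per_cpu_pages {
    int count;      /* number of pages currently on the lists */
    int high;       /* high watermark: drain back to the buddy when exceeded */
    int batch;      /* chunk size for refilling from / draining to the buddy */

    /* one list per migrate type kept on the per-CPU lists */
    struct list_head lists[MIGRATE_PCPTYPES];
};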

The implementation of __rmqueue():


【file:/mm/page_alloc.c】
/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    page = __rmqueue_smallest(zone, order, migratetype);

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
        page = __rmqueue_fallback(zone, order, migratetype);

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */
        if (!page) {
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve;
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}
There are two key functions inside it: __rmqueue_smallest() and __rmqueue_fallback().

Let's look at __rmqueue_smallest() first:


【file:/mm/page_alloc.c】
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype]))
            continue;

        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        rmv_page_order(page);
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);
        return page;
    }

    return NULL;
}
This function implements the core of the allocation algorithm. The for() loop starts at the requested order: if the free list of that order (for the given migrate type) is not empty, a block is simply taken off it with list_del() and returned. If the list is empty, the search moves up to the next higher order, and so on; if even the highest order has an empty list, the allocation fails. Once a non-empty list is found at a higher order, the free block is removed from it with list_del() and then split by expand(): at each step the block is halved and one half is linked into the free list one order lower, until the remaining block is exactly the requested order (a sketch of this split follows below). That block is returned, and the page allocation is complete.
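
To illustrate the split step, here is a simplified sketch of what expand() does; the debug and guard-page handling present in the real mm/page_alloc.c function is omitted:

/* Sketch of expand(): split a block found at order 'high' down to the
 * requested order 'low', returning the upper half of each split to the
 * free list one order below. */
static inline void expand_sketch(struct zone *zone, struct page *page,
                int low, int high, struct free_area *area,
                int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;         /* free_area of the next lower order */
        high--;
        size >>= 1;

        /* hand the upper half back to the lower-order free list */
        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}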

As for __rmqueue_fallback():


【file:/mm/page_alloc.c】
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area *area;
    int current_order;
    struct page *page;
    int migratetype, new_type, i;

    /* Find the largest possible block of pages in the other list */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) {
        for (i = 0;; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE)
                break;

            area = &(zone->free_area[current_order]);
            if (list_empty(&area->free_list[migratetype]))
                continue;

            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru);
            area->nr_free--;

            new_type = try_to_steal_freepages(zone, page,
                              start_migratetype,
                              migratetype);

            /* Remove the page from the freelists */
            list_del(&page->lru);
            rmv_page_order(page);

            expand(zone, page, order, current_order, area,
                   new_type);

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype, new_type);

            return page;
        }
    }

    return NULL;
}
Its job is to take memory from other migrate types, following the fallbacks[] table (an illustrative version of that table is sketched below). Unlike the normal buddy search, it scans from the highest order downward: stealing one large block from another migrate type introduces less fragmentation than stealing many small ones. If every fallback fails, pages are ultimately taken from the MIGRATE_RESERVE list. This part is not explored further here and will be analyzed in detail later.
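
For orientation, the fallback order is driven by a static fallbacks[] table in mm/page_alloc.c. The sketch below shows the general shape of that table without the CONFIG_CMA rows; treat the exact entries as an approximation, since they vary between kernel versions:

/* Approximate shape of the fallback table (CONFIG_CMA rows omitted);
 * each row lists, in order, the migrate types to steal from when the
 * row's own type has no free block, ending at MIGRATE_RESERVE. */
static int fallbacks_sketch[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },   /* never used as a start type */
};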

And that is it: the allocation side of the buddy allocator is, for now, fully covered.