前面已经分析了slub分配算法的初始化及slab资源池的创建,现在进一步分析一下slub分配算法的分配实现。

kmem_cache_alloc()是申请<span style="-ms-word-wrap: break-word;">slab</span>对象的入口函数,其实现:


1. 【file:/mm/slub.c】
2. void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
3. {
4.     void *ret = slab_alloc(s, gfpflags, _RET_IP_);
5.  
6.     trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
7.                 s->size, gfpflags);
8.  
9.     return ret;
10. }
&nbsp;

该函数主要是通过<span style="-ms-word-wrap: break-word;">slab_alloc()</span>来分配对象,而<span style="-ms-word-wrap: break-word;">trace_kmem_cache_alloc()</span>则是用于记录<span style="-ms-word-wrap: break-word;">slab</span>分配轨迹。<span style="-ms-word-wrap: break-word;">&nbsp;&nbsp;&nbsp; </span>那么接下来分析<span style="-ms-word-wrap: break-word;">slab_alloc()</span>的实现:


1. 【file:/mm/slub.c】
2. static __always_inline void *slab_alloc(struct kmem_cache *s,
3.         gfp_t gfpflags, unsigned long addr)
4. {
5.     return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
6. }
&nbsp;

&nbsp;&nbsp;&nbsp; 该函数封装了<span style="-ms-word-wrap: break-word;">slab_alloc_node()</span>:


1. 【file:/mm/slub.c】
2. /*
3.  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
4.  * have the fastpath folded into their functions. So no function call
5.  * overhead for requests that can be satisfied on the fastpath.
6.  *
7.  * The fastpath works by first checking if the lockless freelist can be used.
8.  * If not then __slab_alloc is called for slow processing.
9.  *
10.  * Otherwise we can simply pick the next object from the lockless free list.
11.  */
12. static __always_inline void *slab_alloc_node(struct kmem_cache *s,
13.         gfp_t gfpflags, int node, unsigned long addr)
14. {
15.     void **object;
16.     struct kmem_cache_cpu *c;
17.     struct page *page;
18.     unsigned long tid;
19.  
20.     if (slab_pre_alloc_hook(s, gfpflags))
21.         return NULL;
22.  
23.     s = memcg_kmem_get_cache(s, gfpflags);
24. redo:
25.     /*
26.      * Must read kmem_cache cpu data via this cpu ptr. Preemption is
27.      * enabled. We may switch back and forth between cpus while
28.      * reading from one cpu area. That does not matter as long
29.      * as we end up on the original cpu again when doing the cmpxchg.
30.      *
31.      * Preemption is disabled for the retrieval of the tid because that
32.      * must occur from the current processor. We cannot allow rescheduling
33.      * on a different processor between the determination of the pointer
34.      * and the retrieval of the tid.
35.      */
36.     preempt_disable();
37.     c = __this_cpu_ptr(s->cpu_slab);
38.  
39.     /*
40.      * The transaction ids are globally unique per cpu and per operation on
41.      * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
42.      * occurs on the right processor and that there was no operation on the
43.      * linked list in between.
44.      */
45.     tid = c->tid;
46.     preempt_enable();
47.  
48.     object = c->freelist;
49.     page = c->page;
50.     if (unlikely(!object || !node_match(page, node)))
51.         object = __slab_alloc(s, gfpflags, node, addr, c);
52.  
53.     else {
54.         void *next_object = get_freepointer_safe(s, object);
55.  
56.         /*
57.          * The cmpxchg will only match if there was no additional
58.          * operation and if we are on the right processor.
59.          *
60.          * The cmpxchg does the following atomically (without lock
61.          * semantics!)
62.          * 1. Relocate first pointer to the current per cpu area.
63.          * 2. Verify that tid and freelist have not been changed
64.          * 3. If they were not changed replace tid and freelist
65.          *
66.          * Since this is without lock semantics the protection is only
67.          * against code executing on this cpu *not* from access by
68.          * other cpus.
69.          */
70.         if (unlikely(!this_cpu_cmpxchg_double(
71.                 s->cpu_slab->freelist, s->cpu_slab->tid,
72.                 object, tid,
73.                 next_object, next_tid(tid)))) {
74.  
75.             note_cmpxchg_failure("slab_alloc", s, tid);
76.             goto redo;
77.         }
78.         prefetch_freepointer(s, next_object);
79.         stat(s, ALLOC_FASTPATH);
80.     }
81.  
82.     if (unlikely(gfpflags & __GFP_ZERO) && object)
83.         memset(object, 0, s->object_size);
84.  
85.     slab_post_alloc_hook(s, gfpflags, object);
86.  
87.     return object;
88. }
&nbsp;

在开启了<span style="-ms-word-wrap: break-word;">CONFIG_SLUB_DEBUG</span>配置的情况下,将会经由<span style="-ms-word-wrap: break-word;">slab_pre_alloc_hook()</span>对<span style="-ms-word-wrap: break-word;">slub</span>分配进行预处理,确保申请<span style="-ms-word-wrap: break-word;">OK</span>;接着在开启了<span style="-ms-word-wrap: break-word;">CONFIG_MEMCG_KMEM</span>配置的情况下,将会通过<span style="-ms-word-wrap: break-word;">memcg_kmem_get_cache()</span>将<span style="-ms-word-wrap: break-word;">kmem_cache</span>结构指针转换为<span style="-ms-word-wrap: break-word;">memcg</span>(<span style="-ms-word-wrap: break-word;">memory cgroup</span>)组的<span style="-ms-word-wrap: break-word;">kmem_cache</span>指针。

接下来的<span style="-ms-word-wrap: break-word;">slab</span>分配,将会通过<span style="-ms-word-wrap: break-word;">preempt_disable()</span>先行禁止内核抢占,继而通过<span style="-ms-word-wrap: break-word;">__this_cpu_ptr()</span>获取当前<span style="-ms-word-wrap: break-word;">CPU</span>的<span style="-ms-word-wrap: break-word;">kmem_cache_cpu</span>结构,随后取得<span style="-ms-word-wrap: break-word;">kmem_cache_cpu</span>的<span style="-ms-word-wrap: break-word;">tid</span>值后<span style="-ms-word-wrap: break-word;">preempt_enable()</span>使能内核抢占。

往下的<span style="-ms-word-wrap: break-word;">if (unlikely(!object || !node_match(page, node)))</span>判断当前<span style="-ms-word-wrap: break-word;">CPU</span>的<span style="-ms-word-wrap: break-word;">slab</span>空闲列表是否为空或者当前<span style="-ms-word-wrap: break-word;">slab</span>使用内存页面与管理节点是否不匹配。如果其中任一条件成立(空闲列表为空,或者页面与节点不匹配),则将通过<span style="-ms-word-wrap: break-word;">__slab_alloc()</span>走慢路径进行<span style="-ms-word-wrap: break-word;">slab</span>分配;否则将进入<span style="-ms-word-wrap: break-word;">else</span>分支走快速路径进行分配操作。

__slab_alloc()的分配稍后分析,现在看一下<span style="-ms-word-wrap: break-word;">else</span>分支的动作。其先经<span style="-ms-word-wrap: break-word;">get_freepointer_safe()</span>取得当前空闲对象之后的下一个空闲对象地址,接着使用<span style="-ms-word-wrap: break-word;">this_cpu_cmpxchg_double()</span>原子指令操作取得该空闲对象,如果获取成功将使用<span style="-ms-word-wrap: break-word;">prefetch_freepointer()</span>预取下一个空闲对象的数据,否则将经<span style="-ms-word-wrap: break-word;">note_cmpxchg_failure()</span>记录日志后重回<span style="-ms-word-wrap: break-word;">redo</span>标签再次尝试分配。这里面的关键是<span style="-ms-word-wrap: break-word;">this_cpu_cmpxchg_double()</span>原子指令操作。该原子操作主要做了三件事情:<span style="-ms-word-wrap: break-word;">1</span>)重定向首指针指向当前<span style="-ms-word-wrap: break-word;">CPU</span>的空间;<span style="-ms-word-wrap: break-word;">2</span>)判断<span style="-ms-word-wrap: break-word;">tid</span>和<span style="-ms-word-wrap: break-word;">freelist</span>未被修改;<span style="-ms-word-wrap: break-word;">3</span>)如果未被修改,也就是相等,确信此次<span style="-ms-word-wrap: break-word;">slab</span>分配未被<span style="-ms-word-wrap: break-word;">CPU</span>迁移,接着将新的<span style="-ms-word-wrap: break-word;">tid</span>和<span style="-ms-word-wrap: break-word;">freelist</span>数据覆盖过去以更新。

具体将<span style="-ms-word-wrap: break-word;">this_cpu_cmpxchg_double()</span>的功能展开用<span style="-ms-word-wrap: break-word;">C</span>语言表述就是<span style="-ms-word-wrap: break-word;">:</span>

if ((__this_cpu_ptr(s-&gt;cpu_slab-&gt;freelist) == object) &amp;&amp; (__this_cpu_ptr(s-&gt;cpu_slab-&gt;tid) == tid))

{

&nbsp;&nbsp;&nbsp; __this_cpu_ptr(s-&gt;cpu_slab-&gt;freelist) = next_object;

&nbsp;&nbsp;&nbsp; __this_cpu_ptr(s-&gt;cpu_slab-&gt;tid) = next_tid(tid);

&nbsp;&nbsp;&nbsp; return true;

}

else

{

&nbsp;&nbsp;&nbsp; return false;

}

这里使用原子操作,其通过单指令方式实现完以上功能免除了加锁解锁操作,且完全避免了多核的情况下<span style="-ms-word-wrap: break-word;">CPU</span>迁移锁资源等待所带来的性能开销,极大地提升了效率。这在通常的程序开发中消除性能瓶颈也是极佳的手段。

分配到对象后,如果申请标志中带有<span style="-ms-word-wrap: break-word;">__GFP_ZERO</span>,将会对该对象进行清零操作,然后经由<span style="-ms-word-wrap: break-word;">slab_post_alloc_hook()</span>进行对象分配后处理。

&nbsp;&nbsp;&nbsp; 最后回来看一下<span style="-ms-word-wrap: break-word;">__slab_alloc()</span>函数实现:


1. 【file:/mm/slub.c】
2. /*
3.  * Slow path. The lockless freelist is empty or we need to perform
4.  * debugging duties.
5.  *
6.  * Processing is still very fast if new objects have been freed to the
7.  * regular freelist. In that case we simply take over the regular freelist
8.  * as the lockless freelist and zap the regular freelist.
9.  *
10.  * If that is not working then we fall back to the partial lists. We take the
11.  * first element of the freelist as the object to allocate now and move the
12.  * rest of the freelist to the lockless freelist.
13.  *
14.  * And if we were unable to get a new slab from the partial slab lists then
15.  * we need to allocate a new slab. This is the slowest path since it involves
16.  * a call to the page allocator and the setup of a new slab.
17.  */
18. static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
19.               unsigned long addr, struct kmem_cache_cpu *c)
20. {
21.     void *freelist;
22.     struct page *page;
23.     unsigned long flags;
24.  
25.     local_irq_save(flags);
26. #ifdef CONFIG_PREEMPT
27.     /*
28.      * We may have been preempted and rescheduled on a different
29.      * cpu before disabling interrupts. Need to reload cpu area
30.      * pointer.
31.      */
32.     c = this_cpu_ptr(s->cpu_slab);
33. #endif
34.  
35.     page = c->page;
36.     if (!page)
37.         goto new_slab;
38. redo:
39.  
40.     if (unlikely(!node_match(page, node))) {
41.         stat(s, ALLOC_NODE_MISMATCH);
42.         deactivate_slab(s, page, c->freelist);
43.         c->page = NULL;
44.         c->freelist = NULL;
45.         goto new_slab;
46.     }
47.  
48.     /*
49.      * By rights, we should be searching for a slab page that was
50.      * PFMEMALLOC but right now, we are losing the pfmemalloc
51.      * information when the page leaves the per-cpu allocator
52.      */
53.     if (unlikely(!pfmemalloc_match(page, gfpflags))) {
54.         deactivate_slab(s, page, c->freelist);
55.         c->page = NULL;
56.         c->freelist = NULL;
57.         goto new_slab;
58.     }
59.  
60.     /* must check again c->freelist in case of cpu migration or IRQ */
61.     freelist = c->freelist;
62.     if (freelist)
63.         goto load_freelist;
64.  
65.     stat(s, ALLOC_SLOWPATH);
66.  
67.     freelist = get_freelist(s, page);
68.  
69.     if (!freelist) {
70.         c->page = NULL;
71.         stat(s, DEACTIVATE_BYPASS);
72.         goto new_slab;
73.     }
74.  
75.     stat(s, ALLOC_REFILL);
76.  
77. load_freelist:
78.     /*
79.      * freelist is pointing to the list of objects to be used.
80.      * page is pointing to the page from which the objects are obtained.
81.      * That page must be frozen for per cpu allocations to work.
82.      */
83.     VM_BUG_ON(!c->page->frozen);
84.     c->freelist = get_freepointer(s, freelist);
85.     c->tid = next_tid(c->tid);
86.     local_irq_restore(flags);
87.     return freelist;
88.  
89. new_slab:
90.  
91.     if (c->partial) {
92.         page = c->page = c->partial;
93.         c->partial = page->next;
94.         stat(s, CPU_PARTIAL_ALLOC);
95.         c->freelist = NULL;
96.         goto redo;
97.     }
98.  
99.     freelist = new_slab_objects(s, gfpflags, node, &c);
100.  
101.     if (unlikely(!freelist)) {
102.         if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
103.             slab_out_of_memory(s, gfpflags, node);
104.  
105.         local_irq_restore(flags);
106.         return NULL;
107.     }
108.  
109.     page = c->page;
110.     if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
111.         goto load_freelist;
112.  
113.     /* Only entered in the debug case */
114.     if (kmem_cache_debug(s) &&
115.             !alloc_debug_processing(s, page, freelist, addr))
116.         goto new_slab; /* Slab failed checks. Next slab needed */
117.  
118.     deactivate_slab(s, page, get_freepointer(s, freelist));
119.     c->page = NULL;
120.     c->freelist = NULL;
121.     local_irq_restore(flags);
122.     return freelist;
123. }
&nbsp;

__slab_alloc()是<span style="-ms-word-wrap: break-word;">slab</span>申请的慢路径,这是由于<span style="-ms-word-wrap: break-word;">freelist</span>是空的或者需要执行调试任务。

该函数会先行<span style="-ms-word-wrap: break-word;">local_irq_save()</span>禁止本地处理器的中断并且记住它们之前的状态。如果配置<span style="-ms-word-wrap: break-word;">CONFIG_PREEMPT</span>了,为了避免因调度切换到不同的<span style="-ms-word-wrap: break-word;">CPU</span>,该函数会重新通过<span style="-ms-word-wrap: break-word;">this_cpu_ptr()</span>获取<span style="-ms-word-wrap: break-word;">CPU</span>域的指针;如果<span style="-ms-word-wrap: break-word;">c-&gt;page</span>为空,也就是<span style="-ms-word-wrap: break-word;">cpu local slab</span>不存在就经由<span style="-ms-word-wrap: break-word;">new_slab</span>分支新分配一个。

当<span style="-ms-word-wrap: break-word;">c-&gt;page</span>不为空的情况下,会经<span style="-ms-word-wrap: break-word;">node_match()</span>判断页面与节点是否匹配,如果节点不匹配就通过<span style="-ms-word-wrap: break-word;">deactivate_slab()</span>去激活<span style="-ms-word-wrap: break-word;">cpu</span>本地<span style="-ms-word-wrap: break-word;">slab</span>;再然后通过<span style="-ms-word-wrap: break-word;">pfmemalloc_match()</span>判断当前页面的<span style="-ms-word-wrap: break-word;">pfmemalloc</span>属性与申请标志<span style="-ms-word-wrap: break-word;">gfpflags</span>是否匹配,如果不匹配则同样去激活。

接着会再次检查空闲对象指针<span style="-ms-word-wrap: break-word;">freelist</span>是否为空,避免在禁止本地处理器中断前因发生了<span style="-ms-word-wrap: break-word;">CPU</span>迁移或者中断,导致本地的空闲对象指针不为空。如果不为空的情况下,将会跳转至<span style="-ms-word-wrap: break-word;">load_freelist</span>,这里将会把对象从空闲队列中取出,并更新数据信息,然后恢复中断使能,返回对象地址。如果为空,将会更新慢路径申请对象的统计信息,并通过<span style="-ms-word-wrap: break-word;">get_freelist()</span>从页面中获取空闲队列。<span style="-ms-word-wrap: break-word;">if (!freelist)</span>表示获取空闲队列失败,此时则需要创建新的<span style="-ms-word-wrap: break-word;">slab</span>,否则更新统计信息进入<span style="-ms-word-wrap: break-word;">load_freelist</span>分支取得对象并返回。

最后看一下该函数的<span style="-ms-word-wrap: break-word;">new_slab</span>分支的实现,首先会<span style="-ms-word-wrap: break-word;">if (c-&gt;partial)</span>判断<span style="-ms-word-wrap: break-word;">partial</span>是否为空,不为空则从<span style="-ms-word-wrap: break-word;">partial</span>中取出,然后跳转回<span style="-ms-word-wrap: break-word;">redo</span>重试分配。如果<span style="-ms-word-wrap: break-word;">partial</span>为空,意味着当前<span style="-ms-word-wrap: break-word;">CPU</span>的<span style="-ms-word-wrap: break-word;">partial</span>链表上已没有可用的<span style="-ms-word-wrap: break-word;">slab</span>,那么则需使用<span style="-ms-word-wrap: break-word;">new_slab_objects()</span>获取新的<span style="-ms-word-wrap: break-word;">slab</span>。如果获取失败,那么将经<span style="-ms-word-wrap: break-word;">if (!(gfpflags &amp; __GFP_NOWARN) &amp;&amp; printk_ratelimit())</span>判断,若申请标志未配置为无告警,并且送往控制台的消息数量在临界值内,则调用<span style="-ms-word-wrap: break-word;">slab_out_of_memory()</span>记录日志;随后恢复中断并返回<span style="-ms-word-wrap: break-word;">NULL</span>表示申请失败。否则将会<span style="-ms-word-wrap: break-word;">if (likely(!kmem_cache_debug(s) &amp;&amp; pfmemalloc_match(page, gfpflags)))</span>判断是否未开启调试且页面属性匹配<span style="-ms-word-wrap: break-word;">pfmemalloc</span>,是则跳转至<span style="-ms-word-wrap: break-word;">load_freelist</span>分支进行<span style="-ms-word-wrap: break-word;">slab</span>对象分配;否则将会经<span style="-ms-word-wrap: break-word;">if (kmem_cache_debug(s) &amp;&amp; !alloc_debug_processing(s, page, freelist, addr))</span><span style="-ms-word-wrap: break-word;"> </span>判断,若开启调试并且调试检查失败,则跳转回<span style="-ms-word-wrap: break-word;">new_slab</span>重新获取<span style="-ms-word-wrap: break-word;">slab</span>。如果开启调试且检查通过,或者未开启调试但页面的<span style="-ms-word-wrap: break-word;">pfmemalloc</span>属性不匹配,都将会<span style="-ms-word-wrap: break-word;">deactivate_slab()</span>去激活该<span style="-ms-word-wrap: break-word;">page</span>,恢复中断并返回该对象。

&nbsp;&nbsp;&nbsp; 深入分析一下<span style="-ms-word-wrap: break-word;">new_slab_objects()</span>函数实现:


1. 【file:/mm/slub.c】
2. static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
3.             int node, struct kmem_cache_cpu **pc)
4. {
5.     void *freelist;
6.     struct kmem_cache_cpu *c = *pc;
7.     struct page *page;
8.  
9.     freelist = get_partial(s, flags, node, c);
10.  
11.     if (freelist)
12.         return freelist;
13.  
14.     page = new_slab(s, flags, node);
15.     if (page) {
16.         c = __this_cpu_ptr(s->cpu_slab);
17.         if (c->page)
18.             flush_slab(s, c);
19.  
20.         /*
21.          * No other reference to the page yet so we can
22.          * muck around with it freely without cmpxchg
23.          */
24.         freelist = page->freelist;
25.         page->freelist = NULL;
26.  
27.         stat(s, ALLOC_SLAB);
28.         c->page = page;
29.         *pc = c;
30.     } else
31.         freelist = NULL;
32.  
33.     return freelist;
34. }
&nbsp;

该函数在尝试创建新的<span style="-ms-word-wrap: break-word;">slab</span>前,将先通过<span style="-ms-word-wrap: break-word;">get_partial()</span>获取存在空闲对象的<span style="-ms-word-wrap: break-word;">slab</span>并将对象返回;否则继而通过<span style="-ms-word-wrap: break-word;">new_slab()</span>创建<span style="-ms-word-wrap: break-word;">slab</span>,如果创建好<span style="-ms-word-wrap: break-word;">slab</span>后,将空闲对象链表摘下并返回。

这就是<span style="-ms-word-wrap: break-word;">slub</span>分配算法的对象分配流程。

[http://blog.chinaunix.net/uid-26859697-id-5498373.html](http://blog.chinaunix.net/uid-26859697-id-5498373.html)