Linux系统内存管理中存在着一个称之为OOM killer(Out-Of-Memory killer)的机制,该机制主要用于内存监控,监控进程的内存使用量,当系统的内存耗尽时,其将根据算法选择性地kill掉部分进程。本文分析的内存溢出保护机制,也就是OOM killer机制。

 

回到伙伴管理算法中涉及的一函数<span style="-ms-word-wrap: break-word;">__alloc_pages_nodemask()</span>,其里面调用的<span style="-ms-word-wrap: break-word;">__alloc_pages_slowpath()</span>并未展开深入,而内存溢出保护机制则在此函数中。

先行查看一下<span style="-ms-word-wrap: break-word;">__alloc_pages_slowpath()</span>的实现:


1. 【file:/ mm/page_alloc.h】
2. static inline struct page
3. __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4.     struct zonelist zonelist, enum zone_type high_zoneidx,
5.     nodemask_t nodemask, struct zone preferred_zone,
6.     int migratetype)
7. {
8.     const gfp_t wait = gfp_mask & GFP_WAIT;
9.     struct page *page = NULL;
10.     int alloc_flags;
11.     unsigned long pages_reclaimed = 0;
12.     unsigned long did_some_progress;
13.     bool sync_migration = false;
14.     bool deferred_compaction = false;
15.     bool contended_compaction = false;
16.  
17.     /
18.       In the slowpath, we sanity check order to avoid ever trying to
19.       reclaim >= MAX_ORDER areas which will never succeed. Callers may
20.       be using allocators in order of preference for an area that is
21.       too large.
22.      */
23.     if (order >= MAX_ORDER) {
24.         WARN_ON_ONCE(!(gfp_mask &
GFP_NOWARN));
25.         return NULL;
26.     }
27.  
28.     /
29.       GFP_THISNODE (meaning GFP_THISNODE, GFP_NORETRY and
30.       __GFP_NOWARN set) should not cause reclaim since the subsystem
31.       (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
32.       using a larger set of nodes after it has established that the
33.       allowed per node queues are empty and that nodes are
34.       over allocated.
35.      /
36.     if (IS_ENABLED(CONFIG_NUMA) &&
37.         (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
38.         goto nopage;
39.  
40. restart:
41.     if (!(gfp_mask & GFP_NO_KSWAPD))
42.         wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
43.  
44.     /
45.       OK, we're below the kswapd watermark and have kicked background
46.       reclaim. Now things get more complex, so set up alloc_flags according
47.       to how we want to proceed.
48.      /
49.     alloc_flags = gfp_to_alloc_flags(gfp_mask);
50.  
51.     /
52.       Find the true preferred zone if the allocation is unconstrained by
53.       cpusets.
54.      /
55.     if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
56.         first_zones_zonelist(zonelist, high_zoneidx, NULL,
57.                     &preferred_zone);
58.  
59. rebalance:
60.     / This is the last chance, in general, before the goto nopage. /
61.     page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
62.             high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
63.             preferred_zone, migratetype);
64.     if (page)
65.         goto got_pg;
66.  
67.     / Allocate without watermarks if the context allows /
68.     if (alloc_flags & ALLOC_NO_WATERMARKS) {
69.         /
70.           Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
71.           the allocation is high priority and these type of
72.           allocations are system rather than user orientated
73.          /
74.         zonelist = node_zonelist(numa_node_id(), gfp_mask);
75.  
76.         page =
alloc_pages_high_priority(gfp_mask, order,
77.                 zonelist, high_zoneidx, nodemask,
78.                 preferred_zone, migratetype);
79.         if (page) {
80.             goto got_pg;
81.         }
82.     }
83.  
84.     / Atomic allocations - we can't balance anything /
85.     if (!wait) {
86.         /
87.           All existing users of the deprecated GFP_NOFAIL are
88.           blockable, so warn of any new users that actually allow this
89.           type of allocation to fail.
90.          */
91.         WARN_ON_ONCE(gfp_mask &
GFP_NOFAIL);
92.         goto nopage;
93.     }
94.  
95.     / Avoid recursion of direct reclaim /
96.     if (current->flags & PF_MEMALLOC)
97.         goto nopage;
98.  
99.     / Avoid allocations with no watermarks from looping endlessly /
100.     if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & GFP_NOFAIL))
101.         goto nopage;
102.  
103.     /
104.       Try direct compaction. The first pass is asynchronous. Subsequent
105.       attempts after direct reclaim are synchronous
106.      /
107.     page =
alloc_pages_direct_compact(gfp_mask, order,
108.                     zonelist, high_zoneidx,
109.                     nodemask,
110.                     alloc_flags, preferred_zone,
111.                     migratetype, sync_migration,
112.                     &contended_compaction,
113.                     &deferred_compaction,
114.                     &did_some_progress);
115.     if (page)
116.         goto got_pg;
117.     sync_migration = true;
118.  
119.     /
120.       If compaction is deferred for high-order allocations, it is because
121.       sync compaction recently failed. In this is the case and the caller
122.       requested a movable allocation that does not heavily disrupt the
123.       system then fail the allocation instead of entering direct reclaim.
124.      /
125.     if ((deferred_compaction || contended_compaction) &&
126.                         (gfp_mask & GFP_NO_KSWAPD))
127.         goto nopage;
128.  
129.     / Try direct reclaim and then allocating /
130.     page =
alloc_pages_direct_reclaim(gfp_mask, order,
131.                     zonelist, high_zoneidx,
132.                     nodemask,
133.                     alloc_flags, preferred_zone,
134.                     migratetype, &did_some_progress);
135.     if (page)
136.         goto got_pg;
137.  
138.     /
139.       If we failed to make any progress reclaiming, then we are
140.       running out of options and have to consider going OOM
141.      /
142.     if (!did_some_progress) {
143.         if (oom_gfp_allowed(gfp_mask)) {
144.             if (oom_killer_disabled)
145.                 goto nopage;
146.             / Coredumps can quickly deplete all memory reserves /
147.             if ((current->flags & PF_DUMPCORE) &&
148.                 !(gfp_mask & GFP_NOFAIL))
149.                 goto nopage;
150.             page =
alloc_pages_may_oom(gfp_mask, order,
151.                     zonelist, high_zoneidx,
152.                     nodemask, preferred_zone,
153.                     migratetype);
154.             if (page)
155.                 goto got_pg;
156.  
157.             if (!(gfp_mask & GFP_NOFAIL)) {
158.                 /
159.                   The oom killer is not called for high-order
160.                   allocations that may fail, so if no progress
161.                   is being made, there are no other options and
162.                   retrying is unlikely to help.
163.                  /
164.                 if (order > PAGE_ALLOC_COSTLY_ORDER)
165.                     goto nopage;
166.                 /
167.                   The oom killer is not called for lowmem
168.                   allocations to prevent needlessly killing
169.                   innocent tasks.
170.                  /
171.                 if (high_zoneidx < ZONE_NORMAL)
172.                     goto nopage;
173.             }
174.  
175.             goto restart;
176.         }
177.     }
178.  
179.     / Check if we should retry the allocation /
180.     pages_reclaimed += did_some_progress;
181.     if (should_alloc_retry(gfp_mask, order, did_some_progress,
182.                         pages_reclaimed)) {
183.         / Wait for some write requests to complete then retry /
184.         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
185.         goto rebalance;
186.     } else {
187.         /
188.           High-order allocations do not necessarily loop after
189.           direct reclaim and reclaim/compaction depends on compaction
190.           being called after reclaim so call directly if necessary
191.          /
192.         page =
alloc_pages_direct_compact(gfp_mask, order,
193.                     zonelist, high_zoneidx,
194.                     nodemask,
195.                     alloc_flags, preferred_zone,
196.                     migratetype, sync_migration,
197.                     &contended_compaction,
198.                     &deferred_compaction,
199.                     &did_some_progress);
200.         if (page)
201.             goto got_pg;
202.     }
203.  
204. nopage:
205.     warn_alloc_failed(gfp_mask, order, NULL);
206.     return page;
207. got_pg:
208.     if (kmemcheck_enabled)
209.         kmemcheck_pagealloc_alloc(page, order, gfp_mask);
210.  
211.     return page;
212. }
&nbsp;

该函数首先判断调用者是否禁止唤醒<span style="-ms-word-wrap: break-word;">kswapd</span>线程,若不做禁止则唤醒线程进行内存回收工作,然后通过<span style="-ms-word-wrap: break-word;">gfp_to_alloc_flags()</span>对内存分配标识进行调整,而后再次调用<span style="-ms-word-wrap: break-word;">get_page_from_freelist()</span>尝试分配,如果分配到则退出。否则继续尝试内存分配,继续尝试分配则先行判断是否设置了<span style="-ms-word-wrap: break-word;">ALLOC_NO_WATERMARKS</span>标识,如果设置了,则将忽略<span style="-ms-word-wrap: break-word;">watermark</span>,调用<span style="-ms-word-wrap: break-word;">__alloc_pages_high_priority()</span>进行分配。

__alloc_pages_high_priority()函数实现:


1. 【file:/ mm/page_alloc.h】
2. /
3.   This is called in the allocator slow-path if the allocation request is of
4.   sufficient urgency to ignore watermarks and take other desperate measures
5.  /
6. static inline struct page
7. __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
8.     struct zonelist zonelist, enum zone_type high_zoneidx,
9.     nodemask_t nodemask, struct zone preferred_zone,
10.     int migratetype)
11. {
12.     struct page *page;
13.  
14.     do {
15.         page = get_page_from_freelist(gfp_mask, nodemask, order,
16.             zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
17.             preferred_zone, migratetype);
18.  
19.         if (!page && gfp_mask & GFP_NOFAIL)
20.             wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
21.     } while (!page && (gfp_mask &
GFP_NOFAIL));
22.  
23.     return page;
24. }
&nbsp;

可以看到该函数根据分配标识<span style="-ms-word-wrap: break-word;">__GFP_NOFAIL</span>不断地调用<span style="-ms-word-wrap: break-word;">get_page_from_freelist()</span>循环尝试去获得内存。

接着回到<span style="-ms-word-wrap: break-word;">__alloc_pages_slowpath()</span>中,其从<span style="-ms-word-wrap: break-word;">__alloc_pages_high_priority()</span>退出后继而判断是否设置了<span style="-ms-word-wrap: break-word;">__GFP_WAIT</span>标识,如果设置则表示内存分配允许休眠,否则直接以分配内存失败而退出。接着将会调用<span style="-ms-word-wrap: break-word;">__alloc_pages_direct_compact()</span>和<span style="-ms-word-wrap: break-word;">__alloc_pages_direct_reclaim()</span>尝试回收内存并尝试分配。基于上面的多种尝试内存分配仍然失败的情况,将会调用<span style="-ms-word-wrap: break-word;">__alloc_pages_may_oom()</span>触发<span style="-ms-word-wrap: break-word;">OOM killer</span>机制。<span style="-ms-word-wrap: break-word;">OOM killer</span>将进程<span style="-ms-word-wrap: break-word;">kill</span>后会重新再次尝试内存分配,最后则是分配失败或分配成功的收尾处理。

__alloc_pages_slowpath()暂且分析至此,回到本文重点函数<span style="-ms-word-wrap: break-word;">__alloc_pages_may_oom()</span>中进一步进行分析。


1. 【file:/ mm/page_alloc.h】
2. static inline struct page
3. __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
4.     struct zonelist zonelist, enum zone_type high_zoneidx,
5.     nodemask_t nodemask, struct zone preferred_zone,
6.     int migratetype)
7. {
8.     struct page *page;
9.  
10.     / Acquire the OOM killer lock for the zones in zonelist /
11.     if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
12.         schedule_timeout_uninterruptible(1);
13.         return NULL;
14.     }
15.  
16.     /
17.       Go through the zonelist yet one more time, keep very high watermark
18.       here, this is only to catch a parallel oom killing, we must fail if
19.       we're still under heavy pressure.
20.      /
21.     page = get_page_from_freelist(gfp_mask|GFP_HARDWALL, nodemask,
22.         order, zonelist, high_zoneidx,
23.         ALLOC_WMARK_HIGH|ALLOC_CPUSET,
24.         preferred_zone, migratetype);
25.     if (page)
26.         goto out;
27.  
28.     if (!(gfp_mask &
GFP_NOFAIL)) {
29.         / The OOM killer will not help higher order allocs /
30.         if (order > PAGE_ALLOC_COSTLY_ORDER)
31.             goto out;
32.         / The OOM killer does not needlessly kill tasks for lowmem /
33.         if (high_zoneidx < ZONE_NORMAL)
34.             goto out;
35.         /
36.           GFP_THISNODE contains GFP_NORETRY and we never hit this.
37.          * Sanity check for bare calls of
GFP_THISNODE, not real OOM.
38.           The caller should handle page allocation failure by itself if
39.           it specifies GFP_THISNODE.
40.           Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
41.          /
42.         if (gfp_mask &
GFP_THISNODE)
43.             goto out;
44.     }
45.     / Exhausted what can be done so it's blamo time /
46.     out_of_memory(zonelist, gfp_mask, order, nodemask, false);
47.  
48. out:
49.     clear_zonelist_oom(zonelist, gfp_mask);
50.     return page;
51. }
&nbsp;

该函数首先通过<span style="-ms-word-wrap: break-word;">try_set_zonelist_oom()</span>判断<span style="-ms-word-wrap: break-word;">OOM killer</span>是否已经在其他核进行<span style="-ms-word-wrap: break-word;">killing</span>操作,如果没有的情况下将会在<span style="-ms-word-wrap: break-word;">try_set_zonelist_oom()</span>内部进行锁操作,确保只有一个核执行<span style="-ms-word-wrap: break-word;">killing</span>的操作。继而调用<span style="-ms-word-wrap: break-word;">get_page_from_freelist()</span>在高<span style="-ms-word-wrap: break-word;">watermark</span>的情况下尝试再次获取内存,不过这里注定会失败。接着就是调用到了关键函数<span style="-ms-word-wrap: break-word;">out_of_memory()</span>。最后函数退出时将会调用<span style="-ms-word-wrap: break-word;">clear_zonelist_oom()</span>清除掉<span style="-ms-word-wrap: break-word;">try_set_zonelist_oom()</span>里面的锁操作。

着重分析一下<span style="-ms-word-wrap: break-word;">out_of_memory()</span>:


1. 【file:/ mm/oom_kill.c】
2. /
3.   out_of_memory - kill the "best" process when we run out of memory
4.   @zonelist: zonelist pointer
5.   @gfp_mask: memory allocation flags
6.   @order: amount of memory being requested as a power of 2
7.   @nodemask: nodemask passed to page allocator
8.   @force_kill: true if a task must be killed, even if others are exiting
9.  
10.   If we run out of memory, we have the choice between either
11.   killing a random task (bad), letting the system crash (worse)
12.   OR try to be smart about which process to kill. Note that we
13.   don't have to be perfect here, we just have to be good.
14.  /
15. void out_of_memory(struct zonelist zonelist, gfp_t gfp_mask,
16.         int order, nodemask_t nodemask, bool force_kill)
17. {
18.     const nodemask_t mpol_mask;
19.     struct task_struct p;
20.     unsigned long totalpages;
21.     unsigned long freed = 0;
22.     unsigned int uninitialized_var(points);
23.     enum oom_constraint constraint = CONSTRAINT_NONE;
24.     int killed = 0;
25.  
26.     blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
27.     if (freed > 0)
28.         / Got some memory back in the last second. /
29.         return;
30.  
31.     /
32.       If current has a pending SIGKILL or is exiting, then automatically
33.       select it. The goal is to allow it to allocate so that it may
34.       quickly exit and free its memory.
35.      /
36.     if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
37.         set_thread_flag(TIF_MEMDIE);
38.         return;
39.     }
40.  
41.     /
42.       Check if there were limitations on the allocation (only relevant for
43.       NUMA) that may require different handling.
44.      /
45.     constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
46.                         &totalpages);
47.     mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
48.     check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
49.  
50.     if (sysctl_oom_kill_allocating_task && current->mm &&
51.         !oom_unkillable_task(current, NULL, nodemask) &&
52.         current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
53.         get_task_struct(current);
54.         oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
55.                  nodemask,
56.                  "Out of memory (oom_kill_allocating_task)");
57.         goto out;
58.     }
59.  
60.     p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
61.     / Found nothing?!?! Either we hang forever, or we panic. /
62.     if (!p) {
63.         dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
64.         panic("Out of memory and no killable processes…\n");
65.     }
66.     if (p != (void )-1UL) {
67.         oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
68.                  nodemask, "Out of memory");
69.         killed = 1;
70.     }
71. out:
72.     /
73.       Give the killed threads a good chance of exiting before trying to
74.       allocate memory again.
75.      /
76.     if (killed)
77.         schedule_timeout_killable(1);
78. }
&nbsp;

该函数首先调用<span style="-ms-word-wrap: break-word;">blocking_notifier_call_chain()</span>进行<span style="-ms-word-wrap: break-word;">OOM</span>的内核通知链回调处理;接着的<span style="-ms-word-wrap: break-word;">if (fatal_signal_pending(current) || current-&gt;flags &amp; PF_EXITING)</span>判断则是用于检查当前进程是否有<span style="-ms-word-wrap: break-word;">SIGKILL</span>信号挂起或者正处于退出过程中,如果是则设置<span style="-ms-word-wrap: break-word;">TIF_MEMDIE</span>标志后直接退出,让其尽快退出以释放内存;再接着通过<span style="-ms-word-wrap: break-word;">constrained_alloc()</span>检查内存分配限制以及<span style="-ms-word-wrap: break-word;">check_panic_on_oom()</span>检查是否触发<span style="-ms-word-wrap: break-word;">linux</span>内核<span style="-ms-word-wrap: break-word;">panic</span>;继而判断<span style="-ms-word-wrap: break-word;">sysctl_oom_kill_allocating_task</span>变量及进程检查,如果符合条件判断,则将当前正在申请内存分配的进程<span style="-ms-word-wrap: break-word;">kill</span>掉;否则最后,将通过<span style="-ms-word-wrap: break-word;">select_bad_process()</span>选出最佳的进程,进而调用<span style="-ms-word-wrap: break-word;">oom_kill_process()</span>对其进行<span style="-ms-word-wrap: break-word;">kill</span>操作。

最后分析一下<span style="-ms-word-wrap: break-word;">select_bad_process()</span>和<span style="-ms-word-wrap: break-word;">oom_kill_process()</span>,其中<span style="-ms-word-wrap: break-word;">select_bad_process()</span>的实现:


1. 【file:/ mm/oom_kill.c】
2. /
3.   Simple selection loop. We chose the process with the highest
4.   number of 'points'. Returns -1 on scan abort.
5.  
6.   (not docbooked, we don't want this one cluttering up the manual)
7.  /
8. static struct task_struct select_bad_process(unsigned int ppoints,
9.         unsigned long totalpages, const nodemask_t nodemask,
10.         bool force_kill)
11. {
12.     struct task_struct g, p;
13.     struct task_struct chosen = NULL;
14.     unsigned long chosen_points = 0;
15.  
16.     rcu_read_lock();
17.     for_each_process_thread(g, p) {
18.         unsigned int points;
19.  
20.         switch (oom_scan_process_thread(p, totalpages, nodemask,
21.                         force_kill)) {
22.         case OOM_SCAN_SELECT:
23.             chosen = p;
24.             chosen_points = ULONG_MAX;
25.             / fall through /
26.         case OOM_SCAN_CONTINUE:
27.             continue;
28.         case OOM_SCAN_ABORT:
29.             rcu_read_unlock();
30.             return (struct task_struct )(-1UL);
31.         case OOM_SCAN_OK:
32.             break;
33.         };
34.         points = oom_badness(p, NULL, nodemask, totalpages);
35.         if (!points || points < chosen_points)
36.             continue;
37.         / Prefer thread group leaders for display purposes /
38.         if (points == chosen_points && thread_group_leader(chosen))
39.             continue;
40.  
41.         chosen = p;
42.         chosen_points = points;
43.     }
44.     if (chosen)
45.         get_task_struct(chosen);
46.     rcu_read_unlock();
47.  
48.     *ppoints = chosen_points * 1000 / totalpages;
49.     return chosen;
50. }
&nbsp;

&nbsp;&nbsp;&nbsp; 此函数通过<span style="-ms-word-wrap: break-word;">for_each_process_thread()</span>宏遍历所有进程,进而借用<span style="-ms-word-wrap: break-word;">oom_scan_process_thread()</span>获得进程扫描类型然后通过<span style="-ms-word-wrap: break-word;">switch-case</span>作特殊化处理,例如存在某进程退出中则中断扫描、某进程占用内存过多且被标识为优先<span style="-ms-word-wrap: break-word;">kill</span>掉则优选等特殊处理。而正常情况则会通过<span style="-ms-word-wrap: break-word;">oom_badness()</span>计算出进程的分值,然后根据最高分值将进程控制块返回回去。

&nbsp;&nbsp;&nbsp; 顺便研究一下<span style="-ms-word-wrap: break-word;">oom_badness()</span>的实现<span style="-ms-word-wrap: break-word;">:</span>


1. 【file:/ mm/oom_kill.c】
2. /
3.   oom_badness - heuristic function to determine which candidate task to kill
4.   @p: task struct of which task we should calculate
5.   @totalpages: total present RAM allowed for page allocation
6.  
7.   The heuristic for determining which task to kill is made to be as simple and
8.   predictable as possible. The goal is to return the highest value for the
9.   task consuming the most memory to avoid subsequent oom failures.
10.  /
11. unsigned long oom_badness(struct task_struct p, struct mem_cgroup memcg,
12.               const nodemask_t nodemask, unsigned long totalpages)
13. {
14.     long points;
15.     long adj;
16.  
17.     if (oom_unkillable_task(p, memcg, nodemask))
18.         return 0;
19.  
20.     p = find_lock_task_mm(p);
21.     if (!p)
22.         return 0;
23.  
24.     adj = (long)p->signal->oom_score_adj;
25.     if (adj == OOM_SCORE_ADJ_MIN) {
26.         task_unlock(p);
27.         return 0;
28.     }
29.  
30.     /
31.       The baseline for the badness score is the proportion of RAM that each
32.       task's rss, pagetable and swap space use.
33.      /
34.     points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
35.          get_mm_counter(p->mm, MM_SWAPENTS);
36.     task_unlock(p);
37.  
38.     /
39.       Root processes get 3% bonus, just like the __vm_enough_memory()
40.       implementation used by LSMs.
41.      /
42.     if (has_capability_noaudit(p, CAP_SYS_ADMIN))
43.         points -= (points * 3) / 100;
44.  
45.     / Normalize to oom_score_adj units /
46.     adj = totalpages / 1000;
47.     points += adj;
48.  
49.     /
50.       Never return 0 for an eligible task regardless of the root bonus and
51.       oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
52.      */
53.     return points > 0 ? points : 1;
54. }
&nbsp;

&nbsp;&nbsp;&nbsp; 计算进程分值的函数中,首先排除了不可<span style="-ms-word-wrap: break-word;">OOM kill</span>的进程以及<span style="-ms-word-wrap: break-word;">oom_score_adj</span>值为<span style="-ms-word-wrap: break-word;">OOM_SCORE_ADJ_MIN</span>(即<span style="-ms-word-wrap: break-word;">-1000</span>)的进程,其中<span style="-ms-word-wrap: break-word;">oom_score_adj</span>取值范围是<span style="-ms-word-wrap: break-word;">-1000</span>到<span style="-ms-word-wrap: break-word;">1000</span>;接着就是计算进程的<span style="-ms-word-wrap: break-word;">RSS</span>、页表以及<span style="-ms-word-wrap: break-word;">SWAP</span>空间的使用量占<span style="-ms-word-wrap: break-word;">RAM</span>的比重,如果该进程是超级用户进程(具有<span style="-ms-word-wrap: break-word;">CAP_SYS_ADMIN</span>权限),则扣减<span style="-ms-word-wrap: break-word;">3%</span>的分值作为优待;最后将<span style="-ms-word-wrap: break-word;">oom_score_adj</span>和<span style="-ms-word-wrap: break-word;">points</span>归一后,但凡小于等于<span style="-ms-word-wrap: break-word;">0</span>的都返回<span style="-ms-word-wrap: break-word;">1</span>,其他的则返回原值。由此可知,分值越低的则越不会被<span style="-ms-word-wrap: break-word;">kill</span>,而且该值可以通过修改<span style="-ms-word-wrap: break-word;">oom_score_adj</span>进行调整。

&nbsp;&nbsp;&nbsp; 最后分析一下找到了最&ldquo;<span style="-ms-word-wrap: break-word;">bad</span>&rdquo;的进程后,其享受的&ldquo;待遇&rdquo;<span style="-ms-word-wrap: break-word;">oom_kill_process()</span>:


1. 【file:/ mm/oom_kill.c】
2. /
3.   Must be called while holding a reference to p, which will be released upon
4.   returning.
5.  /
6. void oom_kill_process(struct task_struct p, gfp_t gfp_mask, int order,
7.               unsigned int points, unsigned long totalpages,
8.               struct mem_cgroup memcg, nodemask_t nodemask,
9.               const char message)
10. {
11.     struct task_struct victim = p;
12.     struct task_struct child;
13.     struct task_struct t;
14.     struct mm_struct mm;
15.     unsigned int victim_points = 0;
16.     static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
17.                           DEFAULT_RATELIMIT_BURST);
18.  
19.     /
20.       If the task is already exiting, don't alarm the sysadmin or kill
21.       its children or threads, just set TIF_MEMDIE so it can die quickly
22.      /
23.     if (p->flags & PF_EXITING) {
24.         set_tsk_thread_flag(p, TIF_MEMDIE);
25.         put_task_struct(p);
26.         return;
27.     }
28.  
29.     if (__ratelimit(&oom_rs))
30.         dump_header(p, gfp_mask, order, memcg, nodemask);
31.  
32.     task_lock(p);
33.     pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
34.         message, task_pid_nr(p), p->comm, points);
35.     task_unlock(p);
36.  
37.     /
38.       If any of p's children has a different mm and is eligible for kill,
39.       the one with the highest oom_badness() score is sacrificed for its
40.       parent. This attempts to lose the minimal amount of work done while
41.       still freeing memory.
42.      /
43.     read_lock(&tasklist_lock);
44.     for_each_thread(p, t) {
45.         list_for_each_entry(child, &t->children, sibling) {
46.             unsigned int child_points;
47.  
48.             if (child->mm == p->mm)
49.                 continue;
50.             /
51.               oom_badness() returns 0 if the thread is unkillable
52.              /
53.             child_points = oom_badness(child, memcg, nodemask,
54.                                 totalpages);
55.             if (child_points > victim_points) {
56.                 put_task_struct(victim);
57.                 victim = child;
58.                 victim_points = child_points;
59.                 get_task_struct(victim);
60.             }
61.         }
62.     }
63.     read_unlock(&tasklist_lock);
64.  
65.     p = find_lock_task_mm(victim);
66.     if (!p) {
67.         put_task_struct(victim);
68.         return;
69.     } else if (victim != p) {
70.         get_task_struct(p);
71.         put_task_struct(victim);
72.         victim = p;
73.     }
74.  
75.     / mm cannot safely be dereferenced after task_unlock(victim) /
76.     mm = victim->mm;
77.     pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
78.         task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
79.         K(get_mm_counter(victim->mm, MM_ANONPAGES)),
80.         K(get_mm_counter(victim->mm, MM_FILEPAGES)));
81.     task_unlock(victim);
82.  
83.     /
84.       Kill all user processes sharing victim->mm in other thread groups, if
85.       any. They don't get access to memory reserves, though, to avoid
86.       depletion of all memory. This prevents mm->mmap_sem livelock when an
87.       oom killed thread cannot exit because it requires the semaphore and
88.       its contended by another thread trying to allocate memory itself.
89.       That thread will now get access to memory reserves since it has a
90.       pending fatal signal.
91.      /
92.     rcu_read_lock();
93.     for_each_process(p)
94.         if (p->mm == mm && !same_thread_group(p, victim) &&
95.             !(p->flags & PF_KTHREAD)) {
96.             if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
97.                 continue;
98.  
99.             task_lock(p); / Protect ->comm from prctl() /
100.             pr_err("Kill process %d (%s) sharing same memory\n",
101.                 task_pid_nr(p), p->comm);
102.             task_unlock(p);
103.             do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
104.         }
105.     rcu_read_unlock();
106.  
107.     set_tsk_thread_flag(victim, TIF_MEMDIE);
108.     do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
109.     put_task_struct(victim);
110. }
&nbsp;

&nbsp;&nbsp;&nbsp; 该函数将会判断当前被<span style="-ms-word-wrap: break-word;">kill</span>的进程情况,如果该进程处于退出状态,则设置<span style="-ms-word-wrap: break-word;">TIF_MEMDIE</span>标志,不做<span style="-ms-word-wrap: break-word;">kill</span>操作;接着会通过<span style="-ms-word-wrap: break-word;">list_for_each_entry()</span>遍历该进程的子进程信息,如果某个子进程拥有不同的<span style="-ms-word-wrap: break-word;">mm</span>且合适被<span style="-ms-word-wrap: break-word;">kill</span>掉,将会优先考虑将该子进程替代父进程<span style="-ms-word-wrap: break-word;">kill</span>掉,这样可以避免<span style="-ms-word-wrap: break-word;">kill</span>掉父进程带来的接管子进程的工作开销;再往下通过<span style="-ms-word-wrap: break-word;">find_lock_task_mm()</span>找到持有<span style="-ms-word-wrap: break-word;">mm</span>锁的进程,如果进程处于退出状态,则<span style="-ms-word-wrap: break-word;">return</span>,否则继续处理,若此时的进程与传入的不是同一个时则更新<span style="-ms-word-wrap: break-word;">victim</span>;继而接着通过<span style="-ms-word-wrap: break-word;">for_each_process()</span>查找与当前被<span style="-ms-word-wrap: break-word;">kill</span>进程使用到了同样的共享内存的进程进行一起<span style="-ms-word-wrap: break-word;">kill</span>掉,<span style="-ms-word-wrap: break-word;">kill</span>之前将对应的进程添加标识<span style="-ms-word-wrap: break-word;">TIF_MEMDIE</span>,而<span style="-ms-word-wrap: break-word;">kill</span>的动作则是通过发送<span style="-ms-word-wrap: break-word;">SIGKILL</span>信号给对应进程,由被<span style="-ms-word-wrap: break-word;">kill</span>进程从内核态返回用户态时进行处理。

&nbsp;&nbsp;&nbsp; 至此,<span style="-ms-word-wrap: break-word;">OOM kill</span>处理分析完毕。

[http://blog.chinaunix.net/uid-26859697-id-5107510.html](http://blog.chinaunix.net/uid-26859697-id-5107510.html)