The previous installment analyzed how the variables used for building the kernel page tables are prepared; now we turn to the main topic: how the kernel page tables themselves are built.

The key function for building the kernel page tables is init_mem_mapping():


【file:/arch/x86/mm/init.c】

```c
void __init init_mem_mapping(void)
{
    unsigned long end;

    probe_page_size_mask();

#ifdef CONFIG_X86_64
    end = max_pfn << PAGE_SHIFT;
#else
    end = max_low_pfn << PAGE_SHIFT;
#endif

    /* the ISA range is always mapped regardless of memory holes */
    init_memory_mapping(0, ISA_END_ADDRESS);

    /*
     * If the allocation is in bottom-up direction, we setup direct mapping
     * in bottom-up, otherwise we setup direct mapping in top-down.
     */
    if (memblock_bottom_up()) {
        unsigned long kernel_end = __pa_symbol(_end);

        /*
         * we need two separate calls here. This is because we want to
         * allocate page tables above the kernel. So we first map
         * [kernel_end, end) to make memory above the kernel be mapped
         * as soon as possible. And then use page tables allocated above
         * the kernel to map [ISA_END_ADDRESS, kernel_end).
         */
        memory_map_bottom_up(kernel_end, end);
        memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
    } else {
        memory_map_top_down(ISA_END_ADDRESS, end);
    }

#ifdef CONFIG_X86_64
    if (max_pfn > max_low_pfn) {
        /* can we preseve max_low_pfn ?*/
        max_low_pfn = max_pfn;
    }
#else
    early_ioremap_page_table_range_init();
#endif

    load_cr3(swapper_pg_dir);
    __flush_tlb_all();

    early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
```
The implementation of probe_page_size_mask():


【file:/arch/x86/mm/init.c】

```c
static void __init probe_page_size_mask(void)
{
    init_gbpages();

#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
    /*
     * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
     * This will simplify cpa(), which otherwise needs to support splitting
     * large pages into small in interrupt context, etc.
     */
    if (direct_gbpages)
        page_size_mask |= 1 << PG_LEVEL_1G;
    if (cpu_has_pse)
        page_size_mask |= 1 << PG_LEVEL_2M;
#endif

    /* Enable PSE if available */
    if (cpu_has_pse)
        set_in_cr4(X86_CR4_PSE);

    /* Enable PGE if available */
    if (cpu_has_pge) {
        set_in_cr4(X86_CR4_PGE);
        __supported_pte_mask |= _PAGE_GLOBAL;
    }
}
```

probe_page_size_mask() does three things: it initializes the direct-mapping state (direct_gbpages, inside init_gbpages()), sets the bits of page_size_mask, and enables the corresponding CR4 bits according to CPU capability and configuration. page_size_mask is consulted later to decide which page sizes may be used when the mappings are built.
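As a side note, the capability bits behind cpu_has_pse and cpu_has_pge originate from CPUID leaf 1 (EDX bit 3 for PSE, bit 13 for PGE). Here is a minimal userspace sketch, not kernel code, that reads the same bits via GCC's <cpuid.h>:

```c
/* Userspace sketch: query the PSE/PGE feature bits that the kernel's
 * cpu_has_pse / cpu_has_pge checks ultimately derive from. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;

    /* CPUID.01H:EDX bit 3 = PSE (large pages), bit 13 = PGE (global pages) */
    printf("PSE (large pages):  %s\n", (edx & (1u << 3))  ? "yes" : "no");
    printf("PGE (global pages): %s\n", (edx & (1u << 13)) ? "yes" : "no");
    return 0;
}
```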

Back in init_mem_mapping(), the next step is init_memory_mapping(0, ISA_END_ADDRESS), where ISA_END_ADDRESS (0x100000, i.e. 1MB) marks the end of the address range used by devices on the ISA bus; as the code comment notes, this range is always mapped regardless of memory holes.

The implementation of init_memory_mapping():


【file:/arch/x86/mm/init.c】

```c
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
                           unsigned long end)
{
    struct map_range mr[NR_RANGE_MR];
    unsigned long ret = 0;
    int nr_range, i;

    pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
           start, end - 1);

    memset(mr, 0, sizeof(mr));
    nr_range = split_mem_range(mr, 0, start, end);

    for (i = 0; i < nr_range; i++)
        ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                           mr[i].page_size_mask);

    add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);

    return ret >> PAGE_SHIFT;
}
```

There are three key operations inside init_memory_mapping(): split_mem_range(), kernel_physical_mapping_init() and add_pfn_range_mapped().

First, split_mem_range():


【file:/arch/x86/mm/init.c】

```c
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
                     unsigned long start,
                     unsigned long end)
{
    unsigned long start_pfn, end_pfn, limit_pfn;
    unsigned long pfn;
    int i;

    limit_pfn = PFN_DOWN(end);

    /* head if not big page alignment ? */
    pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
    /*
     * Don't use a large page for the first 2/4MB of memory
     * because there are often fixed size MTRRs in there
     * and overlapping MTRRs into large pages can cause
     * slowdowns.
     */
    if (pfn == 0)
        end_pfn = PFN_DOWN(PMD_SIZE);
    else
        end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
    end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
    if (end_pfn > limit_pfn)
        end_pfn = limit_pfn;
    if (start_pfn < end_pfn) {
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
        pfn = end_pfn;
    }

    /* big page (2M) range */
    start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
    end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
    end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
    if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
        end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

    if (start_pfn < end_pfn) {
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                page_size_mask & (1<<PG_LEVEL_2M));
        pfn = end_pfn;
    }

#ifdef CONFIG_X86_64
    /* big page (1G) range */
    start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
    end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
    if (start_pfn < end_pfn) {
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                page_size_mask &
                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
        pfn = end_pfn;
    }

    /* tail is not big page (1G) alignment */
    start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
    end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
    if (start_pfn < end_pfn) {
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                page_size_mask & (1<<PG_LEVEL_2M));
        pfn = end_pfn;
    }
#endif

    /* tail is not big page (2M) alignment */
    start_pfn = pfn;
    end_pfn = limit_pfn;
    nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

    if (!after_bootmem)
        adjust_range_page_size_mask(mr, nr_range);

    /* try to merge same page size and continuous */
    for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
        unsigned long old_start;
        if (mr[i].end != mr[i+1].start ||
            mr[i].page_size_mask != mr[i+1].page_size_mask)
            continue;
        /* move it */
        old_start = mr[i].start;
        memmove(&mr[i], &mr[i+1],
            (nr_range - 1 - i) * sizeof(struct map_range));
        mr[i--].start = old_start;
        nr_range--;
    }

    for (i = 0; i < nr_range; i++)
        printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
                mr[i].start, mr[i].end - 1,
            (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
             (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

    return nr_range;
}
```

split_mem_range() aligns the incoming [start, end) range with round_up() and round_down() and, depending on how the boundaries fall, splits it into pieces: an unaligned head, an aligned middle and an unaligned tail (on 64-bit the middle is further subdivided around 1G boundaries). Each piece is stored via save_mr() into the mr array that init_memory_mapping() passed in. The point of the split is to let each piece be mapped with a different page size; afterwards, adjacent pieces that are contiguous and share the same page-size mask are merged back into one. Finally the resulting layout is printed at KERN_DEBUG level, so it can be inspected with the dmesg command.

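To make the head/middle/tail split concrete, here is a minimal userspace sketch of the same rounding logic (my own simplification, not the kernel code; 4K base pages and a 2MB large page are assumed, matching PMD_SIZE under PAE, whereas non-PAE x86-32 uses 4MB). It prints the pieces in the same format as the kernel's debug output above:

```c
/* Sketch of split_mem_range()'s rounding: an unaligned 4K head, a
 * large-page-aligned middle, and an unaligned 4K tail. */
#include <stdio.h>

#define PMD_SIZE 0x200000UL /* assumed large-page size (2MB, PAE layout) */

#define ROUND_UP(x, a)   (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
    unsigned long start = 0x00123000UL, end = 0x07ff5000UL; /* sample range */
    unsigned long mid_start = ROUND_UP(start, PMD_SIZE);
    unsigned long mid_end   = ROUND_DOWN(end, PMD_SIZE);

    if (mid_start >= mid_end) {
        /* too small for a large page: map the whole range with 4K pages */
        printf(" [mem %#010lx-%#010lx] page 4k\n", start, end - 1);
        return 0;
    }
    if (start < mid_start) /* head not large-page aligned */
        printf(" [mem %#010lx-%#010lx] page 4k\n", start, mid_start - 1);
    printf(" [mem %#010lx-%#010lx] page 2M\n", mid_start, mid_end - 1);
    if (mid_end < end)     /* tail not large-page aligned */
        printf(" [mem %#010lx-%#010lx] page 4k\n", mid_end, end - 1);
    return 0;
}
```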

Next, kernel_physical_mapping_init():


【file:/arch/x86/mm/init_32.c】

```c
/*
 * This maps the physical memory to kernel virtual address space, a total
 * of max_low_pfn pages, by creating page tables starting from address
 * PAGE_OFFSET:
 */
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
                 unsigned long end,
                 unsigned long page_size_mask)
{
    int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
    unsigned long last_map_addr = end;
    unsigned long start_pfn, end_pfn;
    pgd_t *pgd_base = swapper_pg_dir;
    int pgd_idx, pmd_idx, pte_ofs;
    unsigned long pfn;
    pgd_t *pgd;
    pmd_t *pmd;
    pte_t *pte;
    unsigned pages_2m, pages_4k;
    int mapping_iter;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    /*
     * First iteration will setup identity mapping using large/small pages
     * based on use_pse, with other attributes same as set by
     * the early code in head_32.S.
     *
     * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
     * as desired for the kernel identity mapping.
     *
     * This two pass mechanism conforms to the TLB app note which says:
     *
     *     "Software should not write to a paging-structure entry in a way
     *      that would change, for any linear address, both the page size
     *      and either the page frame or attributes."
     */
    mapping_iter = 1;

    if (!cpu_has_pse)
        use_pse = 0;

repeat:
    pages_2m = pages_4k = 0;
    pfn = start_pfn;
    pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
    pgd = pgd_base + pgd_idx;
    for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
        pmd = one_md_table_init(pgd);

        if (pfn >= end_pfn)
            continue;
#ifdef CONFIG_X86_PAE
        pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
        pmd += pmd_idx;
#else
        pmd_idx = 0;
#endif
        for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
             pmd++, pmd_idx++) {
            unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;

            /*
             * Map with big pages if possible, otherwise
             * create normal page tables:
             */
            if (use_pse) {
                unsigned int addr2;
                pgprot_t prot = PAGE_KERNEL_LARGE;
                /*
                 * first pass will use the same initial
                 * identity mapping attribute + _PAGE_PSE.
                 */
                pgprot_t init_prot =
                    __pgprot(PTE_IDENT_ATTR |
                         _PAGE_PSE);

                pfn &= PMD_MASK >> PAGE_SHIFT;
                addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                    PAGE_OFFSET + PAGE_SIZE-1;

                if (is_kernel_text(addr) ||
                    is_kernel_text(addr2))
                    prot = PAGE_KERNEL_LARGE_EXEC;

                pages_2m++;
                if (mapping_iter == 1)
                    set_pmd(pmd, pfn_pmd(pfn, init_prot));
                else
                    set_pmd(pmd, pfn_pmd(pfn, prot));

                pfn += PTRS_PER_PTE;
                continue;
            }
            pte = one_page_table_init(pmd);

            pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
            pte += pte_ofs;
            for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
                 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
                pgprot_t prot = PAGE_KERNEL;
                /*
                 * first pass will use the same initial
                 * identity mapping attribute.
                 */
                pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);

                if (is_kernel_text(addr))
                    prot = PAGE_KERNEL_EXEC;

                pages_4k++;
                if (mapping_iter == 1) {
                    set_pte(pte, pfn_pte(pfn, init_prot));
                    last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
                } else
                    set_pte(pte, pfn_pte(pfn, prot));
            }
        }
    }
    if (mapping_iter == 1) {
        /*
         * update direct mapping page count only in the first
         * iteration.
         */
        update_page_count(PG_LEVEL_2M, pages_2m);
        update_page_count(PG_LEVEL_4K, pages_4k);

        /*
         * local global flush tlb, which will flush the previous
         * mappings present in both small and large page TLB's.
         */
        __flush_tlb_all();

        /*
         * Second iteration will set the actual desired PTE attributes.
         */
        mapping_iter = 2;
        goto repeat;
    }
    return last_map_addr;
}
```

kernel_physical_mapping_init() is the key function in building the kernel page tables: it is what actually maps physical memory. swapper_pg_dir (defined in /arch/x86/kernel/head_32.S) provides the storage for the page global directory, while the page-table pages are obtained through one_page_table_init(), via the call chain one_page_table_init() -&gt; alloc_low_page() -&gt; alloc_low_pages() -&gt; memblock_reserve(); the attributes of the page-global-directory entries are also set up along the way (the detailed code is not analyzed here). Back in kernel_physical_mapping_init(), note the repeat label: driven by mapping_iter together with the goto, the code under that label executes twice. On the first pass the mappings are created with the same attributes as in head_32.S, namely PTE_IDENT_ATTR; on the second pass the final attributes are applied as the kernel requires: PAGE_KERNEL by default, or PAGE_KERNEL_EXEC where is_kernel_text() identifies the address as kernel text. When this finishes, the kernel page tables are built and the memory mapping is in place.
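The pgd_index()/pte_index() arithmetic driving the loops above is plain bit-slicing of the virtual address. A minimal sketch for the two-level non-PAE i386 layout (10-bit PGD index, 10-bit PTE index, 12-bit page offset) assumed in this walkthrough:

```c
/* Sketch of the non-PAE i386 virtual address split used by
 * pgd_index()/pte_index(): 10 + 10 + 12 bits. */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PGDIR_SHIFT  22   /* each PGD entry covers 4MB without PAE */
#define PTRS_PER_PTE 1024

int main(void)
{
    unsigned long vaddr = 0xc0123456UL; /* PAGE_OFFSET (0xc0000000) + offset */
    unsigned long pgd_idx = vaddr >> PGDIR_SHIFT;
    unsigned long pte_idx = (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    unsigned long offset  = vaddr & ((1UL << PAGE_SHIFT) - 1);

    /* expected: pgd 768, pte 291, offset 0x456 */
    printf("vaddr %#010lx -> pgd %lu, pte %lu, offset %#lx\n",
           vaddr, pgd_idx, pte_idx, offset);
    return 0;
}
```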

Finally, the last key function called by init_memory_mapping() is add_pfn_range_mapped():


【file:/arch/x86/mm/init.c】

```c
struct range pfn_mapped[E820_X_MAX];
int nr_pfn_mapped;

static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
    nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
                         nr_pfn_mapped, start_pfn, end_pfn);
    nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);

    max_pfn_mapped = max(max_pfn_mapped, end_pfn);

    if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
        max_low_pfn_mapped = max(max_low_pfn_mapped,
                     min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}
```

This function records the newly mapped physical page-frame range in the global array pfn_mapped, with nr_pfn_mapped tracking the number of valid entries. This later allows pfn_range_is_mapped() to check whether a given physical range has already been mapped, avoiding duplicate mappings.
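For illustration, here is a condensed userspace sketch of the kind of merge that add_range_with_merge() performs (my own simplification, not the kernel implementation): overlapping pfn ranges collapse into a single entry, and the merged range is re-scanned against the rest of the table:

```c
/* Simplified sketch of merging a new pfn range into a small table,
 * in the spirit of add_range_with_merge(). Ranges are inclusive here. */
#include <stdio.h>

struct range { unsigned long start, end; };

static int add_range_merge(struct range *r, int nr,
                           unsigned long start, unsigned long end)
{
    int i;

    for (i = 0; i < nr; i++) {
        if (start > r[i].end || end < r[i].start)
            continue;                /* disjoint from this entry */
        if (r[i].start < start)
            start = r[i].start;      /* grow the incoming range */
        if (r[i].end > end)
            end = r[i].end;
        r[i] = r[--nr];              /* drop the absorbed entry */
        i = -1;                      /* rescan with the grown range */
    }
    r[nr].start = start;
    r[nr].end = end;
    return nr + 1;
}

int main(void)
{
    struct range r[8];
    int nr = 0, i;

    nr = add_range_merge(r, nr, 0x000, 0x0ff);
    nr = add_range_merge(r, nr, 0x200, 0x2ff);
    nr = add_range_merge(r, nr, 0x080, 0x27f); /* bridges both entries */

    for (i = 0; i < nr; i++)        /* expected: one range [0x0-0x2ff] */
        printf("pfn range %d: [%#lx-%#lx]\n", i, r[i].start, r[i].end);
    return 0;
}
```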

Back in init_mem_mapping(): at this point memblock_bottom_up() still returns false (memblock.bottom_up has not been switched on), so the else branch runs and memory_map_top_down() is called with ISA_END_ADDRESS and end, where end was set to max_low_pfn &lt;&lt; PAGE_SHIFT, the address just past the last page frame of the kernel direct-mapping region (for example, with 896MB of lowmem, max_low_pfn would be 0x38000 and end would be 0x38000000).

The implementation of memory_map_top_down():


【file:/arch/x86/mm/init.c】

```c
/**
 * memory_map_top_down - Map [map_start, map_end) top down
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in top-down. That said, the page tables
 * will be allocated at the end of the memory, and we map the
 * memory in top-down.
 */
static void __init memory_map_top_down(unsigned long map_start,
                       unsigned long map_end)
{
    unsigned long real_end, start, last_start;
    unsigned long step_size;
    unsigned long addr;
    unsigned long mapped_ram_size = 0;
    unsigned long new_mapped_ram_size;

    /* xen has big range in reserved near end of ram, skip it at first.*/
    addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
    real_end = addr + PMD_SIZE;

    /* step_size need to be small so pgt_buf from BRK could cover it */
    step_size = PMD_SIZE;
    max_pfn_mapped = 0; /* will get exact value next */
    min_pfn_mapped = real_end >> PAGE_SHIFT;
    last_start = start = real_end;

    /*
     * We start from the top (end of memory) and go to the bottom.
     * The memblock_find_in_range() gets us a block of RAM from the
     * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
     * for page table.
     */
    while (last_start > map_start) {
        if (last_start > step_size) {
            start = round_down(last_start - 1, step_size);
            if (start < map_start)
                start = map_start;
        } else
            start = map_start;
        new_mapped_ram_size = init_range_memory_mapping(start,
                            last_start);
        last_start = start;
        min_pfn_mapped = last_start >> PAGE_SHIFT;
        /* only increase step_size after big range get mapped */
        if (new_mapped_ram_size > mapped_ram_size)
            step_size = get_new_step_size(step_size);
        mapped_ram_size += new_mapped_ram_size;
    }

    if (real_end < map_end)
        init_range_memory_mapping(real_end, map_end);
}
```
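The while loop above is easier to follow with concrete numbers. Below is a hedged userspace simulation of the top-down stepping (the growth factor of get_new_step_size() has varied across kernel versions; a fixed x32 growth per step is assumed here, and the "only grow after a big range got mapped" refinement is omitted):

```c
/* Simulation of memory_map_top_down(): consume [map_start, real_end)
 * from the top in ever larger steps. Assumptions: 2MB initial step,
 * x32 growth per iteration. */
#include <stdio.h>

#define PMD_SIZE 0x200000UL

static unsigned long round_down(unsigned long x, unsigned long a)
{
    return x & ~(a - 1); /* a must be a power of two */
}

int main(void)
{
    unsigned long map_start  = 0x100000UL;   /* ISA_END_ADDRESS */
    unsigned long last_start = 0x38000000UL; /* pretend top of lowmem: 896MB */
    unsigned long step_size  = PMD_SIZE;

    while (last_start > map_start) {
        unsigned long start;

        if (last_start > step_size) {
            start = round_down(last_start - 1, step_size);
            if (start < map_start)
                start = map_start;
        } else
            start = map_start;

        printf("map [%#010lx-%#010lx)\n", start, last_start);
        last_start = start;
        step_size <<= 5; /* assumed growth factor */
    }
    return 0;
}
```

The first small chunk mapped at the top supplies fresh page-table pages for the progressively larger chunks below it.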

memory_map_top_down() first uses memblock_find_in_range() to find a free PMD_SIZE block (4MB on non-PAE x86-32) near the top of the range, confirming there is enough room for the page tables to come, and then builds the mappings from the top downward step by step, as simulated above. The per-step work is done by its key helper, init_range_memory_mapping(), implemented as follows:


【file:/arch/x86/mm/init.c】

```c
/*
 * We need to iterate through the E820 memory map and create direct mappings
 * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
 * create direct mappings for all pfns from [0 to max_low_pfn) and
 * [4GB to max_pfn) because of possible memory holes in high addresses
 * that cannot be marked as UC by fixed/variable range MTRRs.
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with big range.
 * That range would have hole in the middle or ends, and only ram parts
 * will be mapped in init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
                       unsigned long r_start,
                       unsigned long r_end)
{
    unsigned long start_pfn, end_pfn;
    unsigned long mapped_ram_size = 0;
    int i;

    for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
        u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
        u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
        if (start >= end)
            continue;

        /*
         * if it is overlapping with brk pgt, we need to
         * alloc pgt buf from memblock instead.
         */
        can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
                    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
        init_memory_mapping(start, end);
        mapped_ram_size += end - start;
        can_use_brk_pgt = true;
    }

    return mapped_ram_size;
}
```

As can be seen, init_range_memory_mapping() calls the init_memory_mapping() function analyzed earlier, so it is what completes the page-table construction for the kernel direct-mapping region (low memory). Also note the use of pgt_buf_end and pgt_buf_top: before init_memory_mapping() is called, can_use_brk_pgt is computed so that, if the range being mapped overlaps the BRK page-table buffer that is still in use, the page-table pages are allocated from memblock instead. This situation, however, only arises on 64-bit systems; it does not on 32-bit, because the 32-bit kernel_physical_mapping_init() does not allocate memory through alloc_low_page() in a way that is affected here.
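The can_use_brk_pgt expression is simply a disjointness test: two half-open ranges [s1, e1) and [s2, e2) do not overlap exactly when max(s1, s2) &gt;= min(e1, e2). A tiny standalone check of that identity (the sample addresses are made up):

```c
/* Disjointness test behind can_use_brk_pgt: half-open ranges [s1, e1)
 * and [s2, e2) do not overlap iff max(s1, s2) >= min(e1, e2). */
#include <stdio.h>

static int disjoint(unsigned long s1, unsigned long e1,
                    unsigned long s2, unsigned long e2)
{
    unsigned long lo = s1 > s2 ? s1 : s2;
    unsigned long hi = e1 < e2 ? e1 : e2;
    return lo >= hi;
}

int main(void)
{
    /* pretend the BRK page-table buffer occupies [0x1000000, 0x1400000) */
    printf("%d\n", disjoint(0x2000000, 0x3000000, 0x1000000, 0x1400000)); /* 1: safe */
    printf("%d\n", disjoint(0x1200000, 0x3000000, 0x1000000, 0x1400000)); /* 0: overlaps */
    return 0;
}
```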

With this, the kernel page tables for low memory are fully established.

[http://blog.chinaunix.net/uid-26859697-id-4592327.html](http://blog.chinaunix.net/uid-26859697-id-4592327.html)