前面已经分析了内核页表的准备工作以及内核低端内存页表的建立,接着回到init_mem_mapping()中。低端内存页表建立之后,紧接着调用的是函数early_ioremap_page_table_range_init():



1. 【file:/arch/x86/mm/init.c】
2. /
3.   Build a proper pagetable for the kernel mappings. Up until this
4.   point, we've been running on some set of pagetables constructed by
5.   the boot process.
6.  
7.   If we're booting on native hardware, this will be a pagetable
8.   constructed in arch/x86/kernel/head_32.S. The root of the
9.   pagetable will be swapper_pg_dir.
10.  
11.   If we're booting paravirtualized under a hypervisor, then there are
12.   more options: we may already be running PAE, and the pagetable may
13.   or may not be based in swapper_pg_dir. In any case,
14.   paravirt_pagetable_init() will set up swapper_pg_dir
15.   appropriately for the rest of the initialization to work.
16.  
17.   In general, pagetable_init() assumes that the pagetable may already
18.   be partially populated, and so it avoids stomping on any existing
19.   mappings.
20.  /
21. void __init early_ioremap_page_table_range_init(void)
22. {
23.     pgd_t *pgd_base = swapper_pg_dir;
24.     unsigned long vaddr, end;
25.  
26.     /
27.       Fixed mappings, only the page table structure has to be
28.       created - mappings will be set by set_fixmap():
29.      /
30.     vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
31.     end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
32.     page_table_range_init(vaddr, end, pgd_base);
33.     early_ioremap_reset();
34. }
 

该函数主要是用于建立固定内存映射区的。固定内存映射区是指`FIXADDR_START`到`FIXADDR_TOP`的地址空间,该地址空间按功能特性划分为多个区间,各区间通过索引来区分,这些索引以枚举类型的形式定义在`enum fixed_addresses`里面。


1. 【file:/arch/x86/include/asm/fixmap.h】
2. /
3.   Here we define all the compile-time 'special' virtual
4.   addresses. The point is to have a constant address at
5.   compile time, but to set the physical address only
6.   in the boot process.
7.   for x86_32: We allocate these special addresses
8.   from the end of virtual memory (0xfffff000) backwards.
9.   Also this lets us do fail-safe vmalloc(), we
10.   can guarantee that these special addresses and
11.   vmalloc()-ed addresses never overlap.
12.  
13.   These 'compile-time allocated' memory buffers are
14.   fixed-size 4k pages (or larger if used with an increment
15.   higher than 1). Use set_fixmap(idx,phys) to associate
16.   physical memory with fixmap indices.
17.  
18.   TLB entries of such buffers will not be flushed across
19.   task switches.
20.  /
21. enum fixed_addresses {
22. #ifdef CONFIG_X86_32
23.     FIX_HOLE,
24.     FIX_VDSO,
25. #else
26.     VSYSCALL_LAST_PAGE,
27.     VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
28.                 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
29.     VVAR_PAGE,
30.     VSYSCALL_HPET,
31. #ifdef CONFIG_PARAVIRT_CLOCK
32.     PVCLOCK_FIXMAP_BEGIN,
33.     PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
34. #endif
35. #endif
36.     FIX_DBGP_BASE,
37.     FIX_EARLYCON_MEM_BASE,
38. #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
39.     FIX_OHCI1394_BASE,
40. #endif
41. #ifdef CONFIG_X86_LOCAL_APIC
42.     FIX_APIC_BASE, / local (CPU) APIC) -- required for SMP or not /
43. #endif
44. #ifdef CONFIG_X86_IO_APIC
45.     FIX_IO_APIC_BASE_0,
46.     FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
47. #endif
48. #ifdef CONFIG_X86_VISWS_APIC
49.     FIX_CO_CPU, / Cobalt timer /
50.     FIX_CO_APIC, / Cobalt APIC Redirection Table /
51.     FIX_LI_PCIA, / Lithium PCI Bridge A /
52.     FIX_LI_PCIB, / Lithium PCI Bridge B /
53. #endif
54.     FIX_RO_IDT, / Virtual mapping for read-only IDT /
55. #ifdef CONFIG_X86_32
56.     FIX_KMAP_BEGIN, / reserved pte's for temporary kernel mappings /
57.     FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
58. #ifdef CONFIG_PCI_MMCONFIG
59.     FIX_PCIE_MCFG,
60. #endif
61. #endif
62. #ifdef CONFIG_PARAVIRT
63.     FIX_PARAVIRT_BOOTMAP,
64. #endif
65.     FIX_TEXT_POKE1, / reserve 2 pages for text_poke() /
66.     FIX_TEXT_POKE0, / first page is last, because allocation is backward /
67. #ifdef CONFIG_X86_INTEL_MID
68.     FIX_LNW_VRTC,
69. #endif
70.     __end_of_permanent_fixed_addresses,
71.  
72.     /*
73.      * 256 temporary boot-time mappings, used by early_ioremap(),
74.      * before ioremap() is functional.
75.      *
76.      * If necessary we round it up to the next 256 pages boundary so
77.      * that we can have a single pgd entry and a single pte table:
78.      */
79. #define NR_FIX_BTMAPS 64
80. #define FIX_BTMAPS_SLOTS 4
81. #define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
82.     FIX_BTMAP_END =
83.      (__end_of_permanent_fixed_addresses ^
84.       (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
85.      -PTRS_PER_PTE
86.      ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
87.        (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
88.      : __end_of_permanent_fixed_addresses,
89.     FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
90. #ifdef CONFIG_X86_32
91.     FIX_WP_TEST,
92. #endif
93. #ifdef CONFIG_INTEL_TXT
94.     FIX_TBOOT_BASE,
95. #endif
96.     __end_of_fixed_addresses
97. };
&nbsp;

但是各枚举标识的分区并不是从低地址往高地址分布,而是自高地址往低地址分布。其中`__fix_to_virt`宏就是用来通过索引计算相应固定映射区域的线性地址的:

#define __fix_to_virt(x)         (FIXADDR_TOP - ((x) << PAGE_SHIFT))

对应的有虚拟地址转索引的宏:

#define __virt_to_fix(x)         ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)

接着回到`early_ioremap_page_table_range_init()`中调用的第一个函数`page_table_range_init()`:


1. 【file:/arch/x86/mm/init_32.c】
2. /
3.   This function initializes a certain range of kernel virtual memory
4.   with new bootmem page tables, everywhere page tables are missing in
5.   the given range.
6.  
7.   NOTE: The pagetables are allocated contiguous on the physical space
8.   so we can cache the place of the first one and move around without
9.   checking the pgd every time.
10.  /
11. static void __init
12. page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
13. {
14.     int pgd_idx, pmd_idx;
15.     unsigned long vaddr;
16.     pgd_t *pgd;
17.     pmd_t *pmd;
18.     pte_t *pte = NULL;
19.     unsigned long count = page_table_range_init_count(start, end);
20.     void *adr = NULL;
21.  
22.     if (count)
23.         adr = alloc_low_pages(count);
24.  
25.     vaddr = start;
26.     pgd_idx = pgd_index(vaddr);
27.     pmd_idx = pmd_index(vaddr);
28.     pgd = pgd_base + pgd_idx;
29.  
30.     for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
31.         pmd = one_md_table_init(pgd);
32.         pmd = pmd + pmd_index(vaddr);
33.         for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
34.                             pmd++, pmd_idx++) {
35.             pte = page_table_kmap_check(one_page_table_init(pmd),
36.                             pmd, vaddr, pte, &adr);
37.  
38.             vaddr += PMD_SIZE;
39.         }
40.         pmd_idx = 0;
41.     }
42. }
&nbsp;

该函数里面首先调用的是`page_table_range_init_count()`:


1. 【file:/arch/x86/mm/init_32.c】
2. static unsigned long __init
3. page_table_range_init_count(unsigned long start, unsigned long end)
4. {
5.     unsigned long count = 0;
6. #ifdef CONFIG_HIGHMEM
7.     int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
8.     int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
9.     int pgd_idx, pmd_idx;
10.     unsigned long vaddr;
11.  
12.     if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
13.         return 0;
14.  
15.     vaddr = start;
16.     pgd_idx = pgd_index(vaddr);
17.  
18.     for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
19.         for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
20.                             pmd_idx++) {
21.             if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
22.                 (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
23.                 count++;
24.             vaddr += PMD_SIZE;
25.         }
26.         pmd_idx = 0;
27.     }
28. #endif
29.     return count;
30. }
&nbsp;

`page_table_range_init_count()`用来计算临时内核映射区间的页表数量。前面提到`FIXADDR_START`到`FIXADDR_TOP`是固定映射区,其间有多个索引标识不同功能的映射区间,其中的一个区间`FIX_KMAP_BEGIN`到`FIX_KMAP_END`是临时内核映射区。顺便可以看一下两者的定义:

    FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */

    FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,

其中`KM_TYPE_NR`表示“窗口”数量,在高端内存的任意一个页框都可以通过一个“窗口”映射到内核地址空间,调用`kmap_atomic`可以搭建起“窗口”到高端内存的关系,即建立临时内核映射。而`NR_CPUS`则表示CPU数量。总的来说就是该临时内核映射区间是为了给各个CPU准备一个指定的窗口空间。由于`kmap_atomic()`对该区间的使用,所以该区间必须保证其页表连续性。

如果`page_table_range_init_count()`返回的页表数量不为`0`,紧接着调用的是`alloc_low_pages()`:


1. 【file:/arch/x86/mm/init.c】
2. /
3.   Pages returned are already directly mapped.
4.  
5.   Changing that is likely to break Xen, see commit:
6.  
7.   279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
8.  
9.   for detailed information.
10.  /
11. __ref void *alloc_low_pages(unsigned int num)
12. {
13.     unsigned long pfn;
14.     int i;
15.  
16.     if (after_bootmem) {
17.         unsigned int order;
18.  
19.         order = get_order((unsigned long)num << PAGE_SHIFT);
20.         return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
21.                         __GFP_ZERO, order);
22.     }
23.  
24.     if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
25.         unsigned long ret;
26.         if (min_pfn_mapped >= max_pfn_mapped)
27.             panic("alloc_low_pages: ran out of memory");
28.         ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
29.                     max_pfn_mapped << PAGE_SHIFT,
30.                     PAGE_SIZE * num , PAGE_SIZE);
31.         if (!ret)
32.             panic("alloc_low_pages: can not alloc memory");
33.         memblock_reserve(ret, PAGE_SIZE * num);
34.         pfn = ret >> PAGE_SHIFT;
35.     } else {
36.         pfn = pgt_buf_end;
37.         pgt_buf_end += num;
38.         printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
39.             pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
40.     }
41.  
42.     for (i = 0; i < num; i++) {
43.         void *adr;
44.  
45.         adr = __va((pfn + i) << PAGE_SHIFT);
46.         clear_page(adr);
47.     }
48.  
49.     return __va(pfn << PAGE_SHIFT);
50. }
&nbsp;

`alloc_low_pages()`则是根据前面`early_alloc_pgt_buf()`申请保留的页表缓冲空间的使用情况来判断,是从页表缓冲空间中申请,还是通过`memblock`算法申请页表内存。

回到`page_table_range_init()`,其中`one_md_table_init()`用于在入参`pgd`所指的页全局目录项不存在(未设置`_PAGE_PRESENT`)时,申请新物理页作为页中间目录;但是此次仅分析x86非PAE环境的情况,不存在页中间目录,故实际上返回的仍是入参。附代码:


1. 【file:/arch/x86/mm/init_32.c】
2. /
3.   Creates a middle page table and puts a pointer to it in the
4.   given global directory entry. This only returns the gd entry
5.   in non-PAE compilation mode, since the middle layer is folded.
6.  /
7. static pmd_t * __init one_md_table_init(pgd_t *pgd)
8. {
9.     pud_t *pud;
10.     pmd_t *pmd_table;
11.  
12. #ifdef CONFIG_X86_PAE
13.     if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
14.         pmd_table = (pmd_t *)alloc_low_page();
15.         paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
16.         set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
17.         pud = pud_offset(pgd, 0);
18.         BUG_ON(pmd_table != pmd_offset(pud, 0));
19.  
20.         return pmd_table;
21.     }
22. #endif
23.     pud = pud_offset(pgd, 0);
24.     pmd_table = pmd_offset(pud, 0);
25.  
26.     return pmd_table;
27. }
&nbsp;

接着的是`page_table_kmap_check()`,其入参中调用的`one_page_table_init()`用于在入参`pmd`没有页表指向时,创建页表并使其指向被创建的页表。`page_table_kmap_check()`实现:


1. 【file:/arch/x86/mm/init_32.c】
2. static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
3.                        unsigned long vaddr, pte_t *lastpte,
4.                        void **adr)
5. {
6. #ifdef CONFIG_HIGHMEM
7.     /
8.       Something (early fixmap) may already have put a pte
9.       page here, which causes the page table allocation
10.       to become nonlinear. Attempt to fix it, and if it
11.       is still nonlinear then we have to bug.
12.      /
13.     int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
14.     int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
15.  
16.     if (pmd_idx_kmap_begin != pmd_idx_kmap_end
17.         && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
18.         && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
19.         pte_t *newpte;
20.         int i;
21.  
22.         BUG_ON(after_bootmem);
23.         newpte = *adr;
24.         for (i = 0; i < PTRS_PER_PTE; i++)
25.             set_pte(newpte + i, pte[i]);
26.         *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
27.  
28.         paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
29.         set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
30.         BUG_ON(newpte != pte_offset_kernel(pmd, 0));
31.         __flush_tlb_all();
32.  
33.         paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
34.         pte = newpte;
35.     }
36.     BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
37.            && vaddr > fix_to_virt(FIX_KMAP_END)
38.            && lastpte && lastpte + PTRS_PER_PTE != pte);
39. #endif
40.     return pte;
41. }
&nbsp;

可以看到这里再次出现临时内核映射区间的标识(`FIX_KMAP_END`和`FIX_KMAP_BEGIN`),检查当前页表初始化的地址是否处于该区间范围,如果是,则把其`pte`页表的内容拷贝到`page_table_range_init()`申请的页表空间中,并将`newpte`新页表的地址设置到`pmd`中(32bit系统实际上就是页全局目录),然后调用`__flush_tlb_all()`刷新TLB缓存;如果不是该区间,则仅是由入参中调用的`one_page_table_init()`分配页表空间。

由此,可以知道`page_table_range_init()`主要做了什么:由于`kmap_atomic()`对该区间的使用,该区间必须保证其页表连续性。前期可能已经为固定映射区分配了页表项,为了保证临时内核映射区间的页表连续性,在此重新申请连续的页表空间,并将原页表内容拷贝至此。值得注意的是,与低端内存的页表初始化不同的是,这里的页表只是被分配,相应的PTE项并未初始化,这个工作将会交由以后各个固定映射区部分的相关代码调用`set_fixmap()`来将相关的固定映射区页表与物理内存关联。

`early_ioremap_page_table_range_init()`函数再往下的`early_ioremap_reset()`仅是对`after_paging_init`全局变量赋值。

最后退出`early_ioremap_page_table_range_init()`后,`init_mem_mapping()`调用`load_cr3()`刷新CR3寄存器,`__flush_tlb_all()`则用于刷新TLB,由此启用新的内存分页映射。

至此,内核页表建立完毕。

[http://blog.chinaunix.net/uid-26859697-id-4687399.html](http://blog.chinaunix.net/uid-26859697-id-4687399.html)