xref: /linux/arch/x86/mm/init.c (revision fc8e1ead9314cf0e0f1922e661428b93d3a50d88)
1 #include <linux/initrd.h>
2 #include <linux/ioport.h>
3 #include <linux/swap.h>
4 
5 #include <asm/cacheflush.h>
6 #include <asm/e820.h>
7 #include <asm/init.h>
8 #include <asm/page.h>
9 #include <asm/page_types.h>
10 #include <asm/sections.h>
11 #include <asm/setup.h>
12 #include <asm/system.h>
13 #include <asm/tlbflush.h>
14 #include <asm/tlb.h>
15 #include <asm/proto.h>
16 
17 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
18 
19 unsigned long __initdata e820_table_start;
20 unsigned long __meminitdata e820_table_end;
21 unsigned long __meminitdata e820_table_top;
22 
23 int after_bootmem;
24 
25 int direct_gbpages
26 #ifdef CONFIG_DIRECT_GBPAGES
27 				= 1
28 #endif
29 ;
30 
31 int nx_enabled;
32 
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* Non-zero once "noexec=off" has been seen on the command line. */
static int disable_nx __cpuinitdata;

/*
 * Early parameter handler for "noexec=".
 *
 *   noexec=on   allow non-executable mappings (sets _PAGE_NX in the
 *               supported pte mask, clears disable_nx)
 *   noexec=off  forbid them (clears _PAGE_NX, sets disable_nx)
 *
 * Only the leading "on"/"off" characters are compared; any other value
 * is silently accepted and changes nothing.
 *
 * Returns -EINVAL when no value was supplied, 0 otherwise.
 */
static int __init noexec_setup(char *str)
{
	if (str == NULL)
		return -EINVAL;

	if (strncmp(str, "on", 2) == 0) {
		disable_nx = 0;
		__supported_pte_mask |= _PAGE_NX;
		return 0;
	}

	if (strncmp(str, "off", 3) == 0) {
		__supported_pte_mask &= ~_PAGE_NX;
		disable_nx = 1;
	}

	return 0;
}
early_param("noexec", noexec_setup);
#endif
59 
#ifdef CONFIG_X86_PAE
/*
 * Switch on NX in EFER when the CPU has PAE, reports extended CPUID leaf
 * 0x80000001 with bit 20 of EDX set, and NX was not disabled on the
 * command line. On success nx_enabled is set and _PAGE_NX is added to
 * the supported pte mask.
 */
static void __init set_nx(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int efer_lo, efer_hi;

	if (!cpu_has_pae)
		return;
	if (cpuid_eax(0x80000000) <= 0x80000001)
		return;

	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
	if (!(edx & (1 << 20)) || disable_nx)
		return;

	rdmsr(MSR_EFER, efer_lo, efer_hi);
	efer_lo |= EFER_NX;
	wrmsr(MSR_EFER, efer_lo, efer_hi);

	nx_enabled = 1;
	__supported_pte_mask |= _PAGE_NX;
}
#else
/* Without CONFIG_X86_PAE there is nothing to do here. */
static inline void set_nx(void)
{
}
#endif
82 
#ifdef CONFIG_X86_64
/*
 * Drop _PAGE_NX from the supported pte mask unless EFER has NX set and
 * NX has not been disabled via "noexec=off".
 */
void __cpuinit check_efer(void)
{
	unsigned long msr_val;

	rdmsrl(MSR_EFER, msr_val);

	if ((msr_val & EFER_NX) && !disable_nx)
		return;

	__supported_pte_mask &= ~_PAGE_NX;
}
#endif
93 
94 static void __init find_early_table_space(unsigned long end, int use_pse,
95 					  int use_gbpages)
96 {
97 	unsigned long puds, pmds, ptes, tables, start;
98 
99 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
100 	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
101 
102 	if (use_gbpages) {
103 		unsigned long extra;
104 
105 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
106 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
107 	} else
108 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
109 
110 	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
111 
112 	if (use_pse) {
113 		unsigned long extra;
114 
115 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
116 #ifdef CONFIG_X86_32
117 		extra += PMD_SIZE;
118 #endif
119 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
120 	} else
121 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
122 
123 	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
124 
125 #ifdef CONFIG_X86_32
126 	/* for fixmap */
127 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
128 #endif
129 
130 	/*
131 	 * RED-PEN putting page tables only on node 0 could
132 	 * cause a hotspot and fill up ZONE_DMA. The page tables
133 	 * need roughly 0.5KB per GB.
134 	 */
135 #ifdef CONFIG_X86_32
136 	start = 0x7000;
137 #else
138 	start = 0x8000;
139 #endif
140 	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
141 					tables, PAGE_SIZE);
142 	if (e820_table_start == -1UL)
143 		panic("Cannot find space for the kernel page tables");
144 
145 	e820_table_start >>= PAGE_SHIFT;
146 	e820_table_end = e820_table_start;
147 	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
148 
149 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
150 		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
151 }
152 
/*
 * One piece of the range handled by init_memory_mapping(): a byte range
 * plus a bit mask (1 << PG_LEVEL_*) of the large page sizes it may use.
 */
struct map_range {
	unsigned long start;		/* first byte address of the piece */
	unsigned long end;		/* byte address just past the piece */
	unsigned int page_size_mask;	/* allowed large page levels */
};

/* Capacity of the map_range array built by init_memory_mapping(). */
#if defined(CONFIG_X86_32)
# define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
# define NR_RANGE_MR 5
#endif
164 
165 static int __meminit save_mr(struct map_range *mr, int nr_range,
166 			     unsigned long start_pfn, unsigned long end_pfn,
167 			     unsigned long page_size_mask)
168 {
169 	if (start_pfn < end_pfn) {
170 		if (nr_range >= NR_RANGE_MR)
171 			panic("run out of range for init_memory_mapping\n");
172 		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
173 		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
174 		mr[nr_range].page_size_mask = page_size_mask;
175 		nr_range++;
176 	}
177 
178 	return nr_range;
179 }
180 
181 /*
182  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
183  * This runs before bootmem is initialized and gets pages directly from
184  * the physical memory. To access them they are temporarily mapped.
 *
 * @start: address at which the range to map begins.
 * @end:   address at which the range to map ends (exclusive).
 *
 * The range is cut into at most NR_RANGE_MR pieces, each tagged with the
 * large page sizes it may use; neighbouring pieces with identical masks
 * are merged, and every remaining piece is passed to
 * kernel_physical_mapping_init().
 *
 * Returns a page frame number: on 32-bit that of @end, on 64-bit the
 * result of the last kernel_physical_mapping_init() call shifted down
 * by PAGE_SHIFT.
185  */
186 unsigned long __init_refok init_memory_mapping(unsigned long start,
187 					       unsigned long end)
188 {
189 	unsigned long page_size_mask = 0;
190 	unsigned long start_pfn, end_pfn;
191 	unsigned long ret = 0;
	/* byte address up to which ranges have been recorded so far */
192 	unsigned long pos;
193 
194 	struct map_range mr[NR_RANGE_MR];
195 	int nr_range, i;
196 	int use_pse, use_gbpages;
197 
198 	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
199 
200 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
201 	/*
202 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
203 	 * This will simplify cpa(), which otherwise needs to support splitting
204 	 * large pages into small in interrupt context, etc.
205 	 */
206 	use_pse = use_gbpages = 0;
207 #else
208 	use_pse = cpu_has_pse;
209 	use_gbpages = direct_gbpages;
210 #endif
211 
	/* set_nx() is an empty stub unless CONFIG_X86_PAE is set */
212 	set_nx();
213 	if (nx_enabled)
214 		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
215 
216 	/* Enable PSE if available */
217 	if (cpu_has_pse)
218 		set_in_cr4(X86_CR4_PSE);
219 
220 	/* Enable PGE if available */
221 	if (cpu_has_pge) {
222 		set_in_cr4(X86_CR4_PGE);
223 		__supported_pte_mask |= _PAGE_GLOBAL;
224 	}
225 
	/* collect the large page levels the ranges below may use */
226 	if (use_gbpages)
227 		page_size_mask |= 1 << PG_LEVEL_1G;
228 	if (use_pse)
229 		page_size_mask |= 1 << PG_LEVEL_2M;
230 
231 	memset(mr, 0, sizeof(mr));
232 	nr_range = 0;
233 
234 	/* head if not big page alignment ? */
235 	start_pfn = start >> PAGE_SHIFT;
236 	pos = start_pfn << PAGE_SHIFT;
237 #ifdef CONFIG_X86_32
238 	/*
239 	 * Don't use a large page for the first 2/4MB of memory
240 	 * because there are often fixed size MTRRs in there
241 	 * and overlapping MTRRs into large pages can cause
242 	 * slowdowns.
243 	 */
244 	if (pos == 0)
245 		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
246 	else
247 		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
248 				 << (PMD_SHIFT - PAGE_SHIFT);
249 #else /* CONFIG_X86_64 */
250 	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
251 			<< (PMD_SHIFT - PAGE_SHIFT);
252 #endif
	/* never let the head range run past the requested end */
253 	if (end_pfn > (end >> PAGE_SHIFT))
254 		end_pfn = end >> PAGE_SHIFT;
255 	if (start_pfn < end_pfn) {
		/* mask 0: the head is mapped without large pages */
256 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
257 		pos = end_pfn << PAGE_SHIFT;
258 	}
259 
260 	/* big page (2M) range */
261 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
262 			 << (PMD_SHIFT - PAGE_SHIFT);
263 #ifdef CONFIG_X86_32
264 	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
265 #else /* CONFIG_X86_64 */
	/* stop at the next PUD boundary, or at the last PMD boundary below end */
266 	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
267 			 << (PUD_SHIFT - PAGE_SHIFT);
268 	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
269 		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
270 #endif
271 
272 	if (start_pfn < end_pfn) {
273 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
274 				page_size_mask & (1<<PG_LEVEL_2M));
275 		pos = end_pfn << PAGE_SHIFT;
276 	}
277 
278 #ifdef CONFIG_X86_64
279 	/* big page (1G) range */
280 	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
281 			 << (PUD_SHIFT - PAGE_SHIFT);
282 	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
283 	if (start_pfn < end_pfn) {
284 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
285 				page_size_mask &
286 				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
287 		pos = end_pfn << PAGE_SHIFT;
288 	}
289 
290 	/* tail is not big page (1G) alignment */
291 	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
292 			 << (PMD_SHIFT - PAGE_SHIFT);
293 	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
294 	if (start_pfn < end_pfn) {
295 		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
296 				page_size_mask & (1<<PG_LEVEL_2M));
297 		pos = end_pfn << PAGE_SHIFT;
298 	}
299 #endif
300 
301 	/* tail is not big page (2M) alignment */
302 	start_pfn = pos>>PAGE_SHIFT;
303 	end_pfn = end>>PAGE_SHIFT;
	/* save_mr() itself ignores the range when it is empty */
304 	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
305 
306 	/* try to merge same page size and continuous */
307 	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
308 		unsigned long old_start;
309 		if (mr[i].end != mr[i+1].start ||
310 		    mr[i].page_size_mask != mr[i+1].page_size_mask)
311 			continue;
312 		/* move it */
313 		old_start = mr[i].start;
314 		memmove(&mr[i], &mr[i+1],
315 			(nr_range - 1 - i) * sizeof(struct map_range));
		/*
		 * Slot i now holds the former mr[i+1]; give it the merged
		 * start and step i back so the loop's i++ revisits this
		 * slot against its new neighbour.
		 */
316 		mr[i--].start = old_start;
317 		nr_range--;
318 	}
319 
320 	for (i = 0; i < nr_range; i++)
321 		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
322 				mr[i].start, mr[i].end,
323 			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
324 			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
325 
326 	/*
327 	 * Find space for the kernel direct mapping tables.
328 	 *
329 	 * Later we should allocate these tables in the local node of the
330 	 * memory mapped. Unfortunately this is done currently before the
331 	 * nodes are discovered.
332 	 */
333 	if (!after_bootmem)
334 		find_early_table_space(end, use_pse, use_gbpages);
335 
336 #ifdef CONFIG_X86_32
	/* the 32-bit result is ignored here; @end is reported instead */
337 	for (i = 0; i < nr_range; i++)
338 		kernel_physical_mapping_init(mr[i].start, mr[i].end,
339 					     mr[i].page_size_mask);
340 	ret = end;
341 #else /* CONFIG_X86_64 */
	/* ret keeps the value from the last range; it stays 0 if there is none */
342 	for (i = 0; i < nr_range; i++)
343 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
344 						   mr[i].page_size_mask);
345 #endif
346 
347 #ifdef CONFIG_X86_32
348 	early_ioremap_page_table_range_init();
349 
350 	load_cr3(swapper_pg_dir);
351 #endif
352 
353 #ifdef CONFIG_X86_64
	/* only for the early-boot call whose range starts at address 0 */
354 	if (!after_bootmem && !start) {
355 		pud_t *pud;
356 		pmd_t *pmd;
357 
358 		mmu_cr4_features = read_cr4();
359 
360 		/*
361 		 * _brk_end cannot change anymore, but it and _end may be
362 		 * located on different 2M pages. cleanup_highmap(), however,
363 		 * can only consider _end when it runs, so destroy any
364 		 * mappings beyond _brk_end here.
365 		 */
366 		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
367 		pmd = pmd_offset(pud, _brk_end - 1);
		/* the pmd covering _brk_end - 1 is kept; later ones up to _end - 1 go */
368 		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
369 			pmd_clear(pmd);
370 	}
371 #endif
372 	__flush_tlb_all();
373 
	/*
	 * Reserve [e820_table_start, e820_table_end) under the name
	 * "PGTABLE". NOTE(review): e820_table_end is presumably advanced by
	 * the page-table allocator outside this file - confirm.
	 */
374 	if (!after_bootmem && e820_table_end > e820_table_start)
375 		reserve_early(e820_table_start << PAGE_SHIFT,
376 				 e820_table_end << PAGE_SHIFT, "PGTABLE");
377 
378 	if (!after_bootmem)
379 		early_memtest(start, end);
380 
	/* callers get a page frame number, not a byte address */
381 	return ret >> PAGE_SHIFT;
382 }
383 
384 
385 /*
386  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
387  * is valid. The argument is a physical page number.
388  *
389  *
390  * On x86, access has to be given to the first megabyte of ram because that area
391  * contains bios code and data regions used by X and dosemu and similar apps.
392  * Access has to be given to non-kernel-ram areas as well, these contain the PCI
393  * mmio resources as well as potential bios/acpi data regions.
394  */
395 int devmem_is_allowed(unsigned long pagenr)
396 {
397 	if (pagenr <= 256)
398 		return 1;
399 	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
400 		return 0;
401 	if (!page_is_ram(pagenr))
402 		return 1;
403 	return 0;
404 }
405 
406 void free_init_pages(char *what, unsigned long begin, unsigned long end)
407 {
408 	unsigned long addr = begin;
409 
410 	if (addr >= end)
411 		return;
412 
413 	/*
414 	 * If debugging page accesses then do not free this memory but
415 	 * mark them not present - any buggy init-section access will
416 	 * create a kernel page fault:
417 	 */
418 #ifdef CONFIG_DEBUG_PAGEALLOC
419 	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
420 		begin, PAGE_ALIGN(end));
421 	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
422 #else
423 	/*
424 	 * We just marked the kernel text read only above, now that
425 	 * we are going to free part of that, we need to make that
426 	 * writeable first.
427 	 */
428 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
429 
430 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
431 
432 	for (; addr < end; addr += PAGE_SIZE) {
433 		ClearPageReserved(virt_to_page(addr));
434 		init_page_count(virt_to_page(addr));
435 		memset((void *)(addr & ~(PAGE_SIZE-1)),
436 			POISON_FREE_INITMEM, PAGE_SIZE);
437 		free_page(addr);
438 		totalram_pages++;
439 	}
440 #endif
441 }
442 
443 void free_initmem(void)
444 {
445 	free_init_pages("unused kernel memory",
446 			(unsigned long)(&__init_begin),
447 			(unsigned long)(&__init_end));
448 }
449 
450 #ifdef CONFIG_BLK_DEV_INITRD
451 void free_initrd_mem(unsigned long start, unsigned long end)
452 {
453 	free_init_pages("initrd memory", start, end);
454 }
455 #endif
456