1 #include <linux/gfp.h> 2 #include <linux/initrd.h> 3 #include <linux/ioport.h> 4 #include <linux/swap.h> 5 #include <linux/memblock.h> 6 #include <linux/bootmem.h> /* for max_low_pfn */ 7 8 #include <asm/set_memory.h> 9 #include <asm/e820/api.h> 10 #include <asm/init.h> 11 #include <asm/page.h> 12 #include <asm/page_types.h> 13 #include <asm/sections.h> 14 #include <asm/setup.h> 15 #include <asm/tlbflush.h> 16 #include <asm/tlb.h> 17 #include <asm/proto.h> 18 #include <asm/dma.h> /* for MAX_DMA_PFN */ 19 #include <asm/microcode.h> 20 #include <asm/kaslr.h> 21 #include <asm/hypervisor.h> 22 #include <asm/cpufeature.h> 23 #include <asm/pti.h> 24 25 /* 26 * We need to define the tracepoints somewhere, and tlb.c 27 * is only compiled when SMP=y. 28 */ 29 #define CREATE_TRACE_POINTS 30 #include <trace/events/tlb.h> 31 32 #include "mm_internal.h" 33 34 /* 35 * Tables translating between page_cache_type_t and pte encoding. 36 * 37 * The default values are defined statically as minimal supported mode; 38 * WC and WT fall back to UC-. pat_init() updates these values to support 39 * more cache modes, WC and WT, when it is safe to do so. See pat_init() 40 * for the details. Note, __early_ioremap() used during early boot-time 41 * takes pgprot_t (pte encoding) and does not use these tables. 42 * 43 * Index into __cachemode2pte_tbl[] is the cachemode. 44 * 45 * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte 46 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
47 */ 48 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { 49 [_PAGE_CACHE_MODE_WB ] = 0 | 0 , 50 [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, 51 [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, 52 [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, 53 [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, 54 [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, 55 }; 56 EXPORT_SYMBOL(__cachemode2pte_tbl); 57 58 uint8_t __pte2cachemode_tbl[8] = { 59 [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, 60 [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, 61 [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, 62 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, 63 [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, 64 [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, 65 [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, 66 [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, 67 }; 68 EXPORT_SYMBOL(__pte2cachemode_tbl); 69 70 static unsigned long __initdata pgt_buf_start; 71 static unsigned long __initdata pgt_buf_end; 72 static unsigned long __initdata pgt_buf_top; 73 74 static unsigned long min_pfn_mapped; 75 76 static bool __initdata can_use_brk_pgt = true; 77 78 /* 79 * Pages returned are already directly mapped. 80 * 81 * Changing that is likely to break Xen, see commit: 82 * 83 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve 84 * 85 * for detailed information. 
86 */ 87 __ref void *alloc_low_pages(unsigned int num) 88 { 89 unsigned long pfn; 90 int i; 91 92 if (after_bootmem) { 93 unsigned int order; 94 95 order = get_order((unsigned long)num << PAGE_SHIFT); 96 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); 97 } 98 99 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { 100 unsigned long ret; 101 if (min_pfn_mapped >= max_pfn_mapped) 102 panic("alloc_low_pages: ran out of memory"); 103 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, 104 max_pfn_mapped << PAGE_SHIFT, 105 PAGE_SIZE * num , PAGE_SIZE); 106 if (!ret) 107 panic("alloc_low_pages: can not alloc memory"); 108 memblock_reserve(ret, PAGE_SIZE * num); 109 pfn = ret >> PAGE_SHIFT; 110 } else { 111 pfn = pgt_buf_end; 112 pgt_buf_end += num; 113 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", 114 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); 115 } 116 117 for (i = 0; i < num; i++) { 118 void *adr; 119 120 adr = __va((pfn + i) << PAGE_SHIFT); 121 clear_page(adr); 122 } 123 124 return __va(pfn << PAGE_SHIFT); 125 } 126 127 /* 128 * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. 129 * With KASLR memory randomization, depending on the machine e820 memory 130 * and the PUD alignment. We may need twice more pages when KASLR memory 131 * randomization is enabled. 
132 */ 133 #ifndef CONFIG_RANDOMIZE_MEMORY 134 #define INIT_PGD_PAGE_COUNT 6 135 #else 136 #define INIT_PGD_PAGE_COUNT 12 137 #endif 138 #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) 139 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); 140 void __init early_alloc_pgt_buf(void) 141 { 142 unsigned long tables = INIT_PGT_BUF_SIZE; 143 phys_addr_t base; 144 145 base = __pa(extend_brk(tables, PAGE_SIZE)); 146 147 pgt_buf_start = base >> PAGE_SHIFT; 148 pgt_buf_end = pgt_buf_start; 149 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 150 } 151 152 int after_bootmem; 153 154 early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); 155 156 struct map_range { 157 unsigned long start; 158 unsigned long end; 159 unsigned page_size_mask; 160 }; 161 162 static int page_size_mask; 163 164 static void __init probe_page_size_mask(void) 165 { 166 /* 167 * For pagealloc debugging, identity mapping will use small pages. 168 * This will simplify cpa(), which otherwise needs to support splitting 169 * large pages into small in interrupt context, etc. 
170 */ 171 if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) 172 page_size_mask |= 1 << PG_LEVEL_2M; 173 else 174 direct_gbpages = 0; 175 176 /* Enable PSE if available */ 177 if (boot_cpu_has(X86_FEATURE_PSE)) 178 cr4_set_bits_and_update_boot(X86_CR4_PSE); 179 180 /* Enable PGE if available */ 181 __supported_pte_mask &= ~_PAGE_GLOBAL; 182 if (boot_cpu_has(X86_FEATURE_PGE)) { 183 cr4_set_bits_and_update_boot(X86_CR4_PGE); 184 __supported_pte_mask |= _PAGE_GLOBAL; 185 } 186 187 /* By the default is everything supported: */ 188 __default_kernel_pte_mask = __supported_pte_mask; 189 /* Except when with PTI where the kernel is mostly non-Global: */ 190 if (cpu_feature_enabled(X86_FEATURE_PTI)) 191 __default_kernel_pte_mask &= ~_PAGE_GLOBAL; 192 193 /* Enable 1 GB linear kernel mappings if available: */ 194 if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { 195 printk(KERN_INFO "Using GB pages for direct mapping\n"); 196 page_size_mask |= 1 << PG_LEVEL_1G; 197 } else { 198 direct_gbpages = 0; 199 } 200 } 201 202 static void setup_pcid(void) 203 { 204 if (!IS_ENABLED(CONFIG_X86_64)) 205 return; 206 207 if (!boot_cpu_has(X86_FEATURE_PCID)) 208 return; 209 210 if (boot_cpu_has(X86_FEATURE_PGE)) { 211 /* 212 * This can't be cr4_set_bits_and_update_boot() -- the 213 * trampoline code can't handle CR4.PCIDE and it wouldn't 214 * do any good anyway. Despite the name, 215 * cr4_set_bits_and_update_boot() doesn't actually cause 216 * the bits in question to remain set all the way through 217 * the secondary boot asm. 218 * 219 * Instead, we brute-force it and set CR4.PCIDE manually in 220 * start_secondary(). 221 */ 222 cr4_set_bits(X86_CR4_PCIDE); 223 224 /* 225 * INVPCID's single-context modes (2/3) only work if we set 226 * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable 227 * on systems that have X86_CR4_PCIDE clear, or that have 228 * no INVPCID support at all. 
229 */ 230 if (boot_cpu_has(X86_FEATURE_INVPCID)) 231 setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); 232 } else { 233 /* 234 * flush_tlb_all(), as currently implemented, won't work if 235 * PCID is on but PGE is not. Since that combination 236 * doesn't exist on real hardware, there's no reason to try 237 * to fully support it, but it's polite to avoid corrupting 238 * data if we're on an improperly configured VM. 239 */ 240 setup_clear_cpu_cap(X86_FEATURE_PCID); 241 } 242 } 243 244 #ifdef CONFIG_X86_32 245 #define NR_RANGE_MR 3 246 #else /* CONFIG_X86_64 */ 247 #define NR_RANGE_MR 5 248 #endif 249 250 static int __meminit save_mr(struct map_range *mr, int nr_range, 251 unsigned long start_pfn, unsigned long end_pfn, 252 unsigned long page_size_mask) 253 { 254 if (start_pfn < end_pfn) { 255 if (nr_range >= NR_RANGE_MR) 256 panic("run out of range for init_memory_mapping\n"); 257 mr[nr_range].start = start_pfn<<PAGE_SHIFT; 258 mr[nr_range].end = end_pfn<<PAGE_SHIFT; 259 mr[nr_range].page_size_mask = page_size_mask; 260 nr_range++; 261 } 262 263 return nr_range; 264 } 265 266 /* 267 * adjust the page_size_mask for small range to go with 268 * big page size instead small one if nearby are ram too. 
269 */ 270 static void __ref adjust_range_page_size_mask(struct map_range *mr, 271 int nr_range) 272 { 273 int i; 274 275 for (i = 0; i < nr_range; i++) { 276 if ((page_size_mask & (1<<PG_LEVEL_2M)) && 277 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { 278 unsigned long start = round_down(mr[i].start, PMD_SIZE); 279 unsigned long end = round_up(mr[i].end, PMD_SIZE); 280 281 #ifdef CONFIG_X86_32 282 if ((end >> PAGE_SHIFT) > max_low_pfn) 283 continue; 284 #endif 285 286 if (memblock_is_region_memory(start, end - start)) 287 mr[i].page_size_mask |= 1<<PG_LEVEL_2M; 288 } 289 if ((page_size_mask & (1<<PG_LEVEL_1G)) && 290 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { 291 unsigned long start = round_down(mr[i].start, PUD_SIZE); 292 unsigned long end = round_up(mr[i].end, PUD_SIZE); 293 294 if (memblock_is_region_memory(start, end - start)) 295 mr[i].page_size_mask |= 1<<PG_LEVEL_1G; 296 } 297 } 298 } 299 300 static const char *page_size_string(struct map_range *mr) 301 { 302 static const char str_1g[] = "1G"; 303 static const char str_2m[] = "2M"; 304 static const char str_4m[] = "4M"; 305 static const char str_4k[] = "4k"; 306 307 if (mr->page_size_mask & (1<<PG_LEVEL_1G)) 308 return str_1g; 309 /* 310 * 32-bit without PAE has a 4M large page size. 311 * PG_LEVEL_2M is misnamed, but we can at least 312 * print out the right size in the string. 313 */ 314 if (IS_ENABLED(CONFIG_X86_32) && 315 !IS_ENABLED(CONFIG_X86_PAE) && 316 mr->page_size_mask & (1<<PG_LEVEL_2M)) 317 return str_4m; 318 319 if (mr->page_size_mask & (1<<PG_LEVEL_2M)) 320 return str_2m; 321 322 return str_4k; 323 } 324 325 static int __meminit split_mem_range(struct map_range *mr, int nr_range, 326 unsigned long start, 327 unsigned long end) 328 { 329 unsigned long start_pfn, end_pfn, limit_pfn; 330 unsigned long pfn; 331 int i; 332 333 limit_pfn = PFN_DOWN(end); 334 335 /* head if not big page alignment ? 
*/ 336 pfn = start_pfn = PFN_DOWN(start); 337 #ifdef CONFIG_X86_32 338 /* 339 * Don't use a large page for the first 2/4MB of memory 340 * because there are often fixed size MTRRs in there 341 * and overlapping MTRRs into large pages can cause 342 * slowdowns. 343 */ 344 if (pfn == 0) 345 end_pfn = PFN_DOWN(PMD_SIZE); 346 else 347 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 348 #else /* CONFIG_X86_64 */ 349 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 350 #endif 351 if (end_pfn > limit_pfn) 352 end_pfn = limit_pfn; 353 if (start_pfn < end_pfn) { 354 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 355 pfn = end_pfn; 356 } 357 358 /* big page (2M) range */ 359 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 360 #ifdef CONFIG_X86_32 361 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 362 #else /* CONFIG_X86_64 */ 363 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); 364 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) 365 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 366 #endif 367 368 if (start_pfn < end_pfn) { 369 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 370 page_size_mask & (1<<PG_LEVEL_2M)); 371 pfn = end_pfn; 372 } 373 374 #ifdef CONFIG_X86_64 375 /* big page (1G) range */ 376 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); 377 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); 378 if (start_pfn < end_pfn) { 379 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 380 page_size_mask & 381 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); 382 pfn = end_pfn; 383 } 384 385 /* tail is not big page (1G) alignment */ 386 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); 387 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); 388 if (start_pfn < end_pfn) { 389 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 390 page_size_mask & (1<<PG_LEVEL_2M)); 391 pfn = end_pfn; 392 } 393 #endif 394 395 /* tail is not big page (2M) alignment */ 396 start_pfn = pfn; 397 end_pfn = limit_pfn; 398 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 399 
400 if (!after_bootmem) 401 adjust_range_page_size_mask(mr, nr_range); 402 403 /* try to merge same page size and continuous */ 404 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { 405 unsigned long old_start; 406 if (mr[i].end != mr[i+1].start || 407 mr[i].page_size_mask != mr[i+1].page_size_mask) 408 continue; 409 /* move it */ 410 old_start = mr[i].start; 411 memmove(&mr[i], &mr[i+1], 412 (nr_range - 1 - i) * sizeof(struct map_range)); 413 mr[i--].start = old_start; 414 nr_range--; 415 } 416 417 for (i = 0; i < nr_range; i++) 418 pr_debug(" [mem %#010lx-%#010lx] page %s\n", 419 mr[i].start, mr[i].end - 1, 420 page_size_string(&mr[i])); 421 422 return nr_range; 423 } 424 425 struct range pfn_mapped[E820_MAX_ENTRIES]; 426 int nr_pfn_mapped; 427 428 static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) 429 { 430 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, 431 nr_pfn_mapped, start_pfn, end_pfn); 432 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); 433 434 max_pfn_mapped = max(max_pfn_mapped, end_pfn); 435 436 if (start_pfn < (1UL<<(32-PAGE_SHIFT))) 437 max_low_pfn_mapped = max(max_low_pfn_mapped, 438 min(end_pfn, 1UL<<(32-PAGE_SHIFT))); 439 } 440 441 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) 442 { 443 int i; 444 445 for (i = 0; i < nr_pfn_mapped; i++) 446 if ((start_pfn >= pfn_mapped[i].start) && 447 (end_pfn <= pfn_mapped[i].end)) 448 return true; 449 450 return false; 451 } 452 453 /* 454 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 455 * This runs before bootmem is initialized and gets pages directly from 456 * the physical memory. To access them they are temporarily mapped. 
457 */ 458 unsigned long __ref init_memory_mapping(unsigned long start, 459 unsigned long end) 460 { 461 struct map_range mr[NR_RANGE_MR]; 462 unsigned long ret = 0; 463 int nr_range, i; 464 465 pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", 466 start, end - 1); 467 468 memset(mr, 0, sizeof(mr)); 469 nr_range = split_mem_range(mr, 0, start, end); 470 471 for (i = 0; i < nr_range; i++) 472 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 473 mr[i].page_size_mask); 474 475 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); 476 477 return ret >> PAGE_SHIFT; 478 } 479 480 /* 481 * We need to iterate through the E820 memory map and create direct mappings 482 * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply 483 * create direct mappings for all pfns from [0 to max_low_pfn) and 484 * [4GB to max_pfn) because of possible memory holes in high addresses 485 * that cannot be marked as UC by fixed/variable range MTRRs. 486 * Depending on the alignment of E820 ranges, this may possibly result 487 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. 488 * 489 * init_mem_mapping() calls init_range_memory_mapping() with big range. 490 * That range would have hole in the middle or ends, and only ram parts 491 * will be mapped in init_range_memory_mapping(). 492 */ 493 static unsigned long __init init_range_memory_mapping( 494 unsigned long r_start, 495 unsigned long r_end) 496 { 497 unsigned long start_pfn, end_pfn; 498 unsigned long mapped_ram_size = 0; 499 int i; 500 501 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 502 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); 503 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); 504 if (start >= end) 505 continue; 506 507 /* 508 * if it is overlapping with brk pgt, we need to 509 * alloc pgt buf from memblock instead. 
510 */ 511 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= 512 min(end, (u64)pgt_buf_top<<PAGE_SHIFT); 513 init_memory_mapping(start, end); 514 mapped_ram_size += end - start; 515 can_use_brk_pgt = true; 516 } 517 518 return mapped_ram_size; 519 } 520 521 static unsigned long __init get_new_step_size(unsigned long step_size) 522 { 523 /* 524 * Initial mapped size is PMD_SIZE (2M). 525 * We can not set step_size to be PUD_SIZE (1G) yet. 526 * In worse case, when we cross the 1G boundary, and 527 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) 528 * to map 1G range with PTE. Hence we use one less than the 529 * difference of page table level shifts. 530 * 531 * Don't need to worry about overflow in the top-down case, on 32bit, 532 * when step_size is 0, round_down() returns 0 for start, and that 533 * turns it into 0x100000000ULL. 534 * In the bottom-up case, round_up(x, 0) returns 0 though too, which 535 * needs to be taken into consideration by the code below. 536 */ 537 return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); 538 } 539 540 /** 541 * memory_map_top_down - Map [map_start, map_end) top down 542 * @map_start: start address of the target memory range 543 * @map_end: end address of the target memory range 544 * 545 * This function will setup direct mapping for memory range 546 * [map_start, map_end) in top-down. That said, the page tables 547 * will be allocated at the end of the memory, and we map the 548 * memory in top-down. 
549 */ 550 static void __init memory_map_top_down(unsigned long map_start, 551 unsigned long map_end) 552 { 553 unsigned long real_end, start, last_start; 554 unsigned long step_size; 555 unsigned long addr; 556 unsigned long mapped_ram_size = 0; 557 558 /* xen has big range in reserved near end of ram, skip it at first.*/ 559 addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); 560 real_end = addr + PMD_SIZE; 561 562 /* step_size need to be small so pgt_buf from BRK could cover it */ 563 step_size = PMD_SIZE; 564 max_pfn_mapped = 0; /* will get exact value next */ 565 min_pfn_mapped = real_end >> PAGE_SHIFT; 566 last_start = start = real_end; 567 568 /* 569 * We start from the top (end of memory) and go to the bottom. 570 * The memblock_find_in_range() gets us a block of RAM from the 571 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages 572 * for page table. 573 */ 574 while (last_start > map_start) { 575 if (last_start > step_size) { 576 start = round_down(last_start - 1, step_size); 577 if (start < map_start) 578 start = map_start; 579 } else 580 start = map_start; 581 mapped_ram_size += init_range_memory_mapping(start, 582 last_start); 583 last_start = start; 584 min_pfn_mapped = last_start >> PAGE_SHIFT; 585 if (mapped_ram_size >= step_size) 586 step_size = get_new_step_size(step_size); 587 } 588 589 if (real_end < map_end) 590 init_range_memory_mapping(real_end, map_end); 591 } 592 593 /** 594 * memory_map_bottom_up - Map [map_start, map_end) bottom up 595 * @map_start: start address of the target memory range 596 * @map_end: end address of the target memory range 597 * 598 * This function will setup direct mapping for memory range 599 * [map_start, map_end) in bottom-up. Since we have limited the 600 * bottom-up allocation above the kernel, the page tables will 601 * be allocated just above the kernel and we map the memory 602 * in [map_start, map_end) in bottom-up. 
603 */ 604 static void __init memory_map_bottom_up(unsigned long map_start, 605 unsigned long map_end) 606 { 607 unsigned long next, start; 608 unsigned long mapped_ram_size = 0; 609 /* step_size need to be small so pgt_buf from BRK could cover it */ 610 unsigned long step_size = PMD_SIZE; 611 612 start = map_start; 613 min_pfn_mapped = start >> PAGE_SHIFT; 614 615 /* 616 * We start from the bottom (@map_start) and go to the top (@map_end). 617 * The memblock_find_in_range() gets us a block of RAM from the 618 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages 619 * for page table. 620 */ 621 while (start < map_end) { 622 if (step_size && map_end - start > step_size) { 623 next = round_up(start + 1, step_size); 624 if (next > map_end) 625 next = map_end; 626 } else { 627 next = map_end; 628 } 629 630 mapped_ram_size += init_range_memory_mapping(start, next); 631 start = next; 632 633 if (mapped_ram_size >= step_size) 634 step_size = get_new_step_size(step_size); 635 } 636 } 637 638 void __init init_mem_mapping(void) 639 { 640 unsigned long end; 641 642 pti_check_boottime_disable(); 643 probe_page_size_mask(); 644 setup_pcid(); 645 646 #ifdef CONFIG_X86_64 647 end = max_pfn << PAGE_SHIFT; 648 #else 649 end = max_low_pfn << PAGE_SHIFT; 650 #endif 651 652 /* the ISA range is always mapped regardless of memory holes */ 653 init_memory_mapping(0, ISA_END_ADDRESS); 654 655 /* Init the trampoline, possibly with KASLR memory offset */ 656 init_trampoline(); 657 658 /* 659 * If the allocation is in bottom-up direction, we setup direct mapping 660 * in bottom-up, otherwise we setup direct mapping in top-down. 661 */ 662 if (memblock_bottom_up()) { 663 unsigned long kernel_end = __pa_symbol(_end); 664 665 /* 666 * we need two separate calls here. This is because we want to 667 * allocate page tables above the kernel. So we first map 668 * [kernel_end, end) to make memory above the kernel be mapped 669 * as soon as possible. 
And then use page tables allocated above 670 * the kernel to map [ISA_END_ADDRESS, kernel_end). 671 */ 672 memory_map_bottom_up(kernel_end, end); 673 memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); 674 } else { 675 memory_map_top_down(ISA_END_ADDRESS, end); 676 } 677 678 #ifdef CONFIG_X86_64 679 if (max_pfn > max_low_pfn) { 680 /* can we preseve max_low_pfn ?*/ 681 max_low_pfn = max_pfn; 682 } 683 #else 684 early_ioremap_page_table_range_init(); 685 #endif 686 687 load_cr3(swapper_pg_dir); 688 __flush_tlb_all(); 689 690 x86_init.hyper.init_mem_mapping(); 691 692 early_memtest(0, max_pfn_mapped << PAGE_SHIFT); 693 } 694 695 /* 696 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 697 * is valid. The argument is a physical page number. 698 * 699 * On x86, access has to be given to the first megabyte of RAM because that 700 * area traditionally contains BIOS code and data regions used by X, dosemu, 701 * and similar apps. Since they map the entire memory range, the whole range 702 * must be allowed (for mapping), but any areas that would otherwise be 703 * disallowed are flagged as being "zero filled" instead of rejected. 704 * Access has to be given to non-kernel-ram areas as well, these contain the 705 * PCI mmio resources as well as potential bios/acpi data regions. 706 */ 707 int devmem_is_allowed(unsigned long pagenr) 708 { 709 if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, 710 IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) 711 != REGION_DISJOINT) { 712 /* 713 * For disallowed memory regions in the low 1MB range, 714 * request that the page be shown as all zeros. 715 */ 716 if (pagenr < 256) 717 return 2; 718 719 return 0; 720 } 721 722 /* 723 * This must follow RAM test, since System RAM is considered a 724 * restricted resource under CONFIG_STRICT_IOMEM. 725 */ 726 if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { 727 /* Low 1MB bypasses iomem restrictions. 
*/ 728 if (pagenr < 256) 729 return 1; 730 731 return 0; 732 } 733 734 return 1; 735 } 736 737 void free_init_pages(char *what, unsigned long begin, unsigned long end) 738 { 739 unsigned long begin_aligned, end_aligned; 740 741 /* Make sure boundaries are page aligned */ 742 begin_aligned = PAGE_ALIGN(begin); 743 end_aligned = end & PAGE_MASK; 744 745 if (WARN_ON(begin_aligned != begin || end_aligned != end)) { 746 begin = begin_aligned; 747 end = end_aligned; 748 } 749 750 if (begin >= end) 751 return; 752 753 /* 754 * If debugging page accesses then do not free this memory but 755 * mark them not present - any buggy init-section access will 756 * create a kernel page fault: 757 */ 758 if (debug_pagealloc_enabled()) { 759 pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", 760 begin, end - 1); 761 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 762 } else { 763 /* 764 * We just marked the kernel text read only above, now that 765 * we are going to free part of that, we need to make that 766 * writeable and non-executable first. 
767 */ 768 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); 769 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 770 771 free_reserved_area((void *)begin, (void *)end, 772 POISON_FREE_INITMEM, what); 773 } 774 } 775 776 void __ref free_initmem(void) 777 { 778 e820__reallocate_tables(); 779 780 free_init_pages("unused kernel", 781 (unsigned long)(&__init_begin), 782 (unsigned long)(&__init_end)); 783 } 784 785 #ifdef CONFIG_BLK_DEV_INITRD 786 void __init free_initrd_mem(unsigned long start, unsigned long end) 787 { 788 /* 789 * end could be not aligned, and We can not align that, 790 * decompresser could be confused by aligned initrd_end 791 * We already reserve the end partial page before in 792 * - i386_start_kernel() 793 * - x86_64_start_kernel() 794 * - relocate_initrd() 795 * So here We can do PAGE_ALIGN() safely to get partial page to be freed 796 */ 797 free_init_pages("initrd", start, PAGE_ALIGN(end)); 798 } 799 #endif 800 801 /* 802 * Calculate the precise size of the DMA zone (first 16 MB of RAM), 803 * and pass it to the MM layer - to help it set zone watermarks more 804 * accurately. 805 * 806 * Done on 64-bit systems only for the time being, although 32-bit systems 807 * might benefit from this as well. 
808 */ 809 void __init memblock_find_dma_reserve(void) 810 { 811 #ifdef CONFIG_X86_64 812 u64 nr_pages = 0, nr_free_pages = 0; 813 unsigned long start_pfn, end_pfn; 814 phys_addr_t start_addr, end_addr; 815 int i; 816 u64 u; 817 818 /* 819 * Iterate over all memory ranges (free and reserved ones alike), 820 * to calculate the total number of pages in the first 16 MB of RAM: 821 */ 822 nr_pages = 0; 823 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 824 start_pfn = min(start_pfn, MAX_DMA_PFN); 825 end_pfn = min(end_pfn, MAX_DMA_PFN); 826 827 nr_pages += end_pfn - start_pfn; 828 } 829 830 /* 831 * Iterate over free memory ranges to calculate the number of free 832 * pages in the DMA zone, while not counting potential partial 833 * pages at the beginning or the end of the range: 834 */ 835 nr_free_pages = 0; 836 for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { 837 start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN); 838 end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN); 839 840 if (start_pfn < end_pfn) 841 nr_free_pages += end_pfn - start_pfn; 842 } 843 844 set_dma_reserve(nr_pages - nr_free_pages); 845 #endif 846 } 847 848 void __init zone_sizes_init(void) 849 { 850 unsigned long max_zone_pfns[MAX_NR_ZONES]; 851 852 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 853 854 #ifdef CONFIG_ZONE_DMA 855 max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); 856 #endif 857 #ifdef CONFIG_ZONE_DMA32 858 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); 859 #endif 860 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 861 #ifdef CONFIG_HIGHMEM 862 max_zone_pfns[ZONE_HIGHMEM] = max_pfn; 863 #endif 864 865 free_area_init_nodes(max_zone_pfns); 866 } 867 868 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 869 .loaded_mm = &init_mm, 870 .next_asid = 1, 871 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 872 }; 873 
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); 874 875 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) 876 { 877 /* entry 0 MUST be WB (hardwired to speed up translations) */ 878 BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); 879 880 __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); 881 __pte2cachemode_tbl[entry] = cache; 882 } 883