// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm_init.c - Memory initialisation verification and debugging
 *
 * Copyright 2008 IBM Corporation, 2008
 * Author Mel Gorman <mel@csn.ul.ie>
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/padata.h>
#include <linux/nmi.h>
#include <linux/buffer_head.h>
#include <linux/kmemleak.h>
#include <linux/kfence.h>
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
#include <linux/stackdepot.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
#include <linux/kexec_handover.h>
#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"

#include <asm/setup.h>

/* These two globals are only defined here when CONFIG_NUMA is not set. */
#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/* Written during init only (__ro_after_init); not assigned in this part of the file. */
unsigned long zero_page_pfn __ro_after_init;
EXPORT_SYMBOL(zero_page_pfn);

/* Generic zero page, provided unless the architecture defines __HAVE_COLOR_ZERO_PAGE. */
#ifndef __HAVE_COLOR_ZERO_PAGE
const uint8_t empty_zero_page[PAGE_SIZE] __aligned(PAGE_SIZE);
EXPORT_SYMBOL(empty_zero_page);

struct page *__zero_page __ro_after_init;
EXPORT_SYMBOL(__zero_page);
#endif /* __HAVE_COLOR_ZERO_PAGE */

#ifdef CONFIG_DEBUG_MEMORY_INIT
/* Verbosity threshold compared against by the mminit verification helpers below. */
int __meminitdata mminit_loglevel;

/* The zonelists are simply reported, validation is manual.
*/ 71 void __init mminit_verify_zonelist(void) 72 { 73 int nid; 74 75 if (mminit_loglevel < MMINIT_VERIFY) 76 return; 77 78 for_each_online_node(nid) { 79 pg_data_t *pgdat = NODE_DATA(nid); 80 struct zone *zone; 81 struct zoneref *z; 82 struct zonelist *zonelist; 83 int i, listid, zoneid; 84 85 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { 86 87 /* Identify the zone and nodelist */ 88 zoneid = i % MAX_NR_ZONES; 89 listid = i / MAX_NR_ZONES; 90 zonelist = &pgdat->node_zonelists[listid]; 91 zone = &pgdat->node_zones[zoneid]; 92 if (!populated_zone(zone)) 93 continue; 94 95 /* Print information about the zonelist */ 96 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", 97 listid > 0 ? "thisnode" : "general", nid, 98 zone->name); 99 100 /* Iterate the zonelist */ 101 for_each_zone_zonelist(zone, z, zonelist, zoneid) 102 pr_cont("%d:%s ", zone_to_nid(zone), zone->name); 103 pr_cont("\n"); 104 } 105 } 106 } 107 108 void __init mminit_verify_pageflags_layout(void) 109 { 110 int shift, width; 111 unsigned long or_mask, add_mask; 112 113 shift = BITS_PER_LONG; 114 width = shift - NR_NON_PAGEFLAG_BITS; 115 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 116 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", 117 SECTIONS_WIDTH, 118 NODES_WIDTH, 119 ZONES_WIDTH, 120 LAST_CPUPID_WIDTH, 121 KASAN_TAG_WIDTH, 122 LRU_GEN_WIDTH, 123 LRU_REFS_WIDTH, 124 NR_PAGEFLAGS); 125 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 126 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", 127 SECTIONS_SHIFT, 128 NODES_SHIFT, 129 ZONES_SHIFT, 130 LAST_CPUPID_SHIFT, 131 KASAN_TAG_WIDTH); 132 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", 133 "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", 134 (unsigned long)SECTIONS_PGSHIFT, 135 (unsigned long)NODES_PGSHIFT, 136 (unsigned long)ZONES_PGSHIFT, 137 (unsigned long)LAST_CPUPID_PGSHIFT, 138 (unsigned long)KASAN_TAG_PGSHIFT); 139 mminit_dprintk(MMINIT_TRACE, 
"pageflags_layout_nodezoneid", 140 "Node/Zone ID: %lu -> %lu\n", 141 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), 142 (unsigned long)ZONEID_PGOFF); 143 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", 144 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", 145 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); 146 #ifdef NODE_NOT_IN_PAGE_FLAGS 147 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 148 "Node not in page flags"); 149 #endif 150 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 151 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 152 "Last cpupid not in page flags"); 153 #endif 154 155 if (SECTIONS_WIDTH) { 156 shift -= SECTIONS_WIDTH; 157 BUG_ON(shift != SECTIONS_PGSHIFT); 158 } 159 if (NODES_WIDTH) { 160 shift -= NODES_WIDTH; 161 BUG_ON(shift != NODES_PGSHIFT); 162 } 163 if (ZONES_WIDTH) { 164 shift -= ZONES_WIDTH; 165 BUG_ON(shift != ZONES_PGSHIFT); 166 } 167 168 /* Check for bitmask overlaps */ 169 or_mask = (ZONES_MASK << ZONES_PGSHIFT) | 170 (NODES_MASK << NODES_PGSHIFT) | 171 (SECTIONS_MASK << SECTIONS_PGSHIFT); 172 add_mask = (ZONES_MASK << ZONES_PGSHIFT) + 173 (NODES_MASK << NODES_PGSHIFT) + 174 (SECTIONS_MASK << SECTIONS_PGSHIFT); 175 BUG_ON(or_mask != add_mask); 176 } 177 178 static __init int set_mminit_loglevel(char *str) 179 { 180 get_option(&str, &mminit_loglevel); 181 return 0; 182 } 183 early_param("mminit_loglevel", set_mminit_loglevel); 184 #endif /* CONFIG_DEBUG_MEMORY_INIT */ 185 186 struct kobject *mm_kobj; 187 188 #ifdef CONFIG_SMP 189 s32 vm_committed_as_batch = 32; 190 191 void mm_compute_batch(int overcommit_policy) 192 { 193 u64 memsized_batch; 194 s32 nr = num_present_cpus(); 195 s32 batch = max_t(s32, nr*2, 32); 196 unsigned long ram_pages = totalram_pages(); 197 198 /* 199 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of 200 * (total memory/#cpus), and lift it to 25% for other policies 201 * to ease the possible lock contention for percpu_counter 202 * vm_committed_as, while the max 
limit is INT_MAX 203 */ 204 if (overcommit_policy == OVERCOMMIT_NEVER) 205 memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX); 206 else 207 memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX); 208 209 vm_committed_as_batch = max_t(s32, memsized_batch, batch); 210 } 211 212 static int __meminit mm_compute_batch_notifier(struct notifier_block *self, 213 unsigned long action, void *arg) 214 { 215 switch (action) { 216 case MEM_ONLINE: 217 case MEM_OFFLINE: 218 mm_compute_batch(sysctl_overcommit_memory); 219 break; 220 default: 221 break; 222 } 223 return NOTIFY_OK; 224 } 225 226 static int __init mm_compute_batch_init(void) 227 { 228 mm_compute_batch(sysctl_overcommit_memory); 229 hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); 230 return 0; 231 } 232 233 __initcall(mm_compute_batch_init); 234 235 #endif 236 237 static int __init mm_sysfs_init(void) 238 { 239 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 240 if (!mm_kobj) 241 return -ENOMEM; 242 243 return 0; 244 } 245 postcore_initcall(mm_sysfs_init); 246 247 static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; 248 static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; 249 static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; 250 251 static unsigned long required_kernelcore __initdata; 252 static unsigned long required_kernelcore_percent __initdata; 253 static unsigned long required_movablecore __initdata; 254 static unsigned long required_movablecore_percent __initdata; 255 256 static unsigned long nr_kernel_pages __initdata; 257 static unsigned long nr_all_pages __initdata; 258 259 static bool deferred_struct_pages __meminitdata; 260 261 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 262 263 static int __init cmdline_parse_core(char *p, unsigned long *core, 264 unsigned long *percent) 265 { 266 unsigned long long coremem; 267 char *endptr; 268 269 if (!p) 270 return -EINVAL; 271 272 /* Value may be a 
percentage of total memory, otherwise bytes */ 273 coremem = simple_strtoull(p, &endptr, 0); 274 if (*endptr == '%') { 275 /* Paranoid check for percent values greater than 100 */ 276 WARN_ON(coremem > 100); 277 278 *percent = coremem; 279 } else { 280 coremem = memparse(p, &p); 281 /* Paranoid check that UL is enough for the coremem value */ 282 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 283 284 *core = coremem >> PAGE_SHIFT; 285 *percent = 0UL; 286 } 287 return 0; 288 } 289 290 bool mirrored_kernelcore __initdata_memblock; 291 292 /* 293 * kernelcore=size sets the amount of memory for use for allocations that 294 * cannot be reclaimed or migrated. 295 */ 296 static int __init cmdline_parse_kernelcore(char *p) 297 { 298 /* parse kernelcore=mirror */ 299 if (parse_option_str(p, "mirror")) { 300 mirrored_kernelcore = true; 301 return 0; 302 } 303 304 return cmdline_parse_core(p, &required_kernelcore, 305 &required_kernelcore_percent); 306 } 307 early_param("kernelcore", cmdline_parse_kernelcore); 308 309 /* 310 * movablecore=size sets the amount of memory for use for allocations that 311 * can be reclaimed or migrated. 312 */ 313 static int __init cmdline_parse_movablecore(char *p) 314 { 315 return cmdline_parse_core(p, &required_movablecore, 316 &required_movablecore_percent); 317 } 318 early_param("movablecore", cmdline_parse_movablecore); 319 320 /* 321 * early_calculate_totalpages() 322 * Sum pages in active regions for movable zone. 323 * Populate N_MEMORY for calculating usable_nodes. 324 */ 325 static unsigned long __init early_calculate_totalpages(void) 326 { 327 unsigned long totalpages = 0; 328 unsigned long start_pfn, end_pfn; 329 int i, nid; 330 331 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 332 unsigned long pages = end_pfn - start_pfn; 333 334 totalpages += pages; 335 if (pages) 336 node_set_state(nid, N_MEMORY); 337 } 338 return totalpages; 339 } 340 341 /* 342 * This finds a zone that can be used for ZONE_MOVABLE pages. 
The assumption is made that zones within a node are ordered in monotonic
 * increasing memory addresses so that the "highest" populated zone is used
 *
 * The search runs from the highest zone index downwards, skips ZONE_MOVABLE
 * itself, and stops at the first zone whose arch-provided highest pfn is
 * above its lowest pfn. The result is stored in movable_zone.
 */
static void __init find_usable_zone_for_movable(void)
{
	int zone_index;
	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
		if (zone_index == ZONE_MOVABLE)
			continue;

		if (arch_zone_highest_possible_pfn[zone_index] >
				arch_zone_lowest_possible_pfn[zone_index])
			break;
	}

	/* The loop fell through: no zone has a non-empty pfn range */
	VM_BUG_ON(zone_index == -1);
	movable_zone = zone_index;
}

/*
 * Find the PFN the Movable zone begins in each node. Kernel memory
 * is spread evenly between nodes as long as the nodes have enough
 * memory. When they don't, some nodes will have more kernelcore than
 * others
 *
 * Three mutually exclusive modes are handled, in this order:
 *  - movable_node: ZONE_MOVABLE starts at the lowest hotpluggable region
 *    of each node;
 *  - kernelcore=mirror: ZONE_MOVABLE starts at the lowest non-mirrored
 *    region at or above 4G of each node;
 *  - kernelcore=/movablecore= sizes: kernelcore is distributed over the
 *    nodes and ZONE_MOVABLE starts where each node's share ends.
 *
 * Results are written to zone_movable_pfn[]; an entry of 0 means the node
 * has no ZONE_MOVABLE. node_states[N_MEMORY] is modified by
 * early_calculate_totalpages() and restored before returning.
 */
static void __init find_zone_movable_pfns_for_nodes(void)
{
	int i, nid;
	unsigned long usable_startpfn;
	unsigned long kernelcore_node, kernelcore_remaining;
	/* save the state before borrowing the nodemask */
	nodemask_t saved_node_state = node_states[N_MEMORY];
	unsigned long totalpages = early_calculate_totalpages();
	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
	struct memblock_region *r;

	/* Need to find movable_zone earlier when movable_node is specified. */
	find_usable_zone_for_movable();

	/*
	 * If movable_node is specified, ignore kernelcore and movablecore
	 * options.
	 */
	if (movable_node_is_enabled()) {
		for_each_mem_region(r) {
			if (!memblock_is_hotpluggable(r))
				continue;

			nid = memblock_get_region_node(r);

			/* Keep the lowest hotpluggable base pfn seen per node */
			usable_startpfn = memblock_region_memory_base_pfn(r);
			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		goto out2;
	}

	/*
	 * If kernelcore=mirror is specified, ignore movablecore option
	 */
	if (mirrored_kernelcore) {
		bool mem_below_4gb_not_mirrored = false;

		if (!memblock_has_mirror()) {
			pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
			goto out;
		}

		if (is_kdump_kernel()) {
			pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
			goto out;
		}

		for_each_mem_region(r) {
			if (memblock_is_mirror(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);

			/*
			 * Non-mirrored regions starting below 4G are not made
			 * movable; only remember that they exist for the
			 * warning below.
			 */
			if (usable_startpfn < PHYS_PFN(SZ_4G)) {
				mem_below_4gb_not_mirrored = true;
				continue;
			}

			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		if (mem_below_4gb_not_mirrored)
			pr_warn("This configuration results in unmirrored kernel memory.\n");

		goto out2;
	}

	/*
	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
	 * amount of necessary memory.
	 */
	if (required_kernelcore_percent)
		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				       10000UL;
	if (required_movablecore_percent)
		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
					10000UL;

	/*
	 * If movablecore= was specified, calculate what size of
	 * kernelcore that corresponds so that memory usable for
	 * any allocation type is evenly spread. If both kernelcore
	 * and movablecore are specified, then the value of kernelcore
	 * will be used for required_kernelcore if it's greater than
	 * what movablecore would have allowed.
	 */
	if (required_movablecore) {
		unsigned long corepages;

		/*
		 * Round-up so that ZONE_MOVABLE is at least as large as what
		 * was requested by the user
		 */
		required_movablecore =
			round_up(required_movablecore, MAX_ORDER_NR_PAGES);
		required_movablecore = min(totalpages, required_movablecore);
		corepages = totalpages - required_movablecore;

		required_kernelcore = max(required_kernelcore, corepages);
	}

	/*
	 * If kernelcore was not specified or kernelcore size is not smaller
	 * than totalpages, there is no ZONE_MOVABLE.
	 */
	if (!required_kernelcore || required_kernelcore >= totalpages)
		goto out;

	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
	/* Spread kernelcore memory as evenly as possible throughout nodes */
	kernelcore_node = required_kernelcore / usable_nodes;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Recalculate kernelcore_node if the division per node
		 * now exceeds what is necessary to satisfy the requested
		 * amount of memory for the kernel
		 */
		if (required_kernelcore < kernelcore_node)
			kernelcore_node = required_kernelcore / usable_nodes;

		/*
		 * As the map is walked, we track how much memory is usable
		 * by the kernel using kernelcore_remaining. When it is
		 * 0, the rest of the node is usable by ZONE_MOVABLE
		 */
		kernelcore_remaining = kernelcore_node;

		/* Go through each range of PFNs within this node */
		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			unsigned long size_pages;

			/* Skip what earlier passes already assigned to kernelcore */
			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
			if (start_pfn >= end_pfn)
				continue;

			/* Account for what is only usable for kernelcore */
			if (start_pfn < usable_startpfn) {
				unsigned long kernel_pages;
				kernel_pages = min(end_pfn, usable_startpfn)
								- start_pfn;

				/* min() keeps both counters from wrapping below zero */
				kernelcore_remaining -= min(kernel_pages,
							kernelcore_remaining);
				required_kernelcore -= min(kernel_pages,
							required_kernelcore);

				/* Continue if range is now fully accounted */
				if (end_pfn <= usable_startpfn) {

					/*
					 * Push zone_movable_pfn to the end so
					 * that if we have to rebalance
					 * kernelcore across nodes, we will
					 * not double account here
					 */
					zone_movable_pfn[nid] = end_pfn;
					continue;
				}
				start_pfn = usable_startpfn;
			}

			/*
			 * The usable PFN range for ZONE_MOVABLE is from
			 * start_pfn->end_pfn. Calculate size_pages as the
			 * number of pages used as kernelcore
			 */
			size_pages = end_pfn - start_pfn;
			if (size_pages > kernelcore_remaining)
				size_pages = kernelcore_remaining;
			zone_movable_pfn[nid] = start_pfn + size_pages;

			/*
			 * Some kernelcore has been met, update counts and
			 * break if the kernelcore for this node has been
			 * satisfied
			 */
			required_kernelcore -= min(required_kernelcore,
								size_pages);
			kernelcore_remaining -= size_pages;
			if (!kernelcore_remaining)
				break;
		}
	}

	/*
	 * If there is still required_kernelcore, we do another pass with one
	 * less node in the count. This will push zone_movable_pfn[nid] further
	 * along on the nodes that still have memory until kernelcore is
	 * satisfied
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > usable_nodes)
		goto restart;

out2:
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		zone_movable_pfn[nid] =
			round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

		/* A start at or past the node's end means no ZONE_MOVABLE here */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (zone_movable_pfn[nid] >= end_pfn)
			zone_movable_pfn[nid] = 0;
	}

out:
	/* restore the node_state */
	node_states[N_MEMORY] = saved_node_state;
}

/*
 * Bring one struct page into its initial state: zero it, record its
 * zone/node/pfn links, initialise the reference count, set _mapcount to -1,
 * reset the last-cpupid and KASAN tag fields and initialise the lru list
 * head. With WANT_PAGE_VIRTUAL the kernel virtual address is recorded too
 * for non-highmem zones.
 */
void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	atomic_set(&page->_mapcount, -1);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

#ifdef CONFIG_NUMA
/*
 * During memory init memblocks map pfns to nids. The search is expensive and
 * this caches recent lookups. The implementation of __early_pfn_to_nid
 * treats start/end as pfns.
 */
struct mminit_pfnnid_cache {
	unsigned long last_start;	/* first pfn of the cached range */
	unsigned long last_end;		/* end pfn of the cached range (exclusive) */
	int last_nid;			/* node the cached range belongs to */
};

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
626 */ 627 static int __meminit __early_pfn_to_nid(unsigned long pfn, 628 struct mminit_pfnnid_cache *state) 629 { 630 unsigned long start_pfn, end_pfn; 631 int nid; 632 633 if (state->last_start <= pfn && pfn < state->last_end) 634 return state->last_nid; 635 636 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 637 if (nid != NUMA_NO_NODE) { 638 state->last_start = start_pfn; 639 state->last_end = end_pfn; 640 state->last_nid = nid; 641 } 642 643 return nid; 644 } 645 646 int __meminit early_pfn_to_nid(unsigned long pfn) 647 { 648 static DEFINE_SPINLOCK(early_pfn_lock); 649 int nid; 650 651 spin_lock(&early_pfn_lock); 652 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 653 if (nid < 0) 654 nid = first_online_node; 655 spin_unlock(&early_pfn_lock); 656 657 return nid; 658 } 659 660 bool hashdist = HASHDIST_DEFAULT; 661 662 static int __init set_hashdist(char *str) 663 { 664 return kstrtobool(str, &hashdist) == 0; 665 } 666 __setup("hashdist=", set_hashdist); 667 668 static inline void fixup_hashdist(void) 669 { 670 if (num_node_state(N_MEMORY) == 1) 671 hashdist = false; 672 } 673 #else 674 static inline void fixup_hashdist(void) {} 675 #endif /* CONFIG_NUMA */ 676 677 #ifdef CONFIG_ZONE_DEVICE 678 static __meminit void pageblock_migratetype_init_range(unsigned long pfn, 679 unsigned long nr_pages, int migratetype) 680 { 681 const unsigned long end = pfn + nr_pages; 682 683 for (pfn = pageblock_align(pfn); pfn < end; pfn += pageblock_nr_pages) { 684 init_pageblock_migratetype(pfn_to_page(pfn), migratetype, false); 685 if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) 686 cond_resched(); 687 } 688 } 689 #endif 690 691 /* 692 * Initialize a reserved page unconditionally, finding its zone first. 
693 */ 694 void __meminit __init_page_from_nid(unsigned long pfn, int nid) 695 { 696 pg_data_t *pgdat; 697 int zid; 698 699 pgdat = NODE_DATA(nid); 700 701 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 702 struct zone *zone = &pgdat->node_zones[zid]; 703 704 if (zone_spans_pfn(zone, pfn)) 705 break; 706 } 707 __init_single_page(pfn_to_page(pfn), pfn, zid, nid); 708 709 if (pageblock_aligned(pfn)) { 710 enum migratetype mt = 711 kho_scratch_migratetype(pfn, MIGRATE_MOVABLE); 712 init_pageblock_migratetype(pfn_to_page(pfn), mt, false); 713 } 714 } 715 716 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 717 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 718 { 719 pgdat->first_deferred_pfn = ULONG_MAX; 720 } 721 722 /* Returns true if the struct page for the pfn is initialised */ 723 static inline bool __meminit early_page_initialised(unsigned long pfn, int nid) 724 { 725 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) 726 return false; 727 728 return true; 729 } 730 731 /* 732 * Returns true when the remaining initialisation should be deferred until 733 * later in the boot cycle when it can be parallelised. 734 */ 735 static bool __meminit 736 defer_init(int nid, unsigned long pfn, unsigned long end_pfn) 737 { 738 static unsigned long prev_end_pfn, nr_initialised; 739 740 if (early_page_ext_enabled()) 741 return false; 742 743 /* Always populate low zones for address-constrained allocations */ 744 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) 745 return false; 746 747 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) 748 return true; 749 750 /* 751 * prev_end_pfn static that contains the end of previous zone 752 * No need to protect because called very early in boot before smp_init. 753 */ 754 if (prev_end_pfn != end_pfn) { 755 prev_end_pfn = end_pfn; 756 nr_initialised = 0; 757 } 758 759 /* 760 * We start only with one section of pages, more pages are added as 761 * needed until the rest of deferred pages are initialized. 
762 */ 763 nr_initialised++; 764 if ((nr_initialised > PAGES_PER_SECTION) && 765 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 766 NODE_DATA(nid)->first_deferred_pfn = pfn; 767 return true; 768 } 769 return false; 770 } 771 772 static void __meminit __init_deferred_page(unsigned long pfn, int nid) 773 { 774 if (early_page_initialised(pfn, nid)) 775 return; 776 777 __init_page_from_nid(pfn, nid); 778 } 779 #else 780 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} 781 782 static inline bool early_page_initialised(unsigned long pfn, int nid) 783 { 784 return true; 785 } 786 787 static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) 788 { 789 return false; 790 } 791 792 static inline void __init_deferred_page(unsigned long pfn, int nid) 793 { 794 } 795 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 796 797 void __meminit init_deferred_page(unsigned long pfn, int nid) 798 { 799 __init_deferred_page(pfn, nid); 800 } 801 802 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ 803 static bool __meminit 804 overlap_memmap_init(unsigned long zone, unsigned long *pfn) 805 { 806 static struct memblock_region *r __meminitdata; 807 808 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 809 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { 810 for_each_mem_region(r) { 811 if (*pfn < memblock_region_memory_end_pfn(r)) 812 break; 813 } 814 } 815 if (*pfn >= memblock_region_memory_base_pfn(r) && 816 memblock_is_mirror(r)) { 817 *pfn = memblock_region_memory_end_pfn(r); 818 return true; 819 } 820 } 821 return false; 822 } 823 824 /* 825 * Only struct pages that correspond to ranges defined by memblock.memory 826 * are zeroed and initialized by going through __init_single_page() during 827 * memmap_init_zone_range(). 828 * 829 * But, there could be struct pages that correspond to holes in 830 * memblock.memory. 
This can happen because of the following reasons: 831 * - physical memory bank size is not necessarily the exact multiple of the 832 * arbitrary section size 833 * - early reserved memory may not be listed in memblock.memory 834 * - non-memory regions covered by the contiguous flatmem mapping 835 * - memory layouts defined with memmap= kernel parameter may not align 836 * nicely with memmap sections 837 * 838 * Explicitly initialize those struct pages so that: 839 * - PG_Reserved is set 840 * - zone and node links point to zone and node that span the page if the 841 * hole is in the middle of a zone 842 * - zone and node links point to adjacent zone/node if the hole falls on 843 * the zone boundary; the pages in such holes will be prepended to the 844 * zone/node above the hole except for the trailing pages in the last 845 * section that will be appended to the zone/node below. 846 */ 847 static void __init init_unavailable_range(unsigned long spfn, 848 unsigned long epfn, 849 int zone, int node) 850 { 851 unsigned long pfn; 852 u64 pgcnt = 0; 853 854 for_each_valid_pfn(pfn, spfn, epfn) { 855 __init_single_page(pfn_to_page(pfn), pfn, zone, node); 856 __SetPageReserved(pfn_to_page(pfn)); 857 pgcnt++; 858 } 859 860 if (pgcnt) 861 pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n", 862 node, zone_names[zone], pgcnt); 863 } 864 865 /* 866 * Initially all pages are reserved - free ones are freed 867 * up by memblock_free_all() once the early boot process is 868 * done. Non-atomic initialization, single-pass. 869 * 870 * All aligned pageblocks are initialized to the specified migratetype 871 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 872 * zone stats (e.g., nr_isolate_pageblock) are touched. 
*/
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, unsigned long zone_end_pfn,
		enum meminit_context context,
		struct vmem_altmap *altmap, int migratetype,
		bool isolate_pageblock)
{
	unsigned long pfn, end_pfn = start_pfn + size;
	struct page *page;

	/* Track the highest pfn that has a memmap entry */
	if (highest_memmap_pfn < end_pfn - 1)
		highest_memmap_pfn = end_pfn - 1;

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Honor reservation requested by the driver for this ZONE_DEVICE
	 * memory. We limit the total number of pages to initialize to just
	 * those that might contain the memory mapping. We will defer the
	 * ZONE_DEVICE page initialization until after we have released
	 * the hotplug lock.
	 */
	if (zone == ZONE_DEVICE) {
		/* Without an altmap nothing is initialised here at all */
		if (!altmap)
			return;

		if (start_pfn == altmap->base_pfn)
			start_pfn += altmap->reserve;
		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
	}
#endif

	/* pfn is advanced at the bottom of the body or by overlap_memmap_init() */
	for (pfn = start_pfn; pfn < end_pfn; ) {
		/*
		 * There can be holes in boot-time mem_map[]s handed to this
		 * function. They do not exist on hotplugged memory.
		 */
		if (context == MEMINIT_EARLY) {
			/* Mirrored region under ZONE_MOVABLE: pfn was moved past it */
			if (overlap_memmap_init(zone, &pfn))
				continue;
			/* The rest of this range is left for deferred init */
			if (defer_init(nid, pfn, zone_end_pfn)) {
				deferred_struct_pages = true;
				break;
			}
		}

		page = pfn_to_page(pfn);
		__init_single_page(page, pfn, zone, nid);
		/* Hotplugged pages are marked reserved (ZONE_DEVICE) or offline */
		if (context == MEMINIT_HOTPLUG) {
#ifdef CONFIG_ZONE_DEVICE
			if (zone == ZONE_DEVICE)
				__SetPageReserved(page);
			else
#endif
				__SetPageOffline(page);
		}

		/*
		 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
		 * such that unmovable allocations won't be scattered all
		 * over the place during system boot.
		 */
		if (pageblock_aligned(pfn)) {
			init_pageblock_migratetype(page, migratetype,
					isolate_pageblock);
			cond_resched();
		}
		pfn++;
	}
}

/*
 * Initialise the part of [start_pfn, end_pfn) that falls inside @zone, then
 * initialise any hole between *hole_pfn and the clamped start as unavailable
 * pages of the same zone/node. On return *hole_pfn is the clamped end_pfn.
 * Nothing happens, and *hole_pfn is unchanged, when the range does not
 * intersect the zone.
 */
static void __init memmap_init_zone_range(struct zone *zone,
					  unsigned long start_pfn,
					  unsigned long end_pfn,
					  unsigned long *hole_pfn,
					  enum migratetype mt)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);

	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);

	if (start_pfn >= end_pfn)
		return;

	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
			  zone_end_pfn, MEMINIT_EARLY, NULL, mt, false);

	if (*hole_pfn < start_pfn)
		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);

	*hole_pfn = end_pfn;
}

/*
 * Initialise the memory map for every memblock memory range, zone by zone,
 * filling the holes between ranges as unavailable pages, and finally the
 * hole after the last range.
 */
static void __init memmap_init(void)
{
	unsigned long start_pfn, end_pfn;
	unsigned long hole_pfn = 0;
	int i, j, zone_id = 0, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		struct pglist_data *node = NODE_DATA(nid);
		enum migratetype mt =
			kho_scratch_migratetype(start_pfn, MIGRATE_MOVABLE);

		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = node->node_zones + j;

			if (!populated_zone(zone))
				continue;

			memmap_init_zone_range(zone, start_pfn, end_pfn,
					       &hole_pfn, mt);
			/* Remember the last populated zone for the trailing hole */
			zone_id = j;
		}
	}

	/*
	 * Initialize the memory map for hole in the range [memory_end,
	 * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
	 * for FLATMEM.
	 * Append the pages in this hole to the highest zone in the last
	 * node.
	 *
	 * NOTE(review): end_pfn and nid carry over from the last loop
	 * iteration and have no initialiser, so this relies on at least one
	 * memory range existing.
	 */
#ifdef CONFIG_SPARSEMEM
	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
#else
	end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
	if (hole_pfn < end_pfn)
		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * Initialise one ZONE_DEVICE struct page: generic init, mark it reserved,
 * attach the owning pgmap, clear zone_device_data and adjust the initial
 * page count according to the pgmap type.
 */
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
					  unsigned long zone_idx, int nid,
					  struct dev_pagemap *pgmap)
{

	__init_single_page(page, pfn, zone_idx, nid);

	/*
	 * Mark page reserved as it will need to wait for onlining
	 * phase for it to be fully associated with a zone.
	 *
	 * We can use the non-atomic __set_bit operation for setting
	 * the flag as we are still initializing the pages.
	 */
	__SetPageReserved(page);

	/*
	 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
	 * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
	 * ever freed or placed on a driver-private list.
	 */
	page_folio(page)->pgmap = pgmap;
	page->zone_device_data = NULL;

	/*
	 * ZONE_DEVICE pages other than MEMORY_DEVICE_GENERIC are released
	 * directly to the driver page allocator which will set the page count
	 * to 1 when allocating the page, so their count is reset to 0 here.
	 *
	 * MEMORY_DEVICE_GENERIC pages keep the count that
	 * __init_single_page() left via init_page_count().
	 */
	switch (pgmap->type) {
	case MEMORY_DEVICE_FS_DAX:
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_COHERENT:
	case MEMORY_DEVICE_PCI_P2PDMA:
		set_page_count(page, 0);
		break;

	case MEMORY_DEVICE_GENERIC:
		break;
	}
}

/*
 * With compound page geometry and when struct pages are stored in ram most
 * tail pages are reused.
Consequently, the amount of unique struct pages to 1057 * initialize is a lot smaller that the total amount of struct pages being 1058 * mapped. This is a paired / mild layering violation with explicit knowledge 1059 * of how the sparse_vmemmap internals handle compound pages in the lack 1060 * of an altmap. See vmemmap_populate_compound_pages(). 1061 */ 1062 static inline unsigned long compound_nr_pages(unsigned long pfn, 1063 struct vmem_altmap *altmap, 1064 struct dev_pagemap *pgmap) 1065 { 1066 /* 1067 * If DAX memory is hot-plugged into an unoccupied subsection 1068 * of an early section, the unoptimized boot memmap is reused. 1069 * See section_activate(). 1070 */ 1071 if (early_section(__pfn_to_section(pfn)) || 1072 !vmemmap_can_optimize(altmap, pgmap)) 1073 return pgmap_vmemmap_nr(pgmap); 1074 1075 return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); 1076 } 1077 1078 static void __ref memmap_init_compound(struct page *head, 1079 unsigned long head_pfn, 1080 unsigned long zone_idx, int nid, 1081 struct dev_pagemap *pgmap, 1082 unsigned long nr_pages) 1083 { 1084 unsigned long pfn, end_pfn = head_pfn + nr_pages; 1085 unsigned int order = pgmap->vmemmap_shift; 1086 1087 /* 1088 * We have to initialize the pages, including setting up page links. 1089 * prep_compound_page() does not take care of that, so instead we 1090 * open-code prep_compound_page() so we can take care of initializing 1091 * the pages in the same go. 
1092 */ 1093 __SetPageHead(head); 1094 for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { 1095 struct page *page = pfn_to_page(pfn); 1096 1097 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); 1098 prep_compound_tail(page, head, order); 1099 set_page_count(page, 0); 1100 } 1101 prep_compound_head(head, order); 1102 } 1103 1104 void __ref memmap_init_zone_device(struct zone *zone, 1105 unsigned long start_pfn, 1106 unsigned long nr_pages, 1107 struct dev_pagemap *pgmap) 1108 { 1109 unsigned long pfn, end_pfn = start_pfn + nr_pages; 1110 struct pglist_data *pgdat = zone->zone_pgdat; 1111 struct vmem_altmap *altmap = pgmap_altmap(pgmap); 1112 unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); 1113 unsigned long zone_idx = zone_idx(zone); 1114 unsigned long start = jiffies; 1115 int nid = pgdat->node_id; 1116 1117 if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) 1118 return; 1119 1120 /* 1121 * The call to memmap_init should have already taken care 1122 * of the pages reserved for the memmap, so we can just jump to 1123 * the end of that region and start processing the device pages. 
1124 */ 1125 if (altmap) { 1126 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); 1127 nr_pages = end_pfn - start_pfn; 1128 } 1129 1130 for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { 1131 struct page *page = pfn_to_page(pfn); 1132 1133 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); 1134 1135 if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) 1136 cond_resched(); 1137 1138 if (pfns_per_compound == 1) 1139 continue; 1140 1141 memmap_init_compound(page, pfn, zone_idx, nid, pgmap, 1142 compound_nr_pages(pfn, altmap, pgmap)); 1143 } 1144 1145 pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE); 1146 1147 pr_debug("%s initialised %lu pages in %ums\n", __func__, 1148 nr_pages, jiffies_to_msecs(jiffies - start)); 1149 } 1150 #endif 1151 1152 /* 1153 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 1154 * because it is sized independent of architecture. Unlike the other zones, 1155 * the starting point for ZONE_MOVABLE is not fixed. It may be different 1156 * in each node depending on the size of each node and how evenly kernelcore 1157 * is distributed. This helper function adjusts the zone ranges 1158 * provided by the architecture for a given node by using the end of the 1159 * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that 1160 * zones within a node are in order of monotonic increases memory addresses 1161 */ 1162 static void __init adjust_zone_range_for_zone_movable(int nid, 1163 unsigned long zone_type, 1164 unsigned long node_end_pfn, 1165 unsigned long *zone_start_pfn, 1166 unsigned long *zone_end_pfn) 1167 { 1168 /* Only adjust if ZONE_MOVABLE is on this node */ 1169 if (zone_movable_pfn[nid]) { 1170 /* Size ZONE_MOVABLE */ 1171 if (zone_type == ZONE_MOVABLE) { 1172 *zone_start_pfn = zone_movable_pfn[nid]; 1173 *zone_end_pfn = min(node_end_pfn, 1174 arch_zone_highest_possible_pfn[movable_zone]); 1175 1176 /* Adjust for ZONE_MOVABLE starting within this range */ 1177 } else if (!mirrored_kernelcore && 1178 *zone_start_pfn < zone_movable_pfn[nid] && 1179 *zone_end_pfn > zone_movable_pfn[nid]) { 1180 *zone_end_pfn = zone_movable_pfn[nid]; 1181 1182 /* Check if this whole range is within ZONE_MOVABLE */ 1183 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 1184 *zone_start_pfn = *zone_end_pfn; 1185 } 1186 } 1187 1188 /* 1189 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 1190 * then all holes in the requested range will be accounted for. 
1191 */ 1192 static unsigned long __init __absent_pages_in_range(int nid, 1193 unsigned long range_start_pfn, 1194 unsigned long range_end_pfn) 1195 { 1196 unsigned long nr_absent = range_end_pfn - range_start_pfn; 1197 unsigned long start_pfn, end_pfn; 1198 int i; 1199 1200 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 1201 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 1202 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 1203 nr_absent -= end_pfn - start_pfn; 1204 } 1205 return nr_absent; 1206 } 1207 1208 /** 1209 * absent_pages_in_range - Return number of page frames in holes within a range 1210 * @start_pfn: The start PFN to start searching for holes 1211 * @end_pfn: The end PFN to stop searching for holes 1212 * 1213 * Return: the number of pages frames in memory holes within a range. 1214 */ 1215 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 1216 unsigned long end_pfn) 1217 { 1218 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 1219 } 1220 1221 /* Return the number of page frames in holes in a zone on a node */ 1222 static unsigned long __init zone_absent_pages_in_node(int nid, 1223 unsigned long zone_type, 1224 unsigned long zone_start_pfn, 1225 unsigned long zone_end_pfn) 1226 { 1227 unsigned long nr_absent; 1228 1229 /* zone is empty, we don't have any absent pages */ 1230 if (zone_start_pfn == zone_end_pfn) 1231 return 0; 1232 1233 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 1234 1235 /* 1236 * ZONE_MOVABLE handling. 1237 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages 1238 * and vice versa. 
1239 */ 1240 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 1241 unsigned long start_pfn, end_pfn; 1242 struct memblock_region *r; 1243 1244 for_each_mem_region(r) { 1245 start_pfn = clamp(memblock_region_memory_base_pfn(r), 1246 zone_start_pfn, zone_end_pfn); 1247 end_pfn = clamp(memblock_region_memory_end_pfn(r), 1248 zone_start_pfn, zone_end_pfn); 1249 1250 if (zone_type == ZONE_MOVABLE && 1251 memblock_is_mirror(r)) 1252 nr_absent += end_pfn - start_pfn; 1253 1254 if (zone_type == ZONE_NORMAL && 1255 !memblock_is_mirror(r)) 1256 nr_absent += end_pfn - start_pfn; 1257 } 1258 } 1259 1260 return nr_absent; 1261 } 1262 1263 /* 1264 * Return the number of pages a zone spans in a node, including holes 1265 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 1266 */ 1267 static unsigned long __init zone_spanned_pages_in_node(int nid, 1268 unsigned long zone_type, 1269 unsigned long node_start_pfn, 1270 unsigned long node_end_pfn, 1271 unsigned long *zone_start_pfn, 1272 unsigned long *zone_end_pfn) 1273 { 1274 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 1275 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 1276 1277 /* Get the start and end of the zone */ 1278 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 1279 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 1280 adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn, 1281 zone_start_pfn, zone_end_pfn); 1282 1283 /* Check that this node has pages within the zone's required range */ 1284 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 1285 return 0; 1286 1287 /* Move the zone boundaries inside the node if necessary */ 1288 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 1289 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 1290 1291 /* Return the spanned pages */ 1292 return *zone_end_pfn - *zone_start_pfn; 1293 } 1294 1295 static void __init reset_memoryless_node_totalpages(struct 
pglist_data *pgdat) 1296 { 1297 struct zone *z; 1298 1299 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) { 1300 z->zone_start_pfn = 0; 1301 z->spanned_pages = 0; 1302 z->present_pages = 0; 1303 #if defined(CONFIG_MEMORY_HOTPLUG) 1304 z->present_early_pages = 0; 1305 #endif 1306 } 1307 1308 pgdat->node_spanned_pages = 0; 1309 pgdat->node_present_pages = 0; 1310 pr_debug("On node %d totalpages: 0\n", pgdat->node_id); 1311 } 1312 1313 static void __init calc_nr_kernel_pages(void) 1314 { 1315 unsigned long start_pfn, end_pfn; 1316 phys_addr_t start_addr, end_addr; 1317 u64 u; 1318 #ifdef CONFIG_HIGHMEM 1319 unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; 1320 #endif 1321 1322 for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { 1323 start_pfn = PFN_UP(start_addr); 1324 end_pfn = PFN_DOWN(end_addr); 1325 1326 if (start_pfn < end_pfn) { 1327 nr_all_pages += end_pfn - start_pfn; 1328 #ifdef CONFIG_HIGHMEM 1329 start_pfn = clamp(start_pfn, 0, high_zone_low); 1330 end_pfn = clamp(end_pfn, 0, high_zone_low); 1331 #endif 1332 nr_kernel_pages += end_pfn - start_pfn; 1333 } 1334 } 1335 } 1336 1337 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 1338 unsigned long node_start_pfn, 1339 unsigned long node_end_pfn) 1340 { 1341 unsigned long realtotalpages = 0, totalpages = 0; 1342 enum zone_type i; 1343 1344 for (i = 0; i < MAX_NR_ZONES; i++) { 1345 struct zone *zone = pgdat->node_zones + i; 1346 unsigned long zone_start_pfn, zone_end_pfn; 1347 unsigned long spanned, absent; 1348 unsigned long real_size; 1349 1350 spanned = zone_spanned_pages_in_node(pgdat->node_id, i, 1351 node_start_pfn, 1352 node_end_pfn, 1353 &zone_start_pfn, 1354 &zone_end_pfn); 1355 absent = zone_absent_pages_in_node(pgdat->node_id, i, 1356 zone_start_pfn, 1357 zone_end_pfn); 1358 1359 real_size = spanned - absent; 1360 1361 if (spanned) 1362 zone->zone_start_pfn = zone_start_pfn; 1363 else 1364 
zone->zone_start_pfn = 0; 1365 zone->spanned_pages = spanned; 1366 zone->present_pages = real_size; 1367 #if defined(CONFIG_MEMORY_HOTPLUG) 1368 zone->present_early_pages = real_size; 1369 #endif 1370 1371 totalpages += spanned; 1372 realtotalpages += real_size; 1373 } 1374 1375 pgdat->node_spanned_pages = totalpages; 1376 pgdat->node_present_pages = realtotalpages; 1377 pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); 1378 } 1379 1380 #ifdef CONFIG_COMPACTION 1381 static void pgdat_init_kcompactd(struct pglist_data *pgdat) 1382 { 1383 init_waitqueue_head(&pgdat->kcompactd_wait); 1384 } 1385 #else 1386 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} 1387 #endif 1388 1389 static void __meminit pgdat_init_internals(struct pglist_data *pgdat) 1390 { 1391 int i; 1392 1393 pgdat_resize_init(pgdat); 1394 pgdat_kswapd_lock_init(pgdat); 1395 pgdat_init_kcompactd(pgdat); 1396 1397 init_waitqueue_head(&pgdat->kswapd_wait); 1398 init_waitqueue_head(&pgdat->pfmemalloc_wait); 1399 1400 for (i = 0; i < NR_VMSCAN_THROTTLE; i++) 1401 init_waitqueue_head(&pgdat->reclaim_wait[i]); 1402 1403 pgdat_page_ext_init(pgdat); 1404 lruvec_init(&pgdat->__lruvec); 1405 } 1406 1407 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 1408 unsigned long remaining_pages) 1409 { 1410 atomic_long_set(&zone->managed_pages, remaining_pages); 1411 zone_set_nid(zone, nid); 1412 zone->name = zone_names[idx]; 1413 zone->zone_pgdat = NODE_DATA(nid); 1414 spin_lock_init(&zone->lock); 1415 zone_seqlock_init(zone); 1416 zone_pcp_init(zone); 1417 } 1418 1419 static void __meminit zone_init_free_lists(struct zone *zone) 1420 { 1421 struct list_head *list; 1422 unsigned int order; 1423 1424 for_each_free_list(list, zone, order) 1425 INIT_LIST_HEAD(list); 1426 1427 for (order = 0; order < NR_PAGE_ORDERS; order++) 1428 zone->free_area[order].nr_free = 0; 1429 1430 #ifdef CONFIG_UNACCEPTED_MEMORY 1431 
INIT_LIST_HEAD(&zone->unaccepted_pages); 1432 #endif 1433 } 1434 1435 void __meminit init_currently_empty_zone(struct zone *zone, 1436 unsigned long zone_start_pfn, 1437 unsigned long size) 1438 { 1439 struct pglist_data *pgdat = zone->zone_pgdat; 1440 int zone_idx = zone_idx(zone) + 1; 1441 1442 if (zone_idx > pgdat->nr_zones) 1443 pgdat->nr_zones = zone_idx; 1444 1445 zone->zone_start_pfn = zone_start_pfn; 1446 1447 mminit_dprintk(MMINIT_TRACE, "memmap_init", 1448 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 1449 pgdat->node_id, 1450 (unsigned long)zone_idx(zone), 1451 zone_start_pfn, (zone_start_pfn + size)); 1452 1453 zone_init_free_lists(zone); 1454 zone->initialized = 1; 1455 } 1456 1457 #ifndef CONFIG_SPARSEMEM 1458 /* 1459 * Calculate the size of the zone->pageblock_flags rounded to an unsigned long 1460 * Start by making sure zonesize is a multiple of pageblock_order by rounding 1461 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 1462 * round what is now in bits to nearest long in bits, then return it in 1463 * bytes. 
1464 */ 1465 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 1466 { 1467 unsigned long usemapsize; 1468 1469 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 1470 usemapsize = round_up(zonesize, pageblock_nr_pages); 1471 usemapsize = usemapsize >> pageblock_order; 1472 usemapsize *= NR_PAGEBLOCK_BITS; 1473 usemapsize = round_up(usemapsize, BITS_PER_LONG); 1474 1475 return usemapsize / BITS_PER_BYTE; 1476 } 1477 1478 static void __ref setup_usemap(struct zone *zone) 1479 { 1480 unsigned long usemapsize = usemap_size(zone->zone_start_pfn, 1481 zone->spanned_pages); 1482 zone->pageblock_flags = NULL; 1483 if (usemapsize) { 1484 zone->pageblock_flags = 1485 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, 1486 zone_to_nid(zone)); 1487 if (!zone->pageblock_flags) 1488 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 1489 usemapsize, zone->name, zone_to_nid(zone)); 1490 } 1491 } 1492 #else 1493 static inline void setup_usemap(struct zone *zone) {} 1494 #endif /* CONFIG_SPARSEMEM */ 1495 1496 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 1497 1498 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 1499 void __init set_pageblock_order(void) 1500 { 1501 unsigned int order = PAGE_BLOCK_MAX_ORDER; 1502 1503 /* Check that pageblock_nr_pages has not already been setup */ 1504 if (pageblock_order) 1505 return; 1506 1507 /* Don't let pageblocks exceed the maximum allocation granularity. */ 1508 if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) 1509 order = HUGETLB_PAGE_ORDER; 1510 1511 /* 1512 * Assume the largest contiguous order of interest is a huge page. 1513 * This value may be variable depending on boot parameters on powerpc. 1514 */ 1515 pageblock_order = order; 1516 } 1517 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 1518 1519 /* 1520 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 1521 * is unused as pageblock_order is set at compile-time. 
See 1522 * include/linux/pageblock-flags.h for the values of pageblock_order based on 1523 * the kernel config 1524 */ 1525 void __init set_pageblock_order(void) 1526 { 1527 } 1528 1529 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 1530 1531 /* 1532 * Set up the zone data structures 1533 * - init pgdat internals 1534 * - init all zones belonging to this node 1535 * 1536 * NOTE: this function is only called during memory hotplug 1537 */ 1538 #ifdef CONFIG_MEMORY_HOTPLUG 1539 void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) 1540 { 1541 int nid = pgdat->node_id; 1542 enum zone_type z; 1543 int cpu; 1544 1545 pgdat_init_internals(pgdat); 1546 1547 if (pgdat->per_cpu_nodestats == &boot_nodestats) 1548 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 1549 1550 /* 1551 * Reset the nr_zones, order and highest_zoneidx before reuse. 1552 * Note that kswapd will init kswapd_highest_zoneidx properly 1553 * when it starts in the near future. 1554 */ 1555 pgdat->nr_zones = 0; 1556 pgdat->kswapd_order = 0; 1557 pgdat->kswapd_highest_zoneidx = 0; 1558 pgdat->node_start_pfn = 0; 1559 pgdat->node_present_pages = 0; 1560 1561 for_each_online_cpu(cpu) { 1562 struct per_cpu_nodestat *p; 1563 1564 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); 1565 memset(p, 0, sizeof(*p)); 1566 } 1567 1568 /* 1569 * When memory is hot-added, all the memory is in offline state. So 1570 * clear all zones' present_pages and managed_pages because they will 1571 * be updated in online_pages() and offline_pages(). 
1572 */ 1573 for (z = 0; z < MAX_NR_ZONES; z++) { 1574 struct zone *zone = pgdat->node_zones + z; 1575 1576 zone->present_pages = 0; 1577 zone_init_internals(zone, z, nid, 0); 1578 } 1579 } 1580 #endif 1581 1582 static void __init free_area_init_core(struct pglist_data *pgdat) 1583 { 1584 enum zone_type j; 1585 int nid = pgdat->node_id; 1586 1587 pgdat_init_internals(pgdat); 1588 pgdat->per_cpu_nodestats = &boot_nodestats; 1589 1590 for (j = 0; j < MAX_NR_ZONES; j++) { 1591 struct zone *zone = pgdat->node_zones + j; 1592 unsigned long size = zone->spanned_pages; 1593 1594 /* 1595 * Initialize zone->managed_pages as 0 , it will be reset 1596 * when memblock allocator frees pages into buddy system. 1597 */ 1598 zone_init_internals(zone, j, nid, zone->present_pages); 1599 1600 if (!size) 1601 continue; 1602 1603 setup_usemap(zone); 1604 init_currently_empty_zone(zone, zone->zone_start_pfn, size); 1605 } 1606 } 1607 1608 void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, 1609 phys_addr_t min_addr, int nid, bool exact_nid) 1610 { 1611 void *ptr; 1612 1613 /* 1614 * Kmemleak will explicitly scan mem_map by traversing all valid 1615 * `struct *page`,so memblock does not need to be added to the scan list. 
1616 */ 1617 if (exact_nid) 1618 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, 1619 MEMBLOCK_ALLOC_NOLEAKTRACE, 1620 nid); 1621 else 1622 ptr = memblock_alloc_try_nid_raw(size, align, min_addr, 1623 MEMBLOCK_ALLOC_NOLEAKTRACE, 1624 nid); 1625 1626 if (ptr && size > 0) 1627 page_init_poison(ptr, size); 1628 1629 return ptr; 1630 } 1631 1632 #ifdef CONFIG_FLATMEM 1633 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 1634 { 1635 unsigned long start, offset, size, end; 1636 struct page *map; 1637 1638 /* Skip empty nodes */ 1639 if (!pgdat->node_spanned_pages) 1640 return; 1641 1642 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 1643 offset = pgdat->node_start_pfn - start; 1644 /* 1645 * The zone's endpoints aren't required to be MAX_PAGE_ORDER 1646 * aligned but the node_mem_map endpoints must be in order 1647 * for the buddy allocator to function correctly. 1648 */ 1649 end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES); 1650 size = (end - start) * sizeof(struct page); 1651 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, 1652 pgdat->node_id, false); 1653 if (!map) 1654 panic("Failed to allocate %ld bytes for node %d memory map\n", 1655 size, pgdat->node_id); 1656 pgdat->node_mem_map = map + offset; 1657 memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); 1658 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", 1659 __func__, pgdat->node_id, (unsigned long)pgdat, 1660 (unsigned long)pgdat->node_mem_map); 1661 1662 /* the global mem_map is just set as node 0's */ 1663 WARN_ON(pgdat != NODE_DATA(0)); 1664 1665 mem_map = pgdat->node_mem_map; 1666 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 1667 mem_map -= offset; 1668 1669 max_mapnr = end - start; 1670 } 1671 #else 1672 static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } 1673 #endif /* CONFIG_FLATMEM */ 1674 1675 /** 1676 * get_pfn_range_for_nid - Return the start and end page frames for a node 1677 * @nid: The nid to return the 
range for. If MAX_NUMNODES, the min and max PFN are returned. 1678 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 1679 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 1680 * 1681 * It returns the start and end page frame of a node based on information 1682 * provided by memblock_set_node(). If called for a node 1683 * with no available memory, the start and end PFNs will be 0. 1684 */ 1685 void __init get_pfn_range_for_nid(unsigned int nid, 1686 unsigned long *start_pfn, unsigned long *end_pfn) 1687 { 1688 unsigned long this_start_pfn, this_end_pfn; 1689 int i; 1690 1691 *start_pfn = -1UL; 1692 *end_pfn = 0; 1693 1694 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 1695 *start_pfn = min(*start_pfn, this_start_pfn); 1696 *end_pfn = max(*end_pfn, this_end_pfn); 1697 } 1698 1699 if (*start_pfn == -1UL) 1700 *start_pfn = 0; 1701 } 1702 1703 static void __init free_area_init_node(int nid) 1704 { 1705 pg_data_t *pgdat = NODE_DATA(nid); 1706 unsigned long start_pfn = 0; 1707 unsigned long end_pfn = 0; 1708 1709 /* pg_data_t should be reset to zero when it's allocated */ 1710 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); 1711 1712 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1713 1714 pgdat->node_id = nid; 1715 pgdat->node_start_pfn = start_pfn; 1716 pgdat->per_cpu_nodestats = NULL; 1717 1718 if (start_pfn != end_pfn) { 1719 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 1720 (u64)start_pfn << PAGE_SHIFT, 1721 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 1722 1723 calculate_node_totalpages(pgdat, start_pfn, end_pfn); 1724 } else { 1725 pr_info("Initmem setup node %d as memoryless\n", nid); 1726 1727 reset_memoryless_node_totalpages(pgdat); 1728 } 1729 1730 alloc_node_mem_map(pgdat); 1731 pgdat_set_deferred_range(pgdat); 1732 1733 free_area_init_core(pgdat); 1734 lru_gen_init_pgdat(pgdat); 1735 } 1736 1737 /* Any regular or high memory on that node? 
*/ 1738 static void __init check_for_memory(pg_data_t *pgdat) 1739 { 1740 enum zone_type zone_type; 1741 1742 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 1743 struct zone *zone = &pgdat->node_zones[zone_type]; 1744 if (populated_zone(zone)) { 1745 if (IS_ENABLED(CONFIG_HIGHMEM)) 1746 node_set_state(pgdat->node_id, N_HIGH_MEMORY); 1747 if (zone_type <= ZONE_NORMAL) 1748 node_set_state(pgdat->node_id, N_NORMAL_MEMORY); 1749 break; 1750 } 1751 } 1752 } 1753 1754 #if MAX_NUMNODES > 1 1755 /* 1756 * Figure out the number of possible node ids. 1757 */ 1758 void __init setup_nr_node_ids(void) 1759 { 1760 unsigned int highest; 1761 1762 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 1763 nr_node_ids = highest + 1; 1764 } 1765 #endif 1766 1767 /* 1768 * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For 1769 * such cases we allow max_zone_pfn sorted in the descending order 1770 */ 1771 static bool arch_has_descending_max_zone_pfns(void) 1772 { 1773 return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); 1774 } 1775 1776 static void __init set_high_memory(void) 1777 { 1778 phys_addr_t highmem = memblock_end_of_DRAM(); 1779 1780 /* 1781 * Some architectures (e.g. ARM) set high_memory very early and 1782 * use it in arch setup code. 1783 * If an architecture already set high_memory don't overwrite it 1784 */ 1785 if (high_memory) 1786 return; 1787 1788 #ifdef CONFIG_HIGHMEM 1789 if (arch_has_descending_max_zone_pfns() || 1790 highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])) 1791 highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]); 1792 #endif 1793 1794 high_memory = phys_to_virt(highmem - 1) + 1; 1795 } 1796 1797 /** 1798 * free_area_init - Initialise all pg_data_t and zone data 1799 * 1800 * This will call free_area_init_node() for each active node in the system. 
1801 * Using the page ranges provided by memblock_set_node(), the size of each 1802 * zone in each node and their holes is calculated. If the maximum PFN 1803 * between two adjacent zones match, it is assumed that the zone is empty. 1804 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 1805 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 1806 * starts where the previous one ended. For example, ZONE_DMA32 starts 1807 * at arch_max_dma_pfn. 1808 */ 1809 static void __init free_area_init(void) 1810 { 1811 unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; 1812 unsigned long start_pfn, end_pfn; 1813 int i, nid, zone; 1814 bool descending; 1815 1816 arch_zone_limits_init(max_zone_pfn); 1817 sparse_init(); 1818 1819 start_pfn = PHYS_PFN(memblock_start_of_DRAM()); 1820 descending = arch_has_descending_max_zone_pfns(); 1821 1822 for (i = 0; i < MAX_NR_ZONES; i++) { 1823 if (descending) 1824 zone = MAX_NR_ZONES - i - 1; 1825 else 1826 zone = i; 1827 1828 if (zone == ZONE_MOVABLE) 1829 continue; 1830 1831 end_pfn = max(max_zone_pfn[zone], start_pfn); 1832 arch_zone_lowest_possible_pfn[zone] = start_pfn; 1833 arch_zone_highest_possible_pfn[zone] = end_pfn; 1834 1835 start_pfn = end_pfn; 1836 } 1837 1838 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 1839 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 1840 find_zone_movable_pfns_for_nodes(); 1841 1842 /* Print out the zone ranges */ 1843 pr_info("Zone ranges:\n"); 1844 for (i = 0; i < MAX_NR_ZONES; i++) { 1845 if (i == ZONE_MOVABLE) 1846 continue; 1847 pr_info(" %-8s ", zone_names[i]); 1848 if (arch_zone_lowest_possible_pfn[i] == 1849 arch_zone_highest_possible_pfn[i]) 1850 pr_cont("empty\n"); 1851 else 1852 pr_cont("[mem %#018Lx-%#018Lx]\n", 1853 (u64)arch_zone_lowest_possible_pfn[i] 1854 << PAGE_SHIFT, 1855 ((u64)arch_zone_highest_possible_pfn[i] 1856 << PAGE_SHIFT) - 1); 1857 } 1858 1859 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 1860 
pr_info("Movable zone start for each node\n"); 1861 for (i = 0; i < MAX_NUMNODES; i++) { 1862 if (zone_movable_pfn[i]) 1863 pr_info(" Node %d: %#018Lx\n", i, 1864 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 1865 } 1866 1867 /* 1868 * Print out the early node map, and initialize the 1869 * subsection-map relative to active online memory ranges to 1870 * enable future "sub-section" extensions of the memory map. 1871 */ 1872 pr_info("Early memory node ranges\n"); 1873 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 1874 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 1875 (u64)start_pfn << PAGE_SHIFT, 1876 ((u64)end_pfn << PAGE_SHIFT) - 1); 1877 sparse_init_subsection_map(start_pfn, end_pfn - start_pfn); 1878 } 1879 1880 /* Initialise every node */ 1881 mminit_verify_pageflags_layout(); 1882 setup_nr_node_ids(); 1883 set_pageblock_order(); 1884 1885 for_each_node(nid) { 1886 pg_data_t *pgdat; 1887 1888 /* 1889 * If an architecture has not allocated node data for 1890 * this node, presume the node is memoryless or offline. 1891 */ 1892 if (!NODE_DATA(nid)) 1893 alloc_offline_node_data(nid); 1894 1895 pgdat = NODE_DATA(nid); 1896 free_area_init_node(nid); 1897 1898 /* 1899 * No sysfs hierarchy will be created via register_node() 1900 *for memory-less node because here it's not marked as N_MEMORY 1901 *and won't be set online later. The benefit is userspace 1902 *program won't be confused by sysfs files/directories of 1903 *memory-less node. The pgdat will get fully initialized by 1904 *hotadd_init_pgdat() when memory is hotplugged into this node. 
1905 */ 1906 if (pgdat->node_present_pages) { 1907 node_set_state(nid, N_MEMORY); 1908 check_for_memory(pgdat); 1909 } 1910 } 1911 1912 for_each_node_state(nid, N_MEMORY) 1913 sparse_vmemmap_init_nid_late(nid); 1914 1915 calc_nr_kernel_pages(); 1916 memmap_init(); 1917 1918 /* disable hash distribution for systems with a single node */ 1919 fixup_hashdist(); 1920 1921 set_high_memory(); 1922 } 1923 1924 /** 1925 * node_map_pfn_alignment - determine the maximum internode alignment 1926 * 1927 * This function should be called after node map is populated and sorted. 1928 * It calculates the maximum power of two alignment which can distinguish 1929 * all the nodes. 1930 * 1931 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 1932 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 1933 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 1934 * shifted, 1GiB is enough and this function will indicate so. 1935 * 1936 * This is used to test whether pfn -> nid mapping of the chosen memory 1937 * model has fine enough granularity to avoid incorrect mapping for the 1938 * populated node map. 1939 * 1940 * Return: the determined alignment in pfn's. 0 if there is no alignment 1941 * requirement (single node). 1942 */ 1943 unsigned long __init node_map_pfn_alignment(void) 1944 { 1945 unsigned long accl_mask = 0, last_end = 0; 1946 unsigned long start, end, mask; 1947 int last_nid = NUMA_NO_NODE; 1948 int i, nid; 1949 1950 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 1951 if (!start || last_nid < 0 || last_nid == nid) { 1952 last_nid = nid; 1953 last_end = end; 1954 continue; 1955 } 1956 1957 /* 1958 * Start with a mask granular enough to pin-point to the 1959 * start pfn and tick off bits one-by-one until it becomes 1960 * too coarse to separate the current node from the last. 
1961 */ 1962 mask = ~((1 << __ffs(start)) - 1); 1963 while (mask && last_end <= (start & (mask << 1))) 1964 mask <<= 1; 1965 1966 /* accumulate all internode masks */ 1967 accl_mask |= mask; 1968 } 1969 1970 /* convert mask to number of pages */ 1971 return ~accl_mask + 1; 1972 } 1973 1974 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1975 static void __init deferred_free_pages(unsigned long pfn, 1976 unsigned long nr_pages, enum migratetype mt) 1977 { 1978 struct page *page; 1979 unsigned long i; 1980 1981 if (!nr_pages) 1982 return; 1983 1984 page = pfn_to_page(pfn); 1985 1986 /* Free a large naturally-aligned chunk if possible */ 1987 if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { 1988 for (i = 0; i < nr_pages; i += pageblock_nr_pages) 1989 init_pageblock_migratetype(page + i, mt, false); 1990 __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); 1991 return; 1992 } 1993 1994 /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ 1995 accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); 1996 1997 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1998 if (pageblock_aligned(pfn)) 1999 init_pageblock_migratetype(page, mt, false); 2000 __free_pages_core(page, 0, MEMINIT_EARLY); 2001 } 2002 } 2003 2004 /* Completion tracking for deferred_init_memmap() threads */ 2005 static atomic_t pgdat_init_n_undone __initdata; 2006 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 2007 2008 static inline void __init pgdat_init_report_one_done(void) 2009 { 2010 if (atomic_dec_and_test(&pgdat_init_n_undone)) 2011 complete(&pgdat_init_all_done_comp); 2012 } 2013 2014 /* 2015 * Initialize struct pages. We minimize pfn page lookups and scheduler checks 2016 * by performing it only once every MAX_ORDER_NR_PAGES. 2017 * Return number of pages initialized. 
2018 */ 2019 static unsigned long __init deferred_init_pages(struct zone *zone, 2020 unsigned long pfn, unsigned long end_pfn) 2021 { 2022 int nid = zone_to_nid(zone); 2023 unsigned long nr_pages = end_pfn - pfn; 2024 int zid = zone_idx(zone); 2025 struct page *page = pfn_to_page(pfn); 2026 2027 for (; pfn < end_pfn; pfn++, page++) 2028 __init_single_page(page, pfn, zid, nid); 2029 return nr_pages; 2030 } 2031 2032 /* 2033 * Initialize and free pages. 2034 * 2035 * At this point reserved pages and struct pages that correspond to holes in 2036 * memblock.memory are already initialized so every free range has a valid 2037 * memory map around it. 2038 * This ensures that access of pages that are ahead of the range being 2039 * initialized (computing buddy page in __free_one_page()) always reads a valid 2040 * struct page. 2041 * 2042 * In order to try and improve CPU cache locality we have the loop broken along 2043 * max page order boundaries. 2044 */ 2045 static unsigned long __init 2046 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, 2047 struct zone *zone, bool can_resched) 2048 { 2049 int nid = zone_to_nid(zone); 2050 unsigned long nr_pages = 0; 2051 phys_addr_t start, end; 2052 u64 i = 0; 2053 2054 for_each_free_mem_range(i, nid, 0, &start, &end, NULL) { 2055 unsigned long spfn = PFN_UP(start); 2056 unsigned long epfn = PFN_DOWN(end); 2057 enum migratetype mt = 2058 kho_scratch_migratetype(spfn, MIGRATE_MOVABLE); 2059 2060 if (spfn >= end_pfn) 2061 break; 2062 2063 spfn = max(spfn, start_pfn); 2064 epfn = min(epfn, end_pfn); 2065 2066 while (spfn < epfn) { 2067 unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES); 2068 unsigned long chunk_end = min(mo_pfn, epfn); 2069 2070 nr_pages += deferred_init_pages(zone, spfn, chunk_end); 2071 deferred_free_pages(spfn, chunk_end - spfn, mt); 2072 2073 spfn = chunk_end; 2074 2075 if (can_resched) 2076 cond_resched(); 2077 else 2078 touch_nmi_watchdog(); 2079 } 2080 } 2081 2082 return 
nr_pages; 2083 } 2084 2085 static void __init 2086 deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn, 2087 void *arg) 2088 { 2089 struct zone *zone = arg; 2090 2091 deferred_init_memmap_chunk(start_pfn, end_pfn, zone, true); 2092 } 2093 2094 static unsigned int __init 2095 deferred_page_init_max_threads(const struct cpumask *node_cpumask) 2096 { 2097 return max(cpumask_weight(node_cpumask), 1U); 2098 } 2099 2100 /* Initialise remaining memory on a node */ 2101 static int __init deferred_init_memmap(void *data) 2102 { 2103 pg_data_t *pgdat = data; 2104 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 2105 int max_threads = deferred_page_init_max_threads(cpumask); 2106 unsigned long first_init_pfn, last_pfn, flags; 2107 unsigned long start = jiffies; 2108 struct zone *zone; 2109 2110 /* Bind memory initialisation thread to a local node if possible */ 2111 if (!cpumask_empty(cpumask)) 2112 set_cpus_allowed_ptr(current, cpumask); 2113 2114 pgdat_resize_lock(pgdat, &flags); 2115 first_init_pfn = pgdat->first_deferred_pfn; 2116 if (first_init_pfn == ULONG_MAX) { 2117 pgdat_resize_unlock(pgdat, &flags); 2118 pgdat_init_report_one_done(); 2119 return 0; 2120 } 2121 2122 /* Sanity check boundaries */ 2123 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 2124 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 2125 pgdat->first_deferred_pfn = ULONG_MAX; 2126 2127 /* 2128 * Once we unlock here, the zone cannot be grown anymore, thus if an 2129 * interrupt thread must allocate this early in boot, zone must be 2130 * pre-grown prior to start of deferred page initialization. 
2131 */ 2132 pgdat_resize_unlock(pgdat, &flags); 2133 2134 /* Only the highest zone is deferred */ 2135 zone = pgdat->node_zones + pgdat->nr_zones - 1; 2136 last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone)); 2137 2138 struct padata_mt_job job = { 2139 .thread_fn = deferred_init_memmap_job, 2140 .fn_arg = zone, 2141 .start = first_init_pfn, 2142 .size = last_pfn - first_init_pfn, 2143 .align = PAGES_PER_SECTION, 2144 .min_chunk = PAGES_PER_SECTION, 2145 .max_threads = max_threads, 2146 .numa_aware = false, 2147 }; 2148 2149 padata_do_multithreaded(&job); 2150 2151 /* Sanity check that the next zone really is unpopulated */ 2152 WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone)); 2153 2154 pr_info("node %d deferred pages initialised in %ums\n", 2155 pgdat->node_id, jiffies_to_msecs(jiffies - start)); 2156 2157 pgdat_init_report_one_done(); 2158 return 0; 2159 } 2160 2161 /* 2162 * If this zone has deferred pages, try to grow it by initializing enough 2163 * deferred pages to satisfy the allocation specified by order, rounded up to 2164 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 2165 * of SECTION_SIZE bytes by initializing struct pages in increments of 2166 * PAGES_PER_SECTION * sizeof(struct page) bytes. 2167 * 2168 * Return true when zone was grown, otherwise return false. We return true even 2169 * when we grow less than requested, to let the caller decide if there are 2170 * enough pages to satisfy the allocation. 
2171 */ 2172 bool __init deferred_grow_zone(struct zone *zone, unsigned int order) 2173 { 2174 unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order); 2175 pg_data_t *pgdat = zone->zone_pgdat; 2176 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; 2177 unsigned long spfn, epfn, flags; 2178 unsigned long nr_pages = 0; 2179 2180 /* Only the last zone may have deferred pages */ 2181 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) 2182 return false; 2183 2184 pgdat_resize_lock(pgdat, &flags); 2185 2186 /* 2187 * If someone grew this zone while we were waiting for spinlock, return 2188 * true, as there might be enough pages already. 2189 */ 2190 if (first_deferred_pfn != pgdat->first_deferred_pfn) { 2191 pgdat_resize_unlock(pgdat, &flags); 2192 return true; 2193 } 2194 2195 /* 2196 * Initialize at least nr_pages_needed in section chunks. 2197 * If a section has less free memory than nr_pages_needed, the next 2198 * section will be also initialized. 2199 * Note, that it still does not guarantee that allocation of order can 2200 * be satisfied if the sections are fragmented because of memblock 2201 * allocations. 2202 */ 2203 for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1); 2204 nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone); 2205 spfn = epfn, epfn += PAGES_PER_SECTION) { 2206 nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone, false); 2207 } 2208 2209 /* 2210 * There were no pages to initialize and free which means the zone's 2211 * memory map is completely initialized. 2212 */ 2213 pgdat->first_deferred_pfn = nr_pages ? 
spfn : ULONG_MAX; 2214 2215 pgdat_resize_unlock(pgdat, &flags); 2216 2217 return nr_pages > 0; 2218 } 2219 2220 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 2221 2222 #ifdef CONFIG_CMA 2223 void __init init_cma_reserved_pageblock(struct page *page) 2224 { 2225 unsigned i = pageblock_nr_pages; 2226 struct page *p = page; 2227 2228 do { 2229 __ClearPageReserved(p); 2230 set_page_count(p, 0); 2231 } while (++p, --i); 2232 2233 init_pageblock_migratetype(page, MIGRATE_CMA, false); 2234 set_page_refcounted(page); 2235 /* pages were reserved and not allocated */ 2236 clear_page_tag_ref(page); 2237 __free_pages(page, pageblock_order); 2238 2239 adjust_managed_page_count(page, pageblock_nr_pages); 2240 page_zone(page)->cma_pages += pageblock_nr_pages; 2241 } 2242 /* 2243 * Similar to above, but only set the migrate type and stats. 2244 */ 2245 void __init init_cma_pageblock(struct page *page) 2246 { 2247 init_pageblock_migratetype(page, MIGRATE_CMA, false); 2248 adjust_managed_page_count(page, pageblock_nr_pages); 2249 page_zone(page)->cma_pages += pageblock_nr_pages; 2250 } 2251 #endif 2252 2253 void set_zone_contiguous(struct zone *zone) 2254 { 2255 unsigned long block_start_pfn = zone->zone_start_pfn; 2256 unsigned long block_end_pfn; 2257 2258 block_end_pfn = pageblock_end_pfn(block_start_pfn); 2259 for (; block_start_pfn < zone_end_pfn(zone); 2260 block_start_pfn = block_end_pfn, 2261 block_end_pfn += pageblock_nr_pages) { 2262 2263 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 2264 2265 if (!__pageblock_pfn_to_page(block_start_pfn, 2266 block_end_pfn, zone)) 2267 return; 2268 cond_resched(); 2269 } 2270 2271 /* We confirm that there is no hole */ 2272 zone->contiguous = true; 2273 } 2274 2275 /* 2276 * Check if a PFN range intersects multiple zones on one or more 2277 * NUMA nodes. Specify the @nid argument if it is known that this 2278 * PFN range is on one node, NUMA_NO_NODE otherwise. 
2279 */ 2280 bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, 2281 unsigned long nr_pages) 2282 { 2283 struct zone *zone, *izone = NULL; 2284 2285 for_each_zone(zone) { 2286 if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid) 2287 continue; 2288 2289 if (zone_intersects(zone, start_pfn, nr_pages)) { 2290 if (izone != NULL) 2291 return true; 2292 izone = zone; 2293 } 2294 2295 } 2296 2297 return false; 2298 } 2299 2300 static void __init mem_init_print_info(void); 2301 void __init page_alloc_init_late(void) 2302 { 2303 struct zone *zone; 2304 int nid; 2305 2306 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 2307 2308 /* There will be num_node_state(N_MEMORY) threads */ 2309 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 2310 for_each_node_state(nid, N_MEMORY) { 2311 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 2312 } 2313 2314 /* Block until all are initialised */ 2315 wait_for_completion(&pgdat_init_all_done_comp); 2316 2317 /* 2318 * We initialized the rest of the deferred pages. Permanently disable 2319 * on-demand struct page initialization. 2320 */ 2321 static_branch_disable(&deferred_pages); 2322 2323 /* Reinit limits that are based on free pages after the kernel is up */ 2324 files_maxfiles_init(); 2325 #endif 2326 2327 /* Accounting of total+free memory is stable at this point. */ 2328 mem_init_print_info(); 2329 buffer_init(); 2330 2331 /* Discard memblock private memory */ 2332 memblock_discard(); 2333 2334 for_each_node_state(nid, N_MEMORY) 2335 shuffle_free_memory(NODE_DATA(nid)); 2336 2337 for_each_populated_zone(zone) 2338 set_zone_contiguous(zone); 2339 2340 /* Initialize page ext after all struct pages are initialized. */ 2341 if (deferred_struct_pages) 2342 page_ext_init(); 2343 2344 page_alloc_sysctl_init(); 2345 } 2346 2347 /* 2348 * Adaptive scale is meant to reduce sizes of hash tables on large memory 2349 * machines. 
As memory size is increased the scale is also increased but at 2350 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 2351 * quadruples the scale is increased by one, which means the size of hash table 2352 * only doubles, instead of quadrupling as well. 2353 * Because 32-bit systems cannot have large physical memory, where this scaling 2354 * makes sense, it is disabled on such platforms. 2355 */ 2356 #if __BITS_PER_LONG > 32 2357 #define ADAPT_SCALE_BASE (64ul << 30) 2358 #define ADAPT_SCALE_SHIFT 2 2359 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 2360 #endif 2361 2362 /* 2363 * allocate a large system hash table from bootmem 2364 * - it is assumed that the hash table must contain an exact power-of-2 2365 * quantity of entries 2366 * - limit is the number of hash buckets, not the total allocation size 2367 */ 2368 void *__init alloc_large_system_hash(const char *tablename, 2369 unsigned long bucketsize, 2370 unsigned long numentries, 2371 int scale, 2372 int flags, 2373 unsigned int *_hash_shift, 2374 unsigned int *_hash_mask, 2375 unsigned long low_limit, 2376 unsigned long high_limit) 2377 { 2378 unsigned long long max = high_limit; 2379 unsigned long log2qty, size; 2380 void *table; 2381 gfp_t gfp_flags; 2382 bool virt; 2383 bool huge; 2384 2385 /* allow the kernel cmdline to have a say */ 2386 if (!numentries) { 2387 /* round applicable memory size up to nearest megabyte */ 2388 numentries = nr_kernel_pages; 2389 2390 /* It isn't necessary when PAGE_SIZE >= 1MB */ 2391 if (PAGE_SIZE < SZ_1M) 2392 numentries = round_up(numentries, SZ_1M / PAGE_SIZE); 2393 2394 #if __BITS_PER_LONG > 32 2395 if (!high_limit) { 2396 unsigned long adapt; 2397 2398 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 2399 adapt <<= ADAPT_SCALE_SHIFT) 2400 scale++; 2401 } 2402 #endif 2403 2404 /* limit to 1 bucket per 2^scale bytes of low memory */ 2405 if (scale > PAGE_SHIFT) 2406 numentries >>= (scale - PAGE_SHIFT); 2407 else 2408 numentries 
<<= (PAGE_SHIFT - scale); 2409 2410 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 2411 numentries = PAGE_SIZE / bucketsize; 2412 } 2413 numentries = roundup_pow_of_two(numentries); 2414 2415 /* limit allocation size to 1/16 total memory by default */ 2416 if (max == 0) { 2417 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 2418 max = div64_ul(max, bucketsize); 2419 } 2420 max = min(max, 0x80000000ULL); 2421 2422 if (numentries < low_limit) 2423 numentries = low_limit; 2424 if (numentries > max) 2425 numentries = max; 2426 2427 log2qty = ilog2(numentries); 2428 2429 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 2430 do { 2431 virt = false; 2432 size = bucketsize << log2qty; 2433 if (flags & HASH_EARLY) { 2434 if (flags & HASH_ZERO) 2435 table = memblock_alloc(size, SMP_CACHE_BYTES); 2436 else 2437 table = memblock_alloc_raw(size, 2438 SMP_CACHE_BYTES); 2439 } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) { 2440 table = vmalloc_huge(size, gfp_flags); 2441 virt = true; 2442 if (table) 2443 huge = is_vm_area_hugepages(table); 2444 } else { 2445 /* 2446 * If bucketsize is not a power-of-two, we may free 2447 * some pages at the end of hash table which 2448 * alloc_pages_exact() automatically does 2449 */ 2450 table = alloc_pages_exact(size, gfp_flags); 2451 kmemleak_alloc(table, size, 1, gfp_flags); 2452 } 2453 } while (!table && size > PAGE_SIZE && --log2qty); 2454 2455 if (!table) 2456 panic("Failed to allocate %s hash table\n", tablename); 2457 2458 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", 2459 tablename, 1UL << log2qty, get_order(size), size, 2460 virt ? (huge ? 
"vmalloc hugepage" : "vmalloc") : "linear"); 2461 2462 if (_hash_shift) 2463 *_hash_shift = log2qty; 2464 if (_hash_mask) 2465 *_hash_mask = (1 << log2qty) - 1; 2466 2467 return table; 2468 } 2469 2470 void __init memblock_free_pages(unsigned long pfn, unsigned int order) 2471 { 2472 struct page *page = pfn_to_page(pfn); 2473 2474 if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { 2475 int nid = early_pfn_to_nid(pfn); 2476 2477 if (!early_page_initialised(pfn, nid)) 2478 return; 2479 } 2480 2481 if (!kmsan_memblock_free_pages(page, order)) { 2482 /* KMSAN will take care of these pages. */ 2483 return; 2484 } 2485 2486 /* pages were reserved and not allocated */ 2487 clear_page_tag_ref(page); 2488 __free_pages_core(page, order, MEMINIT_EARLY); 2489 } 2490 2491 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); 2492 EXPORT_SYMBOL(init_on_alloc); 2493 2494 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); 2495 EXPORT_SYMBOL(init_on_free); 2496 2497 static bool _init_on_alloc_enabled_early __read_mostly 2498 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); 2499 static int __init early_init_on_alloc(char *buf) 2500 { 2501 2502 return kstrtobool(buf, &_init_on_alloc_enabled_early); 2503 } 2504 early_param("init_on_alloc", early_init_on_alloc); 2505 2506 static bool _init_on_free_enabled_early __read_mostly 2507 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); 2508 static int __init early_init_on_free(char *buf) 2509 { 2510 return kstrtobool(buf, &_init_on_free_enabled_early); 2511 } 2512 early_param("init_on_free", early_init_on_free); 2513 2514 DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); 2515 2516 static bool check_pages_enabled_early __initdata; 2517 2518 static int __init early_check_pages(char *buf) 2519 { 2520 return kstrtobool(buf, &check_pages_enabled_early); 2521 } 2522 early_param("check_pages", early_check_pages); 2523 2524 /* 2525 * Enable static keys related to various memory debugging and hardening 
options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
static void __init mem_debugging_and_hardening_init(void)
{
	bool page_poisoning_requested = false;
	bool want_check_pages = check_pages_enabled_early;

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If
	 * either of those options are enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled())) {
		static_branch_enable(&_page_poisoning_enabled);
		page_poisoning_requested = true;
		want_check_pages = true;
	}
#endif

	/* Poisoning wins: drop both init_on_* requests before acting on them */
	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
	    page_poisoning_requested) {
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
			"will take precedence over init_on_alloc and init_on_free\n");
		_init_on_alloc_enabled_early = false;
		_init_on_free_enabled_early = false;
	}

	/* Set each key explicitly in both directions to the final decision */
	if (_init_on_alloc_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_alloc);
	} else {
		static_branch_disable(&init_on_alloc);
	}

	if (_init_on_free_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_free);
	} else {
		static_branch_disable(&init_on_free);
	}

	/* Informational only: the settings are left as they are */
	if (IS_ENABLED(CONFIG_KMSAN) &&
	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (debug_pagealloc_enabled()) {
		want_check_pages = true;
		static_branch_enable(&_debug_pagealloc_enabled);

		/* Guard pages are only enabled for a non-zero minorder */
		if (debug_guardpage_minorder())
			static_branch_enable(&_debug_guardpage_enabled);
	}
#endif

	/*
	 * Any page debugging or hardening option also enables sanity checking
	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
	 * enabled already.
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
		static_branch_enable(&check_pages_enabled);
}

/* Report memory auto-initialization states for this boot. */
static void __init report_meminit(void)
{
	const char *stack;

	/* Stack auto-init is a build-time choice; pick the matching label */
	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
		stack = "all(pattern)";
	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
		stack = "all(zero)";
	else
		stack = "off";

	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
		stack, str_on_off(want_init_on_alloc(GFP_KERNEL)),
		str_on_off(want_init_on_free()));
	if (want_init_on_free())
		pr_info("mem auto-init: clearing system memory may take some time...\n");
}

/*
 * Print the one-line "Memory: ..." summary: free and physical memory plus
 * the sizes of the kernel image sections, computed from the linker symbols.
 */
static void __init mem_init_print_info(void)
{
	unsigned long physpages, codesize, datasize, rosize, bss_size;
	unsigned long init_code_size, init_data_size;

	physpages = get_num_physpages();
	codesize = _etext - _stext;
	datasize = _edata - _sdata;
	rosize = __end_rodata - __start_rodata;
	bss_size = __bss_stop - __bss_start;
	init_data_size = __init_end - __init_begin;
	init_code_size = _einittext - _sinittext;

	/*
	 * Detect special cases and adjust section sizes accordingly:
	 * 1) .init.* may be embedded into .data sections
	 * 2) .init.text.* may be out of [__init_begin, __init_end],
	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
	 *    NOTE(review): arch/tile appears to have been removed from the
	 *    tree - confirm and point at a current example.
	 * 3) .rodata.* may be embedded into .text or .data sections.
	 */
	/*
	 * adj_init_size(): if @pos lies inside [@start, @end) and @size is
	 * larger than @adj, subtract @adj from @size so the nested section
	 * is not counted twice.
	 */
#define adj_init_size(start, end, size, pos, adj) \
	do { \
		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
			size -= adj; \
	} while (0)

	adj_init_size(__init_begin, __init_end, init_data_size,
		     _sinittext, init_code_size);
	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);

#undef adj_init_size

	/* The highmem field and its argument are both under CONFIG_HIGHMEM */
	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
		", %luK highmem"
#endif
		")\n",
		K(nr_free_pages()), K(physpages),
		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
		K(physpages - totalram_pages() - totalcma_pages),
		K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
		, K(totalhigh_pages())
#endif
		);
}

#ifndef __HAVE_COLOR_ZERO_PAGE
/*
 * Weak default: point __zero_page at the struct page of empty_zero_page.
 * Architectures that define __HAVE_COLOR_ZERO_PAGE do not get this default
 * and must provide arch_setup_zero_pages() themselves.
 */
void __init __weak arch_setup_zero_pages(void)
{
	__zero_page = virt_to_page(empty_zero_page);
}
#endif

/* Set up the zero page(s), then record the pfn of ZERO_PAGE(0) */
static void __init init_zero_page_pfn(void)
{
	arch_setup_zero_pages();
	zero_page_pfn = page_to_pfn(ZERO_PAGE(0));
}

/* Weak no-op default; called first thing in mm_core_init() */
void __init __weak arch_mm_preinit(void)
{
}

/* Weak no-op default; called from mm_core_init() after memblock_free_all() */
void __init __weak mem_init(void)
{
}

/*
 * Early core MM setup: hugetlb CMA reservation and bootmem allocation,
 * followed by free_area_init().
 */
void __init mm_core_init_early(void)
{
	hugetlb_cma_reserve();
	hugetlb_bootmem_alloc();

	free_area_init();
}

/*
 * Set up kernel memory allocators
 */
void __init mm_core_init(void)
{
	arch_mm_preinit();
	init_zero_page_pfn();

	/* Initializations relying on SMP setup */
	BUILD_BUG_ON(MAX_ZONELISTS > 2);
	build_all_zonelists(NULL);
	page_alloc_init_cpuhp();
	alloc_tag_sec_init();
	/*
	 * page_ext requires contiguous pages,
	 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
	 */
	page_ext_init_flatmem();
	mem_debugging_and_hardening_init();
	kfence_alloc_pool_and_metadata();
	report_meminit();
	kmsan_init_shadow();
	stack_depot_early_init();

	/*
	 * KHO memory setup must happen while memblock is still active, but
	 * as close as possible to buddy initialization
	 */
	kho_memory_init();

	memblock_free_all();
	mem_init();
	kmem_cache_init();
	/*
	 * page_owner must be initialized after buddy is ready, and also after
	 * slab is ready so that stack_depot_init() works properly
	 */
	page_ext_init_flatmem_late();
	kmemleak_init();
	ptlock_cache_init();
	pgtable_cache_init();
	debug_objects_mem_init();
	vmalloc_init();
	/*
	 * If struct page init is not deferred, initialize page_ext now, as
	 * vmap is fully initialized.
	 */
	if (!deferred_struct_pages)
		page_ext_init();
	/* Should be run before the first non-init thread is created */
	init_espfix_bsp();
	/* Should be run after espfix64 is set up. */
	pti_init();
	kmsan_init_runtime();
	mm_cache_init();
	execmem_init();
}