1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/memory_hotplug.c 4 * 5 * Copyright (C) 6 */ 7 8 #include <linux/stddef.h> 9 #include <linux/mm.h> 10 #include <linux/sched/signal.h> 11 #include <linux/swap.h> 12 #include <linux/interrupt.h> 13 #include <linux/pagemap.h> 14 #include <linux/compiler.h> 15 #include <linux/export.h> 16 #include <linux/writeback.h> 17 #include <linux/slab.h> 18 #include <linux/sysctl.h> 19 #include <linux/cpu.h> 20 #include <linux/memory.h> 21 #include <linux/memremap.h> 22 #include <linux/memory_hotplug.h> 23 #include <linux/vmalloc.h> 24 #include <linux/ioport.h> 25 #include <linux/delay.h> 26 #include <linux/migrate.h> 27 #include <linux/page-isolation.h> 28 #include <linux/pfn.h> 29 #include <linux/suspend.h> 30 #include <linux/mm_inline.h> 31 #include <linux/firmware-map.h> 32 #include <linux/stop_machine.h> 33 #include <linux/hugetlb.h> 34 #include <linux/memblock.h> 35 #include <linux/compaction.h> 36 #include <linux/rmap.h> 37 #include <linux/module.h> 38 39 #include <asm/tlbflush.h> 40 41 #include "internal.h" 42 #include "shuffle.h" 43 44 enum { 45 MEMMAP_ON_MEMORY_DISABLE = 0, 46 MEMMAP_ON_MEMORY_ENABLE, 47 MEMMAP_ON_MEMORY_FORCE, 48 }; 49 50 static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; 51 52 static inline unsigned long memory_block_memmap_size(void) 53 { 54 return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); 55 } 56 57 static inline unsigned long memory_block_memmap_on_memory_pages(void) 58 { 59 unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); 60 61 /* 62 * In "forced" memmap_on_memory mode, we add extra pages to align the 63 * vmemmap size to cover full pageblocks. That way, we can add memory 64 * even if the vmemmap size is not properly aligned, however, we might waste 65 * memory. 66 */ 67 if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) 68 return pageblock_align(nr_pages); 69 return nr_pages; 70 } 71 72 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY 73 /* 74 * memory_hotplug.memmap_on_memory parameter 75 */ 76 static int set_memmap_mode(const char *val, const struct kernel_param *kp) 77 { 78 int ret, mode; 79 bool enabled; 80 81 if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { 82 mode = MEMMAP_ON_MEMORY_FORCE; 83 } else { 84 ret = kstrtobool(val, &enabled); 85 if (ret < 0) 86 return ret; 87 if (enabled) 88 mode = MEMMAP_ON_MEMORY_ENABLE; 89 else 90 mode = MEMMAP_ON_MEMORY_DISABLE; 91 } 92 *((int *)kp->arg) = mode; 93 if (mode == MEMMAP_ON_MEMORY_FORCE) { 94 unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 95 96 pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", 97 memmap_pages - PFN_UP(memory_block_memmap_size())); 98 } 99 return 0; 100 } 101 102 static int get_memmap_mode(char *buffer, const struct kernel_param *kp) 103 { 104 int mode = *((int *)kp->arg); 105 106 if (mode == MEMMAP_ON_MEMORY_FORCE) 107 return sprintf(buffer, "force\n"); 108 return sprintf(buffer, "%c\n", mode ? 'Y' : 'N'); 109 } 110 111 static const struct kernel_param_ops memmap_mode_ops = { 112 .set = set_memmap_mode, 113 .get = get_memmap_mode, 114 }; 115 module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); 116 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" 117 "With value \"force\" it could result in memory wastage due " 118 "to memmap size limitations (Y/N/force)"); 119 120 static inline bool mhp_memmap_on_memory(void) 121 { 122 return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; 123 } 124 #else 125 static inline bool mhp_memmap_on_memory(void) 126 { 127 return false; 128 } 129 #endif 130 131 enum { 132 ONLINE_POLICY_CONTIG_ZONES = 0, 133 ONLINE_POLICY_AUTO_MOVABLE, 134 }; 135 136 static const char * const online_policy_to_str[] = { 137 [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", 138 [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", 139 }; 140 141 static int set_online_policy(const char *val, const struct kernel_param *kp) 142 { 143 int ret = sysfs_match_string(online_policy_to_str, val); 144 145 if (ret < 0) 146 return ret; 147 *((int *)kp->arg) = ret; 148 return 0; 149 } 150 151 static int get_online_policy(char *buffer, const struct kernel_param *kp) 152 { 153 return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); 154 } 155 156 /* 157 * memory_hotplug.online_policy: configure online behavior when onlining without 158 * specifying a zone (MMOP_ONLINE) 159 * 160 * "contig-zones": keep zone contiguous 161 * "auto-movable": online memory to ZONE_MOVABLE if the configuration 162 * (auto_movable_ratio, auto_movable_numa_aware) allows for it 163 */ 164 static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; 165 static const struct kernel_param_ops online_policy_ops = { 166 .set = set_online_policy, 167 .get = get_online_policy, 168 }; 169 module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); 170 MODULE_PARM_DESC(online_policy, 171 "Set the online policy (\"contig-zones\", \"auto-movable\") " 172 "Default: \"contig-zones\""); 173 174 /* 175 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio 176 * 177 * The ratio represent an upper limit and the kernel might decide to not 178 * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory 179 * doesn't allow for more MOVABLE memory. 180 */ 181 static unsigned int auto_movable_ratio __read_mostly = 301; 182 module_param(auto_movable_ratio, uint, 0644); 183 MODULE_PARM_DESC(auto_movable_ratio, 184 "Set the maximum ratio of MOVABLE:KERNEL memory in the system " 185 "in percent for \"auto-movable\" online policy. Default: 301"); 186 187 /* 188 * memory_hotplug.auto_movable_numa_aware: consider numa node stats 189 */ 190 #ifdef CONFIG_NUMA 191 static bool auto_movable_numa_aware __read_mostly = true; 192 module_param(auto_movable_numa_aware, bool, 0644); 193 MODULE_PARM_DESC(auto_movable_numa_aware, 194 "Consider numa node stats in addition to global stats in " 195 "\"auto-movable\" online policy. Default: true"); 196 #endif /* CONFIG_NUMA */ 197 198 /* 199 * online_page_callback contains pointer to current page onlining function. 200 * Initially it is generic_online_page(). If it is required it could be 201 * changed by calling set_online_page_callback() for callback registration 202 * and restore_online_page_callback() for generic callback restore. 203 */ 204 205 static online_page_callback_t online_page_callback = generic_online_page; 206 static DEFINE_MUTEX(online_page_callback_lock); 207 208 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); 209 210 void get_online_mems(void) 211 { 212 percpu_down_read(&mem_hotplug_lock); 213 } 214 215 void put_online_mems(void) 216 { 217 percpu_up_read(&mem_hotplug_lock); 218 } 219 220 bool movable_node_enabled = false; 221 222 static int mhp_default_online_type = -1; 223 int mhp_get_default_online_type(void) 224 { 225 if (mhp_default_online_type >= 0) 226 return mhp_default_online_type; 227 228 if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) 229 mhp_default_online_type = MMOP_OFFLINE; 230 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) 231 mhp_default_online_type = MMOP_ONLINE; 232 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) 233 mhp_default_online_type = MMOP_ONLINE_KERNEL; 234 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) 235 mhp_default_online_type = MMOP_ONLINE_MOVABLE; 236 else 237 mhp_default_online_type = MMOP_OFFLINE; 238 239 return mhp_default_online_type; 240 } 241 242 void mhp_set_default_online_type(int online_type) 243 { 244 mhp_default_online_type = online_type; 245 } 246 247 static int __init setup_memhp_default_state(char *str) 248 { 249 const int online_type = mhp_online_type_from_str(str); 250 251 if (online_type >= 0) 252 mhp_default_online_type = online_type; 253 254 return 1; 255 } 256 __setup("memhp_default_state=", setup_memhp_default_state); 257 258 void mem_hotplug_begin(void) 259 { 260 cpus_read_lock(); 261 percpu_down_write(&mem_hotplug_lock); 262 } 263 264 void mem_hotplug_done(void) 265 { 266 percpu_up_write(&mem_hotplug_lock); 267 cpus_read_unlock(); 268 } 269 270 u64 max_mem_size = U64_MAX; 271 272 /* add this memory to iomem resource */ 273 static struct resource *register_memory_resource(u64 start, u64 size, 274 const char *resource_name) 275 { 276 struct resource *res; 277 unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 278 279 if (strcmp(resource_name, "System RAM")) 280 flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; 281 282 if (!mhp_range_allowed(start, size, true)) 283 return ERR_PTR(-E2BIG); 284 285 /* 286 * Make sure value parsed from 'mem=' only restricts memory adding 287 * while booting, so that memory hotplug won't be impacted. Please 288 * refer to document of 'mem=' in kernel-parameters.txt for more 289 * details. 290 */ 291 if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) 292 return ERR_PTR(-E2BIG); 293 294 /* 295 * Request ownership of the new memory range. This might be 296 * a child of an existing resource that was present but 297 * not marked as busy. 298 */ 299 res = __request_region(&iomem_resource, start, size, 300 resource_name, flags); 301 302 if (!res) { 303 pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", 304 start, start + size); 305 return ERR_PTR(-EEXIST); 306 } 307 return res; 308 } 309 310 static void release_memory_resource(struct resource *res) 311 { 312 if (!res) 313 return; 314 release_resource(res); 315 kfree(res); 316 } 317 318 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) 319 { 320 /* 321 * Disallow all operations smaller than a sub-section and only 322 * allow operations smaller than a section for 323 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() 324 * enforces a larger memory_block_size_bytes() granularity for 325 * memory that will be marked online, so this check should only 326 * fire for direct arch_{add,remove}_memory() users outside of 327 * add_memory_resource(). 328 */ 329 unsigned long min_align; 330 331 if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) 332 min_align = PAGES_PER_SUBSECTION; 333 else 334 min_align = PAGES_PER_SECTION; 335 if (!IS_ALIGNED(pfn | nr_pages, min_align)) 336 return -EINVAL; 337 return 0; 338 } 339 340 /* 341 * Return page for the valid pfn only if the page is online. All pfn 342 * walkers which rely on the fully initialized page->flags and others 343 * should use this rather than pfn_valid && pfn_to_page 344 */ 345 struct page *pfn_to_online_page(unsigned long pfn) 346 { 347 unsigned long nr = pfn_to_section_nr(pfn); 348 struct dev_pagemap *pgmap; 349 struct mem_section *ms; 350 351 if (nr >= NR_MEM_SECTIONS) 352 return NULL; 353 354 ms = __nr_to_section(nr); 355 if (!online_section(ms)) 356 return NULL; 357 358 /* 359 * Save some code text when online_section() + 360 * pfn_section_valid() are sufficient. 361 */ 362 if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) 363 return NULL; 364 365 if (!pfn_section_valid(ms, pfn)) 366 return NULL; 367 368 if (!online_device_section(ms)) 369 return pfn_to_page(pfn); 370 371 /* 372 * Slowpath: when ZONE_DEVICE collides with 373 * ZONE_{NORMAL,MOVABLE} within the same section some pfns in 374 * the section may be 'offline' but 'valid'. Only 375 * get_dev_pagemap() can determine sub-section online status. 376 */ 377 pgmap = get_dev_pagemap(pfn, NULL); 378 put_dev_pagemap(pgmap); 379 380 /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ 381 if (pgmap) 382 return NULL; 383 384 return pfn_to_page(pfn); 385 } 386 EXPORT_SYMBOL_GPL(pfn_to_online_page); 387 388 int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, 389 struct mhp_params *params) 390 { 391 const unsigned long end_pfn = pfn + nr_pages; 392 unsigned long cur_nr_pages; 393 int err; 394 struct vmem_altmap *altmap = params->altmap; 395 396 if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) 397 return -EINVAL; 398 399 VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); 400 401 if (altmap) { 402 /* 403 * Validate altmap is within bounds of the total request 404 */ 405 if (altmap->base_pfn != pfn 406 || vmem_altmap_offset(altmap) > nr_pages) { 407 pr_warn_once("memory add fail, invalid altmap\n"); 408 return -EINVAL; 409 } 410 altmap->alloc = 0; 411 } 412 413 if (check_pfn_span(pfn, nr_pages)) { 414 WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 415 return -EINVAL; 416 } 417 418 for (; pfn < end_pfn; pfn += cur_nr_pages) { 419 /* Select all remaining pages up to the next section boundary */ 420 cur_nr_pages = min(end_pfn - pfn, 421 SECTION_ALIGN_UP(pfn + 1) - pfn); 422 err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, 423 params->pgmap); 424 if (err) 425 break; 426 cond_resched(); 427 } 428 vmemmap_populate_print_last(); 429 return err; 430 } 431 432 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 433 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, 434 unsigned long start_pfn, 435 unsigned long end_pfn) 436 { 437 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { 438 if (unlikely(!pfn_to_online_page(start_pfn))) 439 continue; 440 441 if (unlikely(pfn_to_nid(start_pfn) != nid)) 442 continue; 443 444 if (zone != page_zone(pfn_to_page(start_pfn))) 445 continue; 446 447 return start_pfn; 448 } 449 450 return 0; 451 } 452 453 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 454 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, 455 unsigned long start_pfn, 456 unsigned long end_pfn) 457 { 458 unsigned long pfn; 459 460 /* pfn is the end pfn of a memory section. */ 461 pfn = end_pfn - 1; 462 for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { 463 if (unlikely(!pfn_to_online_page(pfn))) 464 continue; 465 466 if (unlikely(pfn_to_nid(pfn) != nid)) 467 continue; 468 469 if (zone != page_zone(pfn_to_page(pfn))) 470 continue; 471 472 return pfn; 473 } 474 475 return 0; 476 } 477 478 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 479 unsigned long end_pfn) 480 { 481 unsigned long pfn; 482 int nid = zone_to_nid(zone); 483 484 if (zone->zone_start_pfn == start_pfn) { 485 /* 486 * If the section is smallest section in the zone, it need 487 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 488 * In this case, we find second smallest valid mem_section 489 * for shrinking zone. 490 */ 491 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 492 zone_end_pfn(zone)); 493 if (pfn) { 494 zone->spanned_pages = zone_end_pfn(zone) - pfn; 495 zone->zone_start_pfn = pfn; 496 } else { 497 zone->zone_start_pfn = 0; 498 zone->spanned_pages = 0; 499 } 500 } else if (zone_end_pfn(zone) == end_pfn) { 501 /* 502 * If the section is biggest section in the zone, it need 503 * shrink zone->spanned_pages. 504 * In this case, we find second biggest valid mem_section for 505 * shrinking zone. 506 */ 507 pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, 508 start_pfn); 509 if (pfn) 510 zone->spanned_pages = pfn - zone->zone_start_pfn + 1; 511 else { 512 zone->zone_start_pfn = 0; 513 zone->spanned_pages = 0; 514 } 515 } 516 } 517 518 static void update_pgdat_span(struct pglist_data *pgdat) 519 { 520 unsigned long node_start_pfn = 0, node_end_pfn = 0; 521 struct zone *zone; 522 523 for (zone = pgdat->node_zones; 524 zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { 525 unsigned long end_pfn = zone_end_pfn(zone); 526 527 /* No need to lock the zones, they can't change. */ 528 if (!zone->spanned_pages) 529 continue; 530 if (!node_end_pfn) { 531 node_start_pfn = zone->zone_start_pfn; 532 node_end_pfn = end_pfn; 533 continue; 534 } 535 536 if (end_pfn > node_end_pfn) 537 node_end_pfn = end_pfn; 538 if (zone->zone_start_pfn < node_start_pfn) 539 node_start_pfn = zone->zone_start_pfn; 540 } 541 542 pgdat->node_start_pfn = node_start_pfn; 543 pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; 544 } 545 546 void remove_pfn_range_from_zone(struct zone *zone, 547 unsigned long start_pfn, 548 unsigned long nr_pages) 549 { 550 const unsigned long end_pfn = start_pfn + nr_pages; 551 struct pglist_data *pgdat = zone->zone_pgdat; 552 unsigned long pfn, cur_nr_pages; 553 554 /* Poison struct pages because they are now uninitialized again. */ 555 for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { 556 cond_resched(); 557 558 /* Select all remaining pages up to the next section boundary */ 559 cur_nr_pages = 560 min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); 561 page_init_poison(pfn_to_page(pfn), 562 sizeof(struct page) * cur_nr_pages); 563 } 564 565 /* 566 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So 567 * we will not try to shrink the zones - which is okay as 568 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. 569 */ 570 if (zone_is_zone_device(zone)) 571 return; 572 573 clear_zone_contiguous(zone); 574 575 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 576 update_pgdat_span(pgdat); 577 578 set_zone_contiguous(zone); 579 } 580 581 /** 582 * __remove_pages() - remove sections of pages 583 * @pfn: starting pageframe (must be aligned to start of a section) 584 * @nr_pages: number of pages to remove (must be multiple of section size) 585 * @altmap: alternative device page map or %NULL if default memmap is used 586 * 587 * Generic helper function to remove section mappings and sysfs entries 588 * for the section of the memory we are removing. Caller needs to make 589 * sure that pages are marked reserved and zones are adjust properly by 590 * calling offline_pages(). 591 */ 592 void __remove_pages(unsigned long pfn, unsigned long nr_pages, 593 struct vmem_altmap *altmap) 594 { 595 const unsigned long end_pfn = pfn + nr_pages; 596 unsigned long cur_nr_pages; 597 598 if (check_pfn_span(pfn, nr_pages)) { 599 WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 600 return; 601 } 602 603 for (; pfn < end_pfn; pfn += cur_nr_pages) { 604 cond_resched(); 605 /* Select all remaining pages up to the next section boundary */ 606 cur_nr_pages = min(end_pfn - pfn, 607 SECTION_ALIGN_UP(pfn + 1) - pfn); 608 sparse_remove_section(pfn, cur_nr_pages, altmap); 609 } 610 } 611 612 int set_online_page_callback(online_page_callback_t callback) 613 { 614 int rc = -EINVAL; 615 616 get_online_mems(); 617 mutex_lock(&online_page_callback_lock); 618 619 if (online_page_callback == generic_online_page) { 620 online_page_callback = callback; 621 rc = 0; 622 } 623 624 mutex_unlock(&online_page_callback_lock); 625 put_online_mems(); 626 627 return rc; 628 } 629 EXPORT_SYMBOL_GPL(set_online_page_callback); 630 631 int restore_online_page_callback(online_page_callback_t callback) 632 { 633 int rc = -EINVAL; 634 635 get_online_mems(); 636 mutex_lock(&online_page_callback_lock); 637 638 if (online_page_callback == callback) { 639 online_page_callback = generic_online_page; 640 rc = 0; 641 } 642 643 mutex_unlock(&online_page_callback_lock); 644 put_online_mems(); 645 646 return rc; 647 } 648 EXPORT_SYMBOL_GPL(restore_online_page_callback); 649 650 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 651 void generic_online_page(struct page *page, unsigned int order) 652 { 653 __free_pages_core(page, order, MEMINIT_HOTPLUG); 654 } 655 EXPORT_SYMBOL_GPL(generic_online_page); 656 657 static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) 658 { 659 const unsigned long end_pfn = start_pfn + nr_pages; 660 unsigned long pfn; 661 662 /* 663 * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might 664 * decide to not expose all pages to the buddy (e.g., expose them 665 * later). We account all pages as being online and belonging to this 666 * zone ("present"). 667 * When using memmap_on_memory, the range might not be aligned to 668 * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect 669 * this and the first chunk to online will be pageblock_nr_pages. 670 */ 671 for (pfn = start_pfn; pfn < end_pfn;) { 672 struct page *page = pfn_to_page(pfn); 673 int order; 674 675 /* 676 * Free to online pages in the largest chunks alignment allows. 677 * 678 * __ffs() behaviour is undefined for 0. start == 0 is 679 * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for 680 * the case. 681 */ 682 if (pfn) 683 order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); 684 else 685 order = MAX_PAGE_ORDER; 686 687 /* 688 * Exposing the page to the buddy by freeing can cause 689 * issues with debug_pagealloc enabled: some archs don't 690 * like double-unmappings. So treat them like any pages that 691 * were allocated from the buddy. 692 */ 693 debug_pagealloc_map_pages(page, 1 << order); 694 (*online_page_callback)(page, order); 695 pfn += (1UL << order); 696 } 697 698 /* mark all involved sections as online */ 699 online_mem_sections(start_pfn, end_pfn); 700 } 701 702 /* check which state of node_states will be changed when online memory */ 703 static void node_states_check_changes_online(unsigned long nr_pages, 704 struct zone *zone, struct memory_notify *arg) 705 { 706 int nid = zone_to_nid(zone); 707 708 arg->status_change_nid = NUMA_NO_NODE; 709 arg->status_change_nid_normal = NUMA_NO_NODE; 710 711 if (!node_state(nid, N_MEMORY)) 712 arg->status_change_nid = nid; 713 if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) 714 arg->status_change_nid_normal = nid; 715 } 716 717 static void node_states_set_node(int node, struct memory_notify *arg) 718 { 719 if (arg->status_change_nid_normal >= 0) 720 node_set_state(node, N_NORMAL_MEMORY); 721 722 if (arg->status_change_nid >= 0) 723 node_set_state(node, N_MEMORY); 724 } 725 726 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 727 unsigned long nr_pages) 728 { 729 unsigned long old_end_pfn = zone_end_pfn(zone); 730 731 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 732 zone->zone_start_pfn = start_pfn; 733 734 zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; 735 } 736 737 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, 738 unsigned long nr_pages) 739 { 740 unsigned long old_end_pfn = pgdat_end_pfn(pgdat); 741 742 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 743 pgdat->node_start_pfn = start_pfn; 744 745 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; 746 747 } 748 749 #ifdef CONFIG_ZONE_DEVICE 750 static void section_taint_zone_device(unsigned long pfn) 751 { 752 struct mem_section *ms = __pfn_to_section(pfn); 753 754 ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; 755 } 756 #else 757 static inline void section_taint_zone_device(unsigned long pfn) 758 { 759 } 760 #endif 761 762 /* 763 * Associate the pfn range with the given zone, initializing the memmaps 764 * and resizing the pgdat/zone data to span the added pages. After this 765 * call, all affected pages are PageOffline(). 766 * 767 * All aligned pageblocks are initialized to the specified migratetype 768 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 769 * zone stats (e.g., nr_isolate_pageblock) are touched. 770 */ 771 void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 772 unsigned long nr_pages, 773 struct vmem_altmap *altmap, int migratetype) 774 { 775 struct pglist_data *pgdat = zone->zone_pgdat; 776 int nid = pgdat->node_id; 777 778 clear_zone_contiguous(zone); 779 780 if (zone_is_empty(zone)) 781 init_currently_empty_zone(zone, start_pfn, nr_pages); 782 resize_zone_range(zone, start_pfn, nr_pages); 783 resize_pgdat_range(pgdat, start_pfn, nr_pages); 784 785 /* 786 * Subsection population requires care in pfn_to_online_page(). 787 * Set the taint to enable the slow path detection of 788 * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} 789 * section. 790 */ 791 if (zone_is_zone_device(zone)) { 792 if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) 793 section_taint_zone_device(start_pfn); 794 if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) 795 section_taint_zone_device(start_pfn + nr_pages); 796 } 797 798 /* 799 * TODO now we have a visible range of pages which are not associated 800 * with their zone properly. Not nice but set_pfnblock_flags_mask 801 * expects the zone spans the pfn range. All the pages in the range 802 * are reserved so nobody should be touching them so we should be safe 803 */ 804 memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, 805 MEMINIT_HOTPLUG, altmap, migratetype); 806 807 set_zone_contiguous(zone); 808 } 809 810 struct auto_movable_stats { 811 unsigned long kernel_early_pages; 812 unsigned long movable_pages; 813 }; 814 815 static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, 816 struct zone *zone) 817 { 818 if (zone_idx(zone) == ZONE_MOVABLE) { 819 stats->movable_pages += zone->present_pages; 820 } else { 821 stats->kernel_early_pages += zone->present_early_pages; 822 #ifdef CONFIG_CMA 823 /* 824 * CMA pages (never on hotplugged memory) behave like 825 * ZONE_MOVABLE. 826 */ 827 stats->movable_pages += zone->cma_pages; 828 stats->kernel_early_pages -= zone->cma_pages; 829 #endif /* CONFIG_CMA */ 830 } 831 } 832 struct auto_movable_group_stats { 833 unsigned long movable_pages; 834 unsigned long req_kernel_early_pages; 835 }; 836 837 static int auto_movable_stats_account_group(struct memory_group *group, 838 void *arg) 839 { 840 const int ratio = READ_ONCE(auto_movable_ratio); 841 struct auto_movable_group_stats *stats = arg; 842 long pages; 843 844 /* 845 * We don't support modifying the config while the auto-movable online 846 * policy is already enabled. Just avoid the division by zero below. 847 */ 848 if (!ratio) 849 return 0; 850 851 /* 852 * Calculate how many early kernel pages this group requires to 853 * satisfy the configured zone ratio. 854 */ 855 pages = group->present_movable_pages * 100 / ratio; 856 pages -= group->present_kernel_pages; 857 858 if (pages > 0) 859 stats->req_kernel_early_pages += pages; 860 stats->movable_pages += group->present_movable_pages; 861 return 0; 862 } 863 864 static bool auto_movable_can_online_movable(int nid, struct memory_group *group, 865 unsigned long nr_pages) 866 { 867 unsigned long kernel_early_pages, movable_pages; 868 struct auto_movable_group_stats group_stats = {}; 869 struct auto_movable_stats stats = {}; 870 struct zone *zone; 871 int i; 872 873 /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ 874 if (nid == NUMA_NO_NODE) { 875 /* TODO: cache values */ 876 for_each_populated_zone(zone) 877 auto_movable_stats_account_zone(&stats, zone); 878 } else { 879 for (i = 0; i < MAX_NR_ZONES; i++) { 880 pg_data_t *pgdat = NODE_DATA(nid); 881 882 zone = pgdat->node_zones + i; 883 if (populated_zone(zone)) 884 auto_movable_stats_account_zone(&stats, zone); 885 } 886 } 887 888 kernel_early_pages = stats.kernel_early_pages; 889 movable_pages = stats.movable_pages; 890 891 /* 892 * Kernel memory inside dynamic memory group allows for more MOVABLE 893 * memory within the same group. Remove the effect of all but the 894 * current group from the stats. 895 */ 896 walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, 897 group, &group_stats); 898 if (kernel_early_pages <= group_stats.req_kernel_early_pages) 899 return false; 900 kernel_early_pages -= group_stats.req_kernel_early_pages; 901 movable_pages -= group_stats.movable_pages; 902 903 if (group && group->is_dynamic) 904 kernel_early_pages += group->present_kernel_pages; 905 906 /* 907 * Test if we could online the given number of pages to ZONE_MOVABLE 908 * and still stay in the configured ratio. 909 */ 910 movable_pages += nr_pages; 911 return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; 912 } 913 914 /* 915 * Returns a default kernel memory zone for the given pfn range. 916 * If no kernel zone covers this pfn range it will automatically go 917 * to the ZONE_NORMAL. 918 */ 919 static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, 920 unsigned long nr_pages) 921 { 922 struct pglist_data *pgdat = NODE_DATA(nid); 923 int zid; 924 925 for (zid = 0; zid < ZONE_NORMAL; zid++) { 926 struct zone *zone = &pgdat->node_zones[zid]; 927 928 if (zone_intersects(zone, start_pfn, nr_pages)) 929 return zone; 930 } 931 932 return &pgdat->node_zones[ZONE_NORMAL]; 933 } 934 935 /* 936 * Determine to which zone to online memory dynamically based on user 937 * configuration and system stats. We care about the following ratio: 938 * 939 * MOVABLE : KERNEL 940 * 941 * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in 942 * one of the kernel zones. CMA pages inside one of the kernel zones really 943 * behaves like ZONE_MOVABLE, so we treat them accordingly. 944 * 945 * We don't allow for hotplugged memory in a KERNEL zone to increase the 946 * amount of MOVABLE memory we can have, so we end up with: 947 * 948 * MOVABLE : KERNEL_EARLY 949 * 950 * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze 951 * boot. We base our calculation on KERNEL_EARLY internally, because: 952 * 953 * a) Hotplugged memory in one of the kernel zones can sometimes still get 954 * hotunplugged, especially when hot(un)plugging individual memory blocks. 955 * There is no coordination across memory devices, therefore "automatic" 956 * hotunplugging, as implemented in hypervisors, could result in zone 957 * imbalances. 958 * b) Early/boot memory in one of the kernel zones can usually not get 959 * hotunplugged again (e.g., no firmware interface to unplug, fragmented 960 * with unmovable allocations). While there are corner cases where it might 961 * still work, it is barely relevant in practice. 962 * 963 * Exceptions are dynamic memory groups, which allow for more MOVABLE 964 * memory within the same memory group -- because in that case, there is 965 * coordination within the single memory device managed by a single driver. 966 * 967 * We rely on "present pages" instead of "managed pages", as the latter is 968 * highly unreliable and dynamic in virtualized environments, and does not 969 * consider boot time allocations. For example, memory ballooning adjusts the 970 * managed pages when inflating/deflating the balloon, and balloon compaction 971 * can even migrate inflated pages between zones. 972 * 973 * Using "present pages" is better but some things to keep in mind are: 974 * 975 * a) Some memblock allocations, such as for the crashkernel area, are 976 * effectively unused by the kernel, yet they account to "present pages". 977 * Fortunately, these allocations are comparatively small in relevant setups 978 * (e.g., fraction of system memory). 979 * b) Some hotplugged memory blocks in virtualized environments, esecially 980 * hotplugged by virtio-mem, look like they are completely present, however, 981 * only parts of the memory block are actually currently usable. 982 * "present pages" is an upper limit that can get reached at runtime. As 983 * we base our calculations on KERNEL_EARLY, this is not an issue. 984 */ 985 static struct zone *auto_movable_zone_for_pfn(int nid, 986 struct memory_group *group, 987 unsigned long pfn, 988 unsigned long nr_pages) 989 { 990 unsigned long online_pages = 0, max_pages, end_pfn; 991 struct page *page; 992 993 if (!auto_movable_ratio) 994 goto kernel_zone; 995 996 if (group && !group->is_dynamic) { 997 max_pages = group->s.max_pages; 998 online_pages = group->present_movable_pages; 999 1000 /* If anything is !MOVABLE online the rest !MOVABLE. */ 1001 if (group->present_kernel_pages) 1002 goto kernel_zone; 1003 } else if (!group || group->d.unit_pages == nr_pages) { 1004 max_pages = nr_pages; 1005 } else { 1006 max_pages = group->d.unit_pages; 1007 /* 1008 * Take a look at all online sections in the current unit. 1009 * We can safely assume that all pages within a section belong 1010 * to the same zone, because dynamic memory groups only deal 1011 * with hotplugged memory. 1012 */ 1013 pfn = ALIGN_DOWN(pfn, group->d.unit_pages); 1014 end_pfn = pfn + group->d.unit_pages; 1015 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1016 page = pfn_to_online_page(pfn); 1017 if (!page) 1018 continue; 1019 /* If anything is !MOVABLE online the rest !MOVABLE. */ 1020 if (!is_zone_movable_page(page)) 1021 goto kernel_zone; 1022 online_pages += PAGES_PER_SECTION; 1023 } 1024 } 1025 1026 /* 1027 * Online MOVABLE if we could *currently* online all remaining parts 1028 * MOVABLE. We expect to (add+) online them immediately next, so if 1029 * nobody interferes, all will be MOVABLE if possible. 1030 */ 1031 nr_pages = max_pages - online_pages; 1032 if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) 1033 goto kernel_zone; 1034 1035 #ifdef CONFIG_NUMA 1036 if (auto_movable_numa_aware && 1037 !auto_movable_can_online_movable(nid, group, nr_pages)) 1038 goto kernel_zone; 1039 #endif /* CONFIG_NUMA */ 1040 1041 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1042 kernel_zone: 1043 return default_kernel_zone_for_pfn(nid, pfn, nr_pages); 1044 } 1045 1046 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, 1047 unsigned long nr_pages) 1048 { 1049 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, 1050 nr_pages); 1051 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1052 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); 1053 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); 1054 1055 /* 1056 * We inherit the existing zone in a simple case where zones do not 1057 * overlap in the given range 1058 */ 1059 if (in_kernel ^ in_movable) 1060 return (in_kernel) ? kernel_zone : movable_zone; 1061 1062 /* 1063 * If the range doesn't belong to any zone or two zones overlap in the 1064 * given range then we use movable zone only if movable_node is 1065 * enabled because we always online to a kernel zone by default. 1066 */ 1067 return movable_node_enabled ? movable_zone : kernel_zone; 1068 } 1069 1070 struct zone *zone_for_pfn_range(int online_type, int nid, 1071 struct memory_group *group, unsigned long start_pfn, 1072 unsigned long nr_pages) 1073 { 1074 if (online_type == MMOP_ONLINE_KERNEL) 1075 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); 1076 1077 if (online_type == MMOP_ONLINE_MOVABLE) 1078 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1079 1080 if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) 1081 return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); 1082 1083 return default_zone_for_pfn(nid, start_pfn, nr_pages); 1084 } 1085 1086 /* 1087 * This function should only be called by memory_block_{online,offline}, 1088 * and {online,offline}_pages. 1089 */ 1090 void adjust_present_page_count(struct page *page, struct memory_group *group, 1091 long nr_pages) 1092 { 1093 struct zone *zone = page_zone(page); 1094 const bool movable = zone_idx(zone) == ZONE_MOVABLE; 1095 1096 /* 1097 * We only support onlining/offlining/adding/removing of complete 1098 * memory blocks; therefore, either all is either early or hotplugged. 1099 */ 1100 if (early_section(__pfn_to_section(page_to_pfn(page)))) 1101 zone->present_early_pages += nr_pages; 1102 zone->present_pages += nr_pages; 1103 zone->zone_pgdat->node_present_pages += nr_pages; 1104 1105 if (group && movable) 1106 group->present_movable_pages += nr_pages; 1107 else if (group && !movable) 1108 group->present_kernel_pages += nr_pages; 1109 } 1110 1111 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, 1112 struct zone *zone, bool mhp_off_inaccessible) 1113 { 1114 unsigned long end_pfn = pfn + nr_pages; 1115 int ret, i; 1116 1117 ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 1118 if (ret) 1119 return ret; 1120 1121 /* 1122 * Memory block is accessible at this stage and hence poison the struct 1123 * pages now. If the memory block is accessible during memory hotplug 1124 * addition phase, then page poisining is already performed in 1125 * sparse_add_section(). 1126 */ 1127 if (mhp_off_inaccessible) 1128 page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); 1129 1130 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); 1131 1132 for (i = 0; i < nr_pages; i++) { 1133 struct page *page = pfn_to_page(pfn + i); 1134 1135 __ClearPageOffline(page); 1136 SetPageVmemmapSelfHosted(page); 1137 } 1138 1139 /* 1140 * It might be that the vmemmap_pages fully span sections. If that is 1141 * the case, mark those sections online here as otherwise they will be 1142 * left offline. 1143 */ 1144 if (nr_pages >= PAGES_PER_SECTION) 1145 online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 1146 1147 return ret; 1148 } 1149 1150 void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) 1151 { 1152 unsigned long end_pfn = pfn + nr_pages; 1153 1154 /* 1155 * It might be that the vmemmap_pages fully span sections. If that is 1156 * the case, mark those sections offline here as otherwise they will be 1157 * left online. 1158 */ 1159 if (nr_pages >= PAGES_PER_SECTION) 1160 offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 1161 1162 /* 1163 * The pages associated with this vmemmap have been offlined, so 1164 * we can reset its state here. 1165 */ 1166 remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); 1167 kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 1168 } 1169 1170 /* 1171 * Must be called with mem_hotplug_lock in write mode. 1172 */ 1173 int online_pages(unsigned long pfn, unsigned long nr_pages, 1174 struct zone *zone, struct memory_group *group) 1175 { 1176 unsigned long flags; 1177 int need_zonelists_rebuild = 0; 1178 const int nid = zone_to_nid(zone); 1179 int ret; 1180 struct memory_notify arg; 1181 1182 /* 1183 * {on,off}lining is constrained to full memory sections (or more 1184 * precisely to memory blocks from the user space POV). 1185 * memmap_on_memory is an exception because it reserves initial part 1186 * of the physical memory space for vmemmaps. That space is pageblock 1187 * aligned. 1188 */ 1189 if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || 1190 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) 1191 return -EINVAL; 1192 1193 1194 /* associate pfn range with the zone */ 1195 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); 1196 1197 arg.start_pfn = pfn; 1198 arg.nr_pages = nr_pages; 1199 node_states_check_changes_online(nr_pages, zone, &arg); 1200 1201 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1202 ret = notifier_to_errno(ret); 1203 if (ret) 1204 goto failed_addition; 1205 1206 /* 1207 * Fixup the number of isolated pageblocks before marking the sections 1208 * onlining, such that undo_isolate_page_range() works correctly. 1209 */ 1210 spin_lock_irqsave(&zone->lock, flags); 1211 zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; 1212 spin_unlock_irqrestore(&zone->lock, flags); 1213 1214 /* 1215 * If this zone is not populated, then it is not in zonelist. 1216 * This means the page allocator ignores this zone. 1217 * So, zonelist must be updated after online. 1218 */ 1219 if (!populated_zone(zone)) { 1220 need_zonelists_rebuild = 1; 1221 setup_zone_pageset(zone); 1222 } 1223 1224 online_pages_range(pfn, nr_pages); 1225 adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); 1226 1227 node_states_set_node(nid, &arg); 1228 if (need_zonelists_rebuild) 1229 build_all_zonelists(NULL); 1230 1231 /* Basic onlining is complete, allow allocation of onlined pages. */ 1232 undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); 1233 1234 /* 1235 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to 1236 * the tail of the freelist when undoing isolation). Shuffle the whole 1237 * zone to make sure the just onlined pages are properly distributed 1238 * across the whole freelist - to create an initial shuffle. 1239 */ 1240 shuffle_zone(zone); 1241 1242 /* reinitialise watermarks and update pcp limits */ 1243 init_per_zone_wmark_min(); 1244 1245 kswapd_run(nid); 1246 kcompactd_run(nid); 1247 1248 writeback_set_ratelimit(); 1249 1250 memory_notify(MEM_ONLINE, &arg); 1251 return 0; 1252 1253 failed_addition: 1254 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 1255 (unsigned long long) pfn << PAGE_SHIFT, 1256 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 1257 memory_notify(MEM_CANCEL_ONLINE, &arg); 1258 remove_pfn_range_from_zone(zone, pfn, nr_pages); 1259 return ret; 1260 } 1261 1262 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1263 static pg_data_t *hotadd_init_pgdat(int nid) 1264 { 1265 struct pglist_data *pgdat; 1266 1267 /* 1268 * NODE_DATA is preallocated (free_area_init) but its internal 1269 * state is not allocated completely. Add missing pieces. 1270 * Completely offline nodes stay around and they just need 1271 * reintialization. 1272 */ 1273 pgdat = NODE_DATA(nid); 1274 1275 /* init node's zones as empty zones, we don't have any present pages.*/ 1276 free_area_init_core_hotplug(pgdat); 1277 1278 /* 1279 * The node we allocated has no zone fallback lists. For avoiding 1280 * to access not-initialized zonelist, build here. 1281 */ 1282 build_all_zonelists(pgdat); 1283 1284 return pgdat; 1285 } 1286 1287 /* 1288 * __try_online_node - online a node if offlined 1289 * @nid: the node ID 1290 * @set_node_online: Whether we want to online the node 1291 * called by cpu_up() to online a node without onlined memory. 1292 * 1293 * Returns: 1294 * 1 -> a new node has been allocated 1295 * 0 -> the node is already online 1296 * -ENOMEM -> the node could not be allocated 1297 */ 1298 static int __try_online_node(int nid, bool set_node_online) 1299 { 1300 pg_data_t *pgdat; 1301 int ret = 1; 1302 1303 if (node_online(nid)) 1304 return 0; 1305 1306 pgdat = hotadd_init_pgdat(nid); 1307 if (!pgdat) { 1308 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1309 ret = -ENOMEM; 1310 goto out; 1311 } 1312 1313 if (set_node_online) { 1314 node_set_online(nid); 1315 ret = register_one_node(nid); 1316 BUG_ON(ret); 1317 } 1318 out: 1319 return ret; 1320 } 1321 1322 /* 1323 * Users of this function always want to online/register the node 1324 */ 1325 int try_online_node(int nid) 1326 { 1327 int ret; 1328 1329 mem_hotplug_begin(); 1330 ret = __try_online_node(nid, true); 1331 mem_hotplug_done(); 1332 return ret; 1333 } 1334 1335 static int check_hotplug_memory_range(u64 start, u64 size) 1336 { 1337 /* memory range must be block size aligned */ 1338 if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || 1339 !IS_ALIGNED(size, memory_block_size_bytes())) { 1340 pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", 1341 memory_block_size_bytes(), start, size); 1342 return -EINVAL; 1343 } 1344 1345 return 0; 1346 } 1347 1348 static int online_memory_block(struct memory_block *mem, void *arg) 1349 { 1350 mem->online_type = mhp_get_default_online_type(); 1351 return device_online(&mem->dev); 1352 } 1353 1354 #ifndef arch_supports_memmap_on_memory 1355 static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) 1356 { 1357 /* 1358 * As default, we want the vmemmap to span a complete PMD such that we 1359 * can map the vmemmap using a single PMD if supported by the 1360 * architecture. 1361 */ 1362 return IS_ALIGNED(vmemmap_size, PMD_SIZE); 1363 } 1364 #endif 1365 1366 bool mhp_supports_memmap_on_memory(void) 1367 { 1368 unsigned long vmemmap_size = memory_block_memmap_size(); 1369 unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 1370 1371 /* 1372 * Besides having arch support and the feature enabled at runtime, we 1373 * need a few more assumptions to hold true: 1374 * 1375 * a) The vmemmap pages span complete PMDs: We don't want vmemmap code 1376 * to populate memory from the altmap for unrelated parts (i.e., 1377 * other memory blocks) 1378 * 1379 * b) The vmemmap pages (and thereby the pages that will be exposed to 1380 * the buddy) have to cover full pageblocks: memory onlining/offlining 1381 * code requires applicable ranges to be page-aligned, for example, to 1382 * set the migratetypes properly. 1383 * 1384 * TODO: Although we have a check here to make sure that vmemmap pages 1385 * fully populate a PMD, it is not the right place to check for 1386 * this. A much better solution involves improving vmemmap code 1387 * to fallback to base pages when trying to populate vmemmap using 1388 * altmap as an alternative source of memory, and we do not exactly 1389 * populate a single PMD. 1390 */ 1391 if (!mhp_memmap_on_memory()) 1392 return false; 1393 1394 /* 1395 * Make sure the vmemmap allocation is fully contained 1396 * so that we always allocate vmemmap memory from altmap area. 1397 */ 1398 if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) 1399 return false; 1400 1401 /* 1402 * start pfn should be pageblock_nr_pages aligned for correctly 1403 * setting migrate types 1404 */ 1405 if (!pageblock_aligned(memmap_pages)) 1406 return false; 1407 1408 if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) 1409 /* No effective hotplugged memory doesn't make sense. */ 1410 return false; 1411 1412 return arch_supports_memmap_on_memory(vmemmap_size); 1413 } 1414 EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); 1415 1416 static void remove_memory_blocks_and_altmaps(u64 start, u64 size) 1417 { 1418 unsigned long memblock_size = memory_block_size_bytes(); 1419 u64 cur_start; 1420 1421 /* 1422 * For memmap_on_memory, the altmaps were added on a per-memblock 1423 * basis; we have to process each individual memory block. 1424 */ 1425 for (cur_start = start; cur_start < start + size; 1426 cur_start += memblock_size) { 1427 struct vmem_altmap *altmap = NULL; 1428 struct memory_block *mem; 1429 1430 mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); 1431 if (WARN_ON_ONCE(!mem)) 1432 continue; 1433 1434 altmap = mem->altmap; 1435 mem->altmap = NULL; 1436 1437 remove_memory_block_devices(cur_start, memblock_size); 1438 1439 arch_remove_memory(cur_start, memblock_size, altmap); 1440 1441 /* Verify that all vmemmap pages have actually been freed. */ 1442 WARN(altmap->alloc, "Altmap not fully unmapped"); 1443 kfree(altmap); 1444 } 1445 } 1446 1447 static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, 1448 u64 start, u64 size, mhp_t mhp_flags) 1449 { 1450 unsigned long memblock_size = memory_block_size_bytes(); 1451 u64 cur_start; 1452 int ret; 1453 1454 for (cur_start = start; cur_start < start + size; 1455 cur_start += memblock_size) { 1456 struct mhp_params params = { .pgprot = 1457 pgprot_mhp(PAGE_KERNEL) }; 1458 struct vmem_altmap mhp_altmap = { 1459 .base_pfn = PHYS_PFN(cur_start), 1460 .end_pfn = PHYS_PFN(cur_start + memblock_size - 1), 1461 }; 1462 1463 mhp_altmap.free = memory_block_memmap_on_memory_pages(); 1464 if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) 1465 mhp_altmap.inaccessible = true; 1466 params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), 1467 GFP_KERNEL); 1468 if (!params.altmap) { 1469 ret = -ENOMEM; 1470 goto out; 1471 } 1472 1473 /* call arch's memory hotadd */ 1474 ret = arch_add_memory(nid, cur_start, memblock_size, ¶ms); 1475 if (ret < 0) { 1476 kfree(params.altmap); 1477 goto out; 1478 } 1479 1480 /* create memory block devices after memory was added */ 1481 ret = create_memory_block_devices(cur_start, memblock_size, 1482 params.altmap, group); 1483 if (ret) { 1484 arch_remove_memory(cur_start, memblock_size, NULL); 1485 kfree(params.altmap); 1486 goto out; 1487 } 1488 } 1489 1490 return 0; 1491 out: 1492 if (ret && cur_start != start) 1493 remove_memory_blocks_and_altmaps(start, cur_start - start); 1494 return ret; 1495 } 1496 1497 /* 1498 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1499 * and online/offline operations (triggered e.g. by sysfs). 1500 * 1501 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG 1502 */ 1503 int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) 1504 { 1505 struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; 1506 enum memblock_flags memblock_flags = MEMBLOCK_NONE; 1507 struct memory_group *group = NULL; 1508 u64 start, size; 1509 bool new_node = false; 1510 int ret; 1511 1512 start = res->start; 1513 size = resource_size(res); 1514 1515 ret = check_hotplug_memory_range(start, size); 1516 if (ret) 1517 return ret; 1518 1519 if (mhp_flags & MHP_NID_IS_MGID) { 1520 group = memory_group_find_by_id(nid); 1521 if (!group) 1522 return -EINVAL; 1523 nid = group->nid; 1524 } 1525 1526 if (!node_possible(nid)) { 1527 WARN(1, "node %d was absent from the node_possible_map\n", nid); 1528 return -EINVAL; 1529 } 1530 1531 mem_hotplug_begin(); 1532 1533 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { 1534 if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) 1535 memblock_flags = MEMBLOCK_DRIVER_MANAGED; 1536 ret = memblock_add_node(start, size, nid, memblock_flags); 1537 if (ret) 1538 goto error_mem_hotplug_end; 1539 } 1540 1541 ret = __try_online_node(nid, false); 1542 if (ret < 0) 1543 goto error; 1544 new_node = ret; 1545 1546 /* 1547 * Self hosted memmap array 1548 */ 1549 if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && 1550 mhp_supports_memmap_on_memory()) { 1551 ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); 1552 if (ret) 1553 goto error; 1554 } else { 1555 ret = arch_add_memory(nid, start, size, ¶ms); 1556 if (ret < 0) 1557 goto error; 1558 1559 /* create memory block devices after memory was added */ 1560 ret = create_memory_block_devices(start, size, NULL, group); 1561 if (ret) { 1562 arch_remove_memory(start, size, params.altmap); 1563 goto error; 1564 } 1565 } 1566 1567 if (new_node) { 1568 /* If sysfs file of new node can't be created, cpu on the node 1569 * can't be hot-added. There is no rollback way now. 1570 * So, check by BUG_ON() to catch it reluctantly.. 1571 * We online node here. We can't roll back from here. 1572 */ 1573 node_set_online(nid); 1574 ret = __register_one_node(nid); 1575 BUG_ON(ret); 1576 } 1577 1578 register_memory_blocks_under_node(nid, PFN_DOWN(start), 1579 PFN_UP(start + size - 1), 1580 MEMINIT_HOTPLUG); 1581 1582 /* create new memmap entry */ 1583 if (!strcmp(res->name, "System RAM")) 1584 firmware_map_add_hotplug(start, start + size, "System RAM"); 1585 1586 /* device_online() will take the lock when calling online_pages() */ 1587 mem_hotplug_done(); 1588 1589 /* 1590 * In case we're allowed to merge the resource, flag it and trigger 1591 * merging now that adding succeeded. 1592 */ 1593 if (mhp_flags & MHP_MERGE_RESOURCE) 1594 merge_system_ram_resource(res); 1595 1596 /* online pages if requested */ 1597 if (mhp_get_default_online_type() != MMOP_OFFLINE) 1598 walk_memory_blocks(start, size, NULL, online_memory_block); 1599 1600 return ret; 1601 error: 1602 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 1603 memblock_remove(start, size); 1604 error_mem_hotplug_end: 1605 mem_hotplug_done(); 1606 return ret; 1607 } 1608 1609 /* requires device_hotplug_lock, see add_memory_resource() */ 1610 int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 1611 { 1612 struct resource *res; 1613 int ret; 1614 1615 res = register_memory_resource(start, size, "System RAM"); 1616 if (IS_ERR(res)) 1617 return PTR_ERR(res); 1618 1619 ret = add_memory_resource(nid, res, mhp_flags); 1620 if (ret < 0) 1621 release_memory_resource(res); 1622 return ret; 1623 } 1624 1625 int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 1626 { 1627 int rc; 1628 1629 lock_device_hotplug(); 1630 rc = __add_memory(nid, start, size, mhp_flags); 1631 unlock_device_hotplug(); 1632 1633 return rc; 1634 } 1635 EXPORT_SYMBOL_GPL(add_memory); 1636 1637 /* 1638 * Add special, driver-managed memory to the system as system RAM. Such 1639 * memory is not exposed via the raw firmware-provided memmap as system 1640 * RAM, instead, it is detected and added by a driver - during cold boot, 1641 * after a reboot, and after kexec. 1642 * 1643 * Reasons why this memory should not be used for the initial memmap of a 1644 * kexec kernel or for placing kexec images: 1645 * - The booting kernel is in charge of determining how this memory will be 1646 * used (e.g., use persistent memory as system RAM) 1647 * - Coordination with a hypervisor is required before this memory 1648 * can be used (e.g., inaccessible parts). 1649 * 1650 * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided 1651 * memory map") are created. Also, the created memory resource is flagged 1652 * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case 1653 * this memory as well (esp., not place kexec images onto it). 1654 * 1655 * The resource_name (visible via /proc/iomem) has to have the format 1656 * "System RAM ($DRIVER)". 1657 */ 1658 int add_memory_driver_managed(int nid, u64 start, u64 size, 1659 const char *resource_name, mhp_t mhp_flags) 1660 { 1661 struct resource *res; 1662 int rc; 1663 1664 if (!resource_name || 1665 strstr(resource_name, "System RAM (") != resource_name || 1666 resource_name[strlen(resource_name) - 1] != ')') 1667 return -EINVAL; 1668 1669 lock_device_hotplug(); 1670 1671 res = register_memory_resource(start, size, resource_name); 1672 if (IS_ERR(res)) { 1673 rc = PTR_ERR(res); 1674 goto out_unlock; 1675 } 1676 1677 rc = add_memory_resource(nid, res, mhp_flags); 1678 if (rc < 0) 1679 release_memory_resource(res); 1680 1681 out_unlock: 1682 unlock_device_hotplug(); 1683 return rc; 1684 } 1685 EXPORT_SYMBOL_GPL(add_memory_driver_managed); 1686 1687 /* 1688 * Platforms should define arch_get_mappable_range() that provides 1689 * maximum possible addressable physical memory range for which the 1690 * linear mapping could be created. The platform returned address 1691 * range must adhere to these following semantics. 1692 * 1693 * - range.start <= range.end 1694 * - Range includes both end points [range.start..range.end] 1695 * 1696 * There is also a fallback definition provided here, allowing the 1697 * entire possible physical address range in case any platform does 1698 * not define arch_get_mappable_range(). 1699 */ 1700 struct range __weak arch_get_mappable_range(void) 1701 { 1702 struct range mhp_range = { 1703 .start = 0UL, 1704 .end = -1ULL, 1705 }; 1706 return mhp_range; 1707 } 1708 1709 struct range mhp_get_pluggable_range(bool need_mapping) 1710 { 1711 const u64 max_phys = DIRECT_MAP_PHYSMEM_END; 1712 struct range mhp_range; 1713 1714 if (need_mapping) { 1715 mhp_range = arch_get_mappable_range(); 1716 if (mhp_range.start > max_phys) { 1717 mhp_range.start = 0; 1718 mhp_range.end = 0; 1719 } 1720 mhp_range.end = min_t(u64, mhp_range.end, max_phys); 1721 } else { 1722 mhp_range.start = 0; 1723 mhp_range.end = max_phys; 1724 } 1725 return mhp_range; 1726 } 1727 EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); 1728 1729 bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) 1730 { 1731 struct range mhp_range = mhp_get_pluggable_range(need_mapping); 1732 u64 end = start + size; 1733 1734 if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) 1735 return true; 1736 1737 pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", 1738 start, end, mhp_range.start, mhp_range.end); 1739 return false; 1740 } 1741 1742 #ifdef CONFIG_MEMORY_HOTREMOVE 1743 /* 1744 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 1745 * non-lru movable pages and hugepages). Will skip over most unmovable 1746 * pages (esp., pages that can be skipped when offlining), but bail out on 1747 * definitely unmovable pages. 1748 * 1749 * Returns: 1750 * 0 in case a movable page is found and movable_pfn was updated. 1751 * -ENOENT in case no movable page was found. 1752 * -EBUSY in case a definitely unmovable page was found. 1753 */ 1754 static int scan_movable_pages(unsigned long start, unsigned long end, 1755 unsigned long *movable_pfn) 1756 { 1757 unsigned long pfn; 1758 1759 for (pfn = start; pfn < end; pfn++) { 1760 struct page *page; 1761 struct folio *folio; 1762 1763 if (!pfn_valid(pfn)) 1764 continue; 1765 page = pfn_to_page(pfn); 1766 if (PageLRU(page)) 1767 goto found; 1768 if (__PageMovable(page)) 1769 goto found; 1770 1771 /* 1772 * PageOffline() pages that are not marked __PageMovable() and 1773 * have a reference count > 0 (after MEM_GOING_OFFLINE) are 1774 * definitely unmovable. If their reference count would be 0, 1775 * they could at least be skipped when offlining memory. 1776 */ 1777 if (PageOffline(page) && page_count(page)) 1778 return -EBUSY; 1779 1780 if (!PageHuge(page)) 1781 continue; 1782 folio = page_folio(page); 1783 /* 1784 * This test is racy as we hold no reference or lock. The 1785 * hugetlb page could have been free'ed and head is no longer 1786 * a hugetlb page before the following check. In such unlikely 1787 * cases false positives and negatives are possible. Calling 1788 * code must deal with these scenarios. 1789 */ 1790 if (folio_test_hugetlb_migratable(folio)) 1791 goto found; 1792 pfn |= folio_nr_pages(folio) - 1; 1793 } 1794 return -ENOENT; 1795 found: 1796 *movable_pfn = pfn; 1797 return 0; 1798 } 1799 1800 static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1801 { 1802 struct folio *folio; 1803 unsigned long pfn; 1804 LIST_HEAD(source); 1805 static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, 1806 DEFAULT_RATELIMIT_BURST); 1807 1808 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1809 struct page *page; 1810 1811 if (!pfn_valid(pfn)) 1812 continue; 1813 page = pfn_to_page(pfn); 1814 folio = page_folio(page); 1815 1816 /* 1817 * No reference or lock is held on the folio, so it might 1818 * be modified concurrently (e.g. split). As such, 1819 * folio_nr_pages() may read garbage. This is fine as the outer 1820 * loop will revisit the split folio later. 1821 */ 1822 if (folio_test_large(folio)) 1823 pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; 1824 1825 if (!folio_try_get(folio)) 1826 continue; 1827 1828 if (unlikely(page_folio(page) != folio)) 1829 goto put_folio; 1830 1831 if (folio_test_hwpoison(folio) || 1832 (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { 1833 if (WARN_ON(folio_test_lru(folio))) 1834 folio_isolate_lru(folio); 1835 if (folio_mapped(folio)) { 1836 folio_lock(folio); 1837 unmap_poisoned_folio(folio, pfn, false); 1838 folio_unlock(folio); 1839 } 1840 1841 goto put_folio; 1842 } 1843 1844 if (!isolate_folio_to_list(folio, &source)) { 1845 if (__ratelimit(&migrate_rs)) { 1846 pr_warn("failed to isolate pfn %lx\n", 1847 page_to_pfn(page)); 1848 dump_page(page, "isolation failed"); 1849 } 1850 } 1851 put_folio: 1852 folio_put(folio); 1853 } 1854 if (!list_empty(&source)) { 1855 nodemask_t nmask = node_states[N_MEMORY]; 1856 struct migration_target_control mtc = { 1857 .nmask = &nmask, 1858 .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1859 .reason = MR_MEMORY_HOTPLUG, 1860 }; 1861 int ret; 1862 1863 /* 1864 * We have checked that migration range is on a single zone so 1865 * we can use the nid of the first page to all the others. 1866 */ 1867 mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru)); 1868 1869 /* 1870 * try to allocate from a different node but reuse this node 1871 * if there are no other online nodes to be used (e.g. we are 1872 * offlining a part of the only existing node) 1873 */ 1874 node_clear(mtc.nid, nmask); 1875 if (nodes_empty(nmask)) 1876 node_set(mtc.nid, nmask); 1877 ret = migrate_pages(&source, alloc_migration_target, NULL, 1878 (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); 1879 if (ret) { 1880 list_for_each_entry(folio, &source, lru) { 1881 if (__ratelimit(&migrate_rs)) { 1882 pr_warn("migrating pfn %lx failed ret:%d\n", 1883 folio_pfn(folio), ret); 1884 dump_page(&folio->page, 1885 "migration failure"); 1886 } 1887 } 1888 putback_movable_pages(&source); 1889 } 1890 } 1891 } 1892 1893 static int __init cmdline_parse_movable_node(char *p) 1894 { 1895 movable_node_enabled = true; 1896 return 0; 1897 } 1898 early_param("movable_node", cmdline_parse_movable_node); 1899 1900 /* check which state of node_states will be changed when offline memory */ 1901 static void node_states_check_changes_offline(unsigned long nr_pages, 1902 struct zone *zone, struct memory_notify *arg) 1903 { 1904 struct pglist_data *pgdat = zone->zone_pgdat; 1905 unsigned long present_pages = 0; 1906 enum zone_type zt; 1907 1908 arg->status_change_nid = NUMA_NO_NODE; 1909 arg->status_change_nid_normal = NUMA_NO_NODE; 1910 1911 /* 1912 * Check whether node_states[N_NORMAL_MEMORY] will be changed. 1913 * If the memory to be offline is within the range 1914 * [0..ZONE_NORMAL], and it is the last present memory there, 1915 * the zones in that range will become empty after the offlining, 1916 * thus we can determine that we need to clear the node from 1917 * node_states[N_NORMAL_MEMORY]. 1918 */ 1919 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1920 present_pages += pgdat->node_zones[zt].present_pages; 1921 if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) 1922 arg->status_change_nid_normal = zone_to_nid(zone); 1923 1924 /* 1925 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM 1926 * does not apply as we don't support 32bit. 1927 * Here we count the possible pages from ZONE_MOVABLE. 1928 * If after having accounted all the pages, we see that the nr_pages 1929 * to be offlined is over or equal to the accounted pages, 1930 * we know that the node will become empty, and so, we can clear 1931 * it for N_MEMORY as well. 1932 */ 1933 present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; 1934 1935 if (nr_pages >= present_pages) 1936 arg->status_change_nid = zone_to_nid(zone); 1937 } 1938 1939 static void node_states_clear_node(int node, struct memory_notify *arg) 1940 { 1941 if (arg->status_change_nid_normal >= 0) 1942 node_clear_state(node, N_NORMAL_MEMORY); 1943 1944 if (arg->status_change_nid >= 0) 1945 node_clear_state(node, N_MEMORY); 1946 } 1947 1948 static int count_system_ram_pages_cb(unsigned long start_pfn, 1949 unsigned long nr_pages, void *data) 1950 { 1951 unsigned long *nr_system_ram_pages = data; 1952 1953 *nr_system_ram_pages += nr_pages; 1954 return 0; 1955 } 1956 1957 /* 1958 * Must be called with mem_hotplug_lock in write mode. 1959 */ 1960 int offline_pages(unsigned long start_pfn, unsigned long nr_pages, 1961 struct zone *zone, struct memory_group *group) 1962 { 1963 const unsigned long end_pfn = start_pfn + nr_pages; 1964 unsigned long pfn, managed_pages, system_ram_pages = 0; 1965 const int node = zone_to_nid(zone); 1966 unsigned long flags; 1967 struct memory_notify arg; 1968 char *reason; 1969 int ret; 1970 1971 /* 1972 * {on,off}lining is constrained to full memory sections (or more 1973 * precisely to memory blocks from the user space POV). 1974 * memmap_on_memory is an exception because it reserves initial part 1975 * of the physical memory space for vmemmaps. That space is pageblock 1976 * aligned. 1977 */ 1978 if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || 1979 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) 1980 return -EINVAL; 1981 1982 /* 1983 * Don't allow to offline memory blocks that contain holes. 1984 * Consequently, memory blocks with holes can never get onlined 1985 * via the hotplug path - online_pages() - as hotplugged memory has 1986 * no holes. This way, we don't have to worry about memory holes, 1987 * don't need pfn_valid() checks, and can avoid using 1988 * walk_system_ram_range() later. 1989 */ 1990 walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, 1991 count_system_ram_pages_cb); 1992 if (system_ram_pages != nr_pages) { 1993 ret = -EINVAL; 1994 reason = "memory holes"; 1995 goto failed_removal; 1996 } 1997 1998 /* 1999 * We only support offlining of memory blocks managed by a single zone, 2000 * checked by calling code. This is just a sanity check that we might 2001 * want to remove in the future. 2002 */ 2003 if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || 2004 page_zone(pfn_to_page(end_pfn - 1)) != zone)) { 2005 ret = -EINVAL; 2006 reason = "multizone range"; 2007 goto failed_removal; 2008 } 2009 2010 /* 2011 * Disable pcplists so that page isolation cannot race with freeing 2012 * in a way that pages from isolated pageblock are left on pcplists. 2013 */ 2014 zone_pcp_disable(zone); 2015 lru_cache_disable(); 2016 2017 /* set above range as isolated */ 2018 ret = start_isolate_page_range(start_pfn, end_pfn, 2019 MIGRATE_MOVABLE, 2020 MEMORY_OFFLINE | REPORT_FAILURE); 2021 if (ret) { 2022 reason = "failure to isolate range"; 2023 goto failed_removal_pcplists_disabled; 2024 } 2025 2026 arg.start_pfn = start_pfn; 2027 arg.nr_pages = nr_pages; 2028 node_states_check_changes_offline(nr_pages, zone, &arg); 2029 2030 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 2031 ret = notifier_to_errno(ret); 2032 if (ret) { 2033 reason = "notifier failure"; 2034 goto failed_removal_isolated; 2035 } 2036 2037 do { 2038 pfn = start_pfn; 2039 do { 2040 /* 2041 * Historically we always checked for any signal and 2042 * can't limit it to fatal signals without eventually 2043 * breaking user space. 2044 */ 2045 if (signal_pending(current)) { 2046 ret = -EINTR; 2047 reason = "signal backoff"; 2048 goto failed_removal_isolated; 2049 } 2050 2051 cond_resched(); 2052 2053 ret = scan_movable_pages(pfn, end_pfn, &pfn); 2054 if (!ret) { 2055 /* 2056 * TODO: fatal migration failures should bail 2057 * out 2058 */ 2059 do_migrate_range(pfn, end_pfn); 2060 } 2061 } while (!ret); 2062 2063 if (ret != -ENOENT) { 2064 reason = "unmovable page"; 2065 goto failed_removal_isolated; 2066 } 2067 2068 /* 2069 * Dissolve free hugetlb folios in the memory block before doing 2070 * offlining actually in order to make hugetlbfs's object 2071 * counting consistent. 2072 */ 2073 ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); 2074 if (ret) { 2075 reason = "failure to dissolve huge pages"; 2076 goto failed_removal_isolated; 2077 } 2078 2079 ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); 2080 2081 } while (ret); 2082 2083 /* Mark all sections offline and remove free pages from the buddy. */ 2084 managed_pages = __offline_isolated_pages(start_pfn, end_pfn); 2085 pr_debug("Offlined Pages %ld\n", nr_pages); 2086 2087 /* 2088 * The memory sections are marked offline, and the pageblock flags 2089 * effectively stale; nobody should be touching them. Fixup the number 2090 * of isolated pageblocks, memory onlining will properly revert this. 2091 */ 2092 spin_lock_irqsave(&zone->lock, flags); 2093 zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; 2094 spin_unlock_irqrestore(&zone->lock, flags); 2095 2096 lru_cache_enable(); 2097 zone_pcp_enable(zone); 2098 2099 /* removal success */ 2100 adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages); 2101 adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); 2102 2103 /* reinitialise watermarks and update pcp limits */ 2104 init_per_zone_wmark_min(); 2105 2106 /* 2107 * Make sure to mark the node as memory-less before rebuilding the zone 2108 * list. Otherwise this node would still appear in the fallback lists. 2109 */ 2110 node_states_clear_node(node, &arg); 2111 if (!populated_zone(zone)) { 2112 zone_pcp_reset(zone); 2113 build_all_zonelists(NULL); 2114 } 2115 2116 if (arg.status_change_nid >= 0) { 2117 kcompactd_stop(node); 2118 kswapd_stop(node); 2119 } 2120 2121 writeback_set_ratelimit(); 2122 2123 memory_notify(MEM_OFFLINE, &arg); 2124 remove_pfn_range_from_zone(zone, start_pfn, nr_pages); 2125 return 0; 2126 2127 failed_removal_isolated: 2128 /* pushback to free area */ 2129 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 2130 memory_notify(MEM_CANCEL_OFFLINE, &arg); 2131 failed_removal_pcplists_disabled: 2132 lru_cache_enable(); 2133 zone_pcp_enable(zone); 2134 failed_removal: 2135 pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", 2136 (unsigned long long) start_pfn << PAGE_SHIFT, 2137 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, 2138 reason); 2139 return ret; 2140 } 2141 2142 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 2143 { 2144 int *nid = arg; 2145 2146 *nid = mem->nid; 2147 if (unlikely(mem->state != MEM_OFFLINE)) { 2148 phys_addr_t beginpa, endpa; 2149 2150 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 2151 endpa = beginpa + memory_block_size_bytes() - 1; 2152 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 2153 &beginpa, &endpa); 2154 2155 return -EBUSY; 2156 } 2157 return 0; 2158 } 2159 2160 static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg) 2161 { 2162 u64 *num_altmaps = (u64 *)arg; 2163 2164 if (mem->altmap) 2165 *num_altmaps += 1; 2166 2167 return 0; 2168 } 2169 2170 static int check_cpu_on_node(int nid) 2171 { 2172 int cpu; 2173 2174 for_each_present_cpu(cpu) { 2175 if (cpu_to_node(cpu) == nid) 2176 /* 2177 * the cpu on this node isn't removed, and we can't 2178 * offline this node. 2179 */ 2180 return -EBUSY; 2181 } 2182 2183 return 0; 2184 } 2185 2186 static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) 2187 { 2188 int nid = *(int *)arg; 2189 2190 /* 2191 * If a memory block belongs to multiple nodes, the stored nid is not 2192 * reliable. However, such blocks are always online (e.g., cannot get 2193 * offlined) and, therefore, are still spanned by the node. 2194 */ 2195 return mem->nid == nid ? -EEXIST : 0; 2196 } 2197 2198 /** 2199 * try_offline_node 2200 * @nid: the node ID 2201 * 2202 * Offline a node if all memory sections and cpus of the node are removed. 2203 * 2204 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2205 * and online/offline operations before this call. 2206 */ 2207 void try_offline_node(int nid) 2208 { 2209 int rc; 2210 2211 /* 2212 * If the node still spans pages (especially ZONE_DEVICE), don't 2213 * offline it. A node spans memory after move_pfn_range_to_zone(), 2214 * e.g., after the memory block was onlined. 2215 */ 2216 if (node_spanned_pages(nid)) 2217 return; 2218 2219 /* 2220 * Especially offline memory blocks might not be spanned by the 2221 * node. They will get spanned by the node once they get onlined. 2222 * However, they link to the node in sysfs and can get onlined later. 2223 */ 2224 rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); 2225 if (rc) 2226 return; 2227 2228 if (check_cpu_on_node(nid)) 2229 return; 2230 2231 /* 2232 * all memory/cpu of this node are removed, we can offline this 2233 * node now. 2234 */ 2235 node_set_offline(nid); 2236 unregister_one_node(nid); 2237 } 2238 EXPORT_SYMBOL(try_offline_node); 2239 2240 static int memory_blocks_have_altmaps(u64 start, u64 size) 2241 { 2242 u64 num_memblocks = size / memory_block_size_bytes(); 2243 u64 num_altmaps = 0; 2244 2245 if (!mhp_memmap_on_memory()) 2246 return 0; 2247 2248 walk_memory_blocks(start, size, &num_altmaps, 2249 count_memory_range_altmaps_cb); 2250 2251 if (num_altmaps == 0) 2252 return 0; 2253 2254 if (WARN_ON_ONCE(num_memblocks != num_altmaps)) 2255 return -EINVAL; 2256 2257 return 1; 2258 } 2259 2260 static int try_remove_memory(u64 start, u64 size) 2261 { 2262 int rc, nid = NUMA_NO_NODE; 2263 2264 BUG_ON(check_hotplug_memory_range(start, size)); 2265 2266 /* 2267 * All memory blocks must be offlined before removing memory. Check 2268 * whether all memory blocks in question are offline and return error 2269 * if this is not the case. 2270 * 2271 * While at it, determine the nid. Note that if we'd have mixed nodes, 2272 * we'd only try to offline the last determined one -- which is good 2273 * enough for the cases we care about. 2274 */ 2275 rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); 2276 if (rc) 2277 return rc; 2278 2279 /* remove memmap entry */ 2280 firmware_map_remove(start, start + size, "System RAM"); 2281 2282 mem_hotplug_begin(); 2283 2284 rc = memory_blocks_have_altmaps(start, size); 2285 if (rc < 0) { 2286 mem_hotplug_done(); 2287 return rc; 2288 } else if (!rc) { 2289 /* 2290 * Memory block device removal under the device_hotplug_lock is 2291 * a barrier against racing online attempts. 2292 * No altmaps present, do the removal directly 2293 */ 2294 remove_memory_block_devices(start, size); 2295 arch_remove_memory(start, size, NULL); 2296 } else { 2297 /* all memblocks in the range have altmaps */ 2298 remove_memory_blocks_and_altmaps(start, size); 2299 } 2300 2301 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 2302 memblock_remove(start, size); 2303 2304 release_mem_region_adjustable(start, size); 2305 2306 if (nid != NUMA_NO_NODE) 2307 try_offline_node(nid); 2308 2309 mem_hotplug_done(); 2310 return 0; 2311 } 2312 2313 /** 2314 * __remove_memory - Remove memory if every memory block is offline 2315 * @start: physical address of the region to remove 2316 * @size: size of the region to remove 2317 * 2318 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2319 * and online/offline operations before this call, as required by 2320 * try_offline_node(). 2321 */ 2322 void __remove_memory(u64 start, u64 size) 2323 { 2324 2325 /* 2326 * trigger BUG() if some memory is not offlined prior to calling this 2327 * function 2328 */ 2329 if (try_remove_memory(start, size)) 2330 BUG(); 2331 } 2332 2333 /* 2334 * Remove memory if every memory block is offline, otherwise return -EBUSY is 2335 * some memory is not offline 2336 */ 2337 int remove_memory(u64 start, u64 size) 2338 { 2339 int rc; 2340 2341 lock_device_hotplug(); 2342 rc = try_remove_memory(start, size); 2343 unlock_device_hotplug(); 2344 2345 return rc; 2346 } 2347 EXPORT_SYMBOL_GPL(remove_memory); 2348 2349 static int try_offline_memory_block(struct memory_block *mem, void *arg) 2350 { 2351 uint8_t online_type = MMOP_ONLINE_KERNEL; 2352 uint8_t **online_types = arg; 2353 struct page *page; 2354 int rc; 2355 2356 /* 2357 * Sense the online_type via the zone of the memory block. Offlining 2358 * with multiple zones within one memory block will be rejected 2359 * by offlining code ... so we don't care about that. 2360 */ 2361 page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); 2362 if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) 2363 online_type = MMOP_ONLINE_MOVABLE; 2364 2365 rc = device_offline(&mem->dev); 2366 /* 2367 * Default is MMOP_OFFLINE - change it only if offlining succeeded, 2368 * so try_reonline_memory_block() can do the right thing. 2369 */ 2370 if (!rc) 2371 **online_types = online_type; 2372 2373 (*online_types)++; 2374 /* Ignore if already offline. */ 2375 return rc < 0 ? rc : 0; 2376 } 2377 2378 static int try_reonline_memory_block(struct memory_block *mem, void *arg) 2379 { 2380 uint8_t **online_types = arg; 2381 int rc; 2382 2383 if (**online_types != MMOP_OFFLINE) { 2384 mem->online_type = **online_types; 2385 rc = device_online(&mem->dev); 2386 if (rc < 0) 2387 pr_warn("%s: Failed to re-online memory: %d", 2388 __func__, rc); 2389 } 2390 2391 /* Continue processing all remaining memory blocks. */ 2392 (*online_types)++; 2393 return 0; 2394 } 2395 2396 /* 2397 * Try to offline and remove memory. Might take a long time to finish in case 2398 * memory is still in use. Primarily useful for memory devices that logically 2399 * unplugged all memory (so it's no longer in use) and want to offline + remove 2400 * that memory. 2401 */ 2402 int offline_and_remove_memory(u64 start, u64 size) 2403 { 2404 const unsigned long mb_count = size / memory_block_size_bytes(); 2405 uint8_t *online_types, *tmp; 2406 int rc; 2407 2408 if (!IS_ALIGNED(start, memory_block_size_bytes()) || 2409 !IS_ALIGNED(size, memory_block_size_bytes()) || !size) 2410 return -EINVAL; 2411 2412 /* 2413 * We'll remember the old online type of each memory block, so we can 2414 * try to revert whatever we did when offlining one memory block fails 2415 * after offlining some others succeeded. 2416 */ 2417 online_types = kmalloc_array(mb_count, sizeof(*online_types), 2418 GFP_KERNEL); 2419 if (!online_types) 2420 return -ENOMEM; 2421 /* 2422 * Initialize all states to MMOP_OFFLINE, so when we abort processing in 2423 * try_offline_memory_block(), we'll skip all unprocessed blocks in 2424 * try_reonline_memory_block(). 2425 */ 2426 memset(online_types, MMOP_OFFLINE, mb_count); 2427 2428 lock_device_hotplug(); 2429 2430 tmp = online_types; 2431 rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); 2432 2433 /* 2434 * In case we succeeded to offline all memory, remove it. 2435 * This cannot fail as it cannot get onlined in the meantime. 2436 */ 2437 if (!rc) { 2438 rc = try_remove_memory(start, size); 2439 if (rc) 2440 pr_err("%s: Failed to remove memory: %d", __func__, rc); 2441 } 2442 2443 /* 2444 * Rollback what we did. While memory onlining might theoretically fail 2445 * (nacked by a notifier), it barely ever happens. 2446 */ 2447 if (rc) { 2448 tmp = online_types; 2449 walk_memory_blocks(start, size, &tmp, 2450 try_reonline_memory_block); 2451 } 2452 unlock_device_hotplug(); 2453 2454 kfree(online_types); 2455 return rc; 2456 } 2457 EXPORT_SYMBOL_GPL(offline_and_remove_memory); 2458 #endif /* CONFIG_MEMORY_HOTREMOVE */ 2459