1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * linux/mm/memory_hotplug.c 4 * 5 * Copyright (C) 6 */ 7 8 #include <linux/stddef.h> 9 #include <linux/mm.h> 10 #include <linux/sched/signal.h> 11 #include <linux/swap.h> 12 #include <linux/interrupt.h> 13 #include <linux/pagemap.h> 14 #include <linux/compiler.h> 15 #include <linux/export.h> 16 #include <linux/writeback.h> 17 #include <linux/slab.h> 18 #include <linux/sysctl.h> 19 #include <linux/cpu.h> 20 #include <linux/memory.h> 21 #include <linux/memremap.h> 22 #include <linux/memory_hotplug.h> 23 #include <linux/vmalloc.h> 24 #include <linux/ioport.h> 25 #include <linux/delay.h> 26 #include <linux/migrate.h> 27 #include <linux/page-isolation.h> 28 #include <linux/pfn.h> 29 #include <linux/suspend.h> 30 #include <linux/mm_inline.h> 31 #include <linux/firmware-map.h> 32 #include <linux/stop_machine.h> 33 #include <linux/hugetlb.h> 34 #include <linux/memblock.h> 35 #include <linux/compaction.h> 36 #include <linux/rmap.h> 37 #include <linux/module.h> 38 39 #include <asm/tlbflush.h> 40 41 #include "internal.h" 42 #include "shuffle.h" 43 44 enum { 45 MEMMAP_ON_MEMORY_DISABLE = 0, 46 MEMMAP_ON_MEMORY_ENABLE, 47 MEMMAP_ON_MEMORY_FORCE, 48 }; 49 50 static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; 51 52 static inline unsigned long memory_block_memmap_size(void) 53 { 54 return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); 55 } 56 57 static inline unsigned long memory_block_memmap_on_memory_pages(void) 58 { 59 unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); 60 61 /* 62 * In "forced" memmap_on_memory mode, we add extra pages to align the 63 * vmemmap size to cover full pageblocks. That way, we can add memory 64 * even if the vmemmap size is not properly aligned, however, we might waste 65 * memory. 66 */ 67 if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) 68 return pageblock_align(nr_pages); 69 return nr_pages; 70 } 71 72 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY 73 /* 74 * memory_hotplug.memmap_on_memory parameter 75 */ 76 static int set_memmap_mode(const char *val, const struct kernel_param *kp) 77 { 78 int ret, mode; 79 bool enabled; 80 81 if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { 82 mode = MEMMAP_ON_MEMORY_FORCE; 83 } else { 84 ret = kstrtobool(val, &enabled); 85 if (ret < 0) 86 return ret; 87 if (enabled) 88 mode = MEMMAP_ON_MEMORY_ENABLE; 89 else 90 mode = MEMMAP_ON_MEMORY_DISABLE; 91 } 92 *((int *)kp->arg) = mode; 93 if (mode == MEMMAP_ON_MEMORY_FORCE) { 94 unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 95 96 pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", 97 memmap_pages - PFN_UP(memory_block_memmap_size())); 98 } 99 return 0; 100 } 101 102 static int get_memmap_mode(char *buffer, const struct kernel_param *kp) 103 { 104 int mode = *((int *)kp->arg); 105 106 if (mode == MEMMAP_ON_MEMORY_FORCE) 107 return sprintf(buffer, "force\n"); 108 return sprintf(buffer, "%c\n", mode ? 'Y' : 'N'); 109 } 110 111 static const struct kernel_param_ops memmap_mode_ops = { 112 .set = set_memmap_mode, 113 .get = get_memmap_mode, 114 }; 115 module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); 116 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" 117 "With value \"force\" it could result in memory wastage due " 118 "to memmap size limitations (Y/N/force)"); 119 120 static inline bool mhp_memmap_on_memory(void) 121 { 122 return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; 123 } 124 #else 125 static inline bool mhp_memmap_on_memory(void) 126 { 127 return false; 128 } 129 #endif 130 131 enum { 132 ONLINE_POLICY_CONTIG_ZONES = 0, 133 ONLINE_POLICY_AUTO_MOVABLE, 134 }; 135 136 static const char * const online_policy_to_str[] = { 137 [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", 138 [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", 139 }; 140 141 static int set_online_policy(const char *val, const struct kernel_param *kp) 142 { 143 int ret = sysfs_match_string(online_policy_to_str, val); 144 145 if (ret < 0) 146 return ret; 147 *((int *)kp->arg) = ret; 148 return 0; 149 } 150 151 static int get_online_policy(char *buffer, const struct kernel_param *kp) 152 { 153 return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); 154 } 155 156 /* 157 * memory_hotplug.online_policy: configure online behavior when onlining without 158 * specifying a zone (MMOP_ONLINE) 159 * 160 * "contig-zones": keep zone contiguous 161 * "auto-movable": online memory to ZONE_MOVABLE if the configuration 162 * (auto_movable_ratio, auto_movable_numa_aware) allows for it 163 */ 164 static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; 165 static const struct kernel_param_ops online_policy_ops = { 166 .set = set_online_policy, 167 .get = get_online_policy, 168 }; 169 module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); 170 MODULE_PARM_DESC(online_policy, 171 "Set the online policy (\"contig-zones\", \"auto-movable\") " 172 "Default: \"contig-zones\""); 173 174 /* 175 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio 176 * 177 * The ratio represent an upper limit and the kernel might decide to not 178 * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory 179 * doesn't allow for more MOVABLE memory. 180 */ 181 static unsigned int auto_movable_ratio __read_mostly = 301; 182 module_param(auto_movable_ratio, uint, 0644); 183 MODULE_PARM_DESC(auto_movable_ratio, 184 "Set the maximum ratio of MOVABLE:KERNEL memory in the system " 185 "in percent for \"auto-movable\" online policy. Default: 301"); 186 187 /* 188 * memory_hotplug.auto_movable_numa_aware: consider numa node stats 189 */ 190 #ifdef CONFIG_NUMA 191 static bool auto_movable_numa_aware __read_mostly = true; 192 module_param(auto_movable_numa_aware, bool, 0644); 193 MODULE_PARM_DESC(auto_movable_numa_aware, 194 "Consider numa node stats in addition to global stats in " 195 "\"auto-movable\" online policy. Default: true"); 196 #endif /* CONFIG_NUMA */ 197 198 /* 199 * online_page_callback contains pointer to current page onlining function. 200 * Initially it is generic_online_page(). If it is required it could be 201 * changed by calling set_online_page_callback() for callback registration 202 * and restore_online_page_callback() for generic callback restore. 203 */ 204 205 static online_page_callback_t online_page_callback = generic_online_page; 206 static DEFINE_MUTEX(online_page_callback_lock); 207 208 DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); 209 210 void get_online_mems(void) 211 { 212 percpu_down_read(&mem_hotplug_lock); 213 } 214 215 void put_online_mems(void) 216 { 217 percpu_up_read(&mem_hotplug_lock); 218 } 219 220 bool movable_node_enabled = false; 221 222 static int mhp_default_online_type = -1; 223 int mhp_get_default_online_type(void) 224 { 225 if (mhp_default_online_type >= 0) 226 return mhp_default_online_type; 227 228 if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) 229 mhp_default_online_type = MMOP_OFFLINE; 230 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) 231 mhp_default_online_type = MMOP_ONLINE; 232 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) 233 mhp_default_online_type = MMOP_ONLINE_KERNEL; 234 else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) 235 mhp_default_online_type = MMOP_ONLINE_MOVABLE; 236 else 237 mhp_default_online_type = MMOP_OFFLINE; 238 239 return mhp_default_online_type; 240 } 241 242 void mhp_set_default_online_type(int online_type) 243 { 244 mhp_default_online_type = online_type; 245 } 246 247 static int __init setup_memhp_default_state(char *str) 248 { 249 const int online_type = mhp_online_type_from_str(str); 250 251 if (online_type >= 0) 252 mhp_default_online_type = online_type; 253 254 return 1; 255 } 256 __setup("memhp_default_state=", setup_memhp_default_state); 257 258 void mem_hotplug_begin(void) 259 { 260 cpus_read_lock(); 261 percpu_down_write(&mem_hotplug_lock); 262 } 263 264 void mem_hotplug_done(void) 265 { 266 percpu_up_write(&mem_hotplug_lock); 267 cpus_read_unlock(); 268 } 269 270 u64 max_mem_size = U64_MAX; 271 272 /* add this memory to iomem resource */ 273 static struct resource *register_memory_resource(u64 start, u64 size, 274 const char *resource_name) 275 { 276 struct resource *res; 277 unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 278 279 if (strcmp(resource_name, "System RAM")) 280 flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; 281 282 if (!mhp_range_allowed(start, size, true)) 283 return ERR_PTR(-E2BIG); 284 285 /* 286 * Make sure value parsed from 'mem=' only restricts memory adding 287 * while booting, so that memory hotplug won't be impacted. Please 288 * refer to document of 'mem=' in kernel-parameters.txt for more 289 * details. 290 */ 291 if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) 292 return ERR_PTR(-E2BIG); 293 294 /* 295 * Request ownership of the new memory range. This might be 296 * a child of an existing resource that was present but 297 * not marked as busy. 298 */ 299 res = __request_region(&iomem_resource, start, size, 300 resource_name, flags); 301 302 if (!res) { 303 pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", 304 start, start + size); 305 return ERR_PTR(-EEXIST); 306 } 307 return res; 308 } 309 310 static void release_memory_resource(struct resource *res) 311 { 312 if (!res) 313 return; 314 release_resource(res); 315 kfree(res); 316 } 317 318 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) 319 { 320 /* 321 * Disallow all operations smaller than a sub-section and only 322 * allow operations smaller than a section for 323 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() 324 * enforces a larger memory_block_size_bytes() granularity for 325 * memory that will be marked online, so this check should only 326 * fire for direct arch_{add,remove}_memory() users outside of 327 * add_memory_resource(). 328 */ 329 unsigned long min_align; 330 331 if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) 332 min_align = PAGES_PER_SUBSECTION; 333 else 334 min_align = PAGES_PER_SECTION; 335 if (!IS_ALIGNED(pfn | nr_pages, min_align)) 336 return -EINVAL; 337 return 0; 338 } 339 340 /* 341 * Return page for the valid pfn only if the page is online. All pfn 342 * walkers which rely on the fully initialized page->flags and others 343 * should use this rather than pfn_valid && pfn_to_page 344 */ 345 struct page *pfn_to_online_page(unsigned long pfn) 346 { 347 unsigned long nr = pfn_to_section_nr(pfn); 348 struct dev_pagemap *pgmap; 349 struct mem_section *ms; 350 351 if (nr >= NR_MEM_SECTIONS) 352 return NULL; 353 354 ms = __nr_to_section(nr); 355 if (!online_section(ms)) 356 return NULL; 357 358 /* 359 * Save some code text when online_section() + 360 * pfn_section_valid() are sufficient. 361 */ 362 if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) 363 return NULL; 364 365 if (!pfn_section_valid(ms, pfn)) 366 return NULL; 367 368 if (!online_device_section(ms)) 369 return pfn_to_page(pfn); 370 371 /* 372 * Slowpath: when ZONE_DEVICE collides with 373 * ZONE_{NORMAL,MOVABLE} within the same section some pfns in 374 * the section may be 'offline' but 'valid'. Only 375 * get_dev_pagemap() can determine sub-section online status. 376 */ 377 pgmap = get_dev_pagemap(pfn, NULL); 378 put_dev_pagemap(pgmap); 379 380 /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ 381 if (pgmap) 382 return NULL; 383 384 return pfn_to_page(pfn); 385 } 386 EXPORT_SYMBOL_GPL(pfn_to_online_page); 387 388 int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, 389 struct mhp_params *params) 390 { 391 const unsigned long end_pfn = pfn + nr_pages; 392 unsigned long cur_nr_pages; 393 int err; 394 struct vmem_altmap *altmap = params->altmap; 395 396 if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) 397 return -EINVAL; 398 399 VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); 400 401 if (altmap) { 402 /* 403 * Validate altmap is within bounds of the total request 404 */ 405 if (altmap->base_pfn != pfn 406 || vmem_altmap_offset(altmap) > nr_pages) { 407 pr_warn_once("memory add fail, invalid altmap\n"); 408 return -EINVAL; 409 } 410 altmap->alloc = 0; 411 } 412 413 if (check_pfn_span(pfn, nr_pages)) { 414 WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 415 return -EINVAL; 416 } 417 418 for (; pfn < end_pfn; pfn += cur_nr_pages) { 419 /* Select all remaining pages up to the next section boundary */ 420 cur_nr_pages = min(end_pfn - pfn, 421 SECTION_ALIGN_UP(pfn + 1) - pfn); 422 err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, 423 params->pgmap); 424 if (err) 425 break; 426 cond_resched(); 427 } 428 vmemmap_populate_print_last(); 429 return err; 430 } 431 432 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 433 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, 434 unsigned long start_pfn, 435 unsigned long end_pfn) 436 { 437 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { 438 if (unlikely(!pfn_to_online_page(start_pfn))) 439 continue; 440 441 if (unlikely(pfn_to_nid(start_pfn) != nid)) 442 continue; 443 444 if (zone != page_zone(pfn_to_page(start_pfn))) 445 continue; 446 447 return start_pfn; 448 } 449 450 return 0; 451 } 452 453 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 454 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, 455 unsigned long start_pfn, 456 unsigned long end_pfn) 457 { 458 unsigned long pfn; 459 460 /* pfn is the end pfn of a memory section. */ 461 pfn = end_pfn - 1; 462 for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { 463 if (unlikely(!pfn_to_online_page(pfn))) 464 continue; 465 466 if (unlikely(pfn_to_nid(pfn) != nid)) 467 continue; 468 469 if (zone != page_zone(pfn_to_page(pfn))) 470 continue; 471 472 return pfn; 473 } 474 475 return 0; 476 } 477 478 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 479 unsigned long end_pfn) 480 { 481 unsigned long pfn; 482 int nid = zone_to_nid(zone); 483 484 if (zone->zone_start_pfn == start_pfn) { 485 /* 486 * If the section is smallest section in the zone, it need 487 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 488 * In this case, we find second smallest valid mem_section 489 * for shrinking zone. 490 */ 491 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 492 zone_end_pfn(zone)); 493 if (pfn) { 494 zone->spanned_pages = zone_end_pfn(zone) - pfn; 495 zone->zone_start_pfn = pfn; 496 } else { 497 zone->zone_start_pfn = 0; 498 zone->spanned_pages = 0; 499 } 500 } else if (zone_end_pfn(zone) == end_pfn) { 501 /* 502 * If the section is biggest section in the zone, it need 503 * shrink zone->spanned_pages. 504 * In this case, we find second biggest valid mem_section for 505 * shrinking zone. 506 */ 507 pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, 508 start_pfn); 509 if (pfn) 510 zone->spanned_pages = pfn - zone->zone_start_pfn + 1; 511 else { 512 zone->zone_start_pfn = 0; 513 zone->spanned_pages = 0; 514 } 515 } 516 } 517 518 static void update_pgdat_span(struct pglist_data *pgdat) 519 { 520 unsigned long node_start_pfn = 0, node_end_pfn = 0; 521 struct zone *zone; 522 523 for (zone = pgdat->node_zones; 524 zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { 525 unsigned long end_pfn = zone_end_pfn(zone); 526 527 /* No need to lock the zones, they can't change. */ 528 if (!zone->spanned_pages) 529 continue; 530 if (!node_end_pfn) { 531 node_start_pfn = zone->zone_start_pfn; 532 node_end_pfn = end_pfn; 533 continue; 534 } 535 536 if (end_pfn > node_end_pfn) 537 node_end_pfn = end_pfn; 538 if (zone->zone_start_pfn < node_start_pfn) 539 node_start_pfn = zone->zone_start_pfn; 540 } 541 542 pgdat->node_start_pfn = node_start_pfn; 543 pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; 544 } 545 546 void remove_pfn_range_from_zone(struct zone *zone, 547 unsigned long start_pfn, 548 unsigned long nr_pages) 549 { 550 const unsigned long end_pfn = start_pfn + nr_pages; 551 struct pglist_data *pgdat = zone->zone_pgdat; 552 unsigned long pfn, cur_nr_pages; 553 554 /* Poison struct pages because they are now uninitialized again. */ 555 for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { 556 cond_resched(); 557 558 /* Select all remaining pages up to the next section boundary */ 559 cur_nr_pages = 560 min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); 561 page_init_poison(pfn_to_page(pfn), 562 sizeof(struct page) * cur_nr_pages); 563 } 564 565 /* 566 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So 567 * we will not try to shrink the zones - which is okay as 568 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. 569 */ 570 if (zone_is_zone_device(zone)) 571 return; 572 573 clear_zone_contiguous(zone); 574 575 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 576 update_pgdat_span(pgdat); 577 578 set_zone_contiguous(zone); 579 } 580 581 /** 582 * __remove_pages() - remove sections of pages 583 * @pfn: starting pageframe (must be aligned to start of a section) 584 * @nr_pages: number of pages to remove (must be multiple of section size) 585 * @altmap: alternative device page map or %NULL if default memmap is used 586 * 587 * Generic helper function to remove section mappings and sysfs entries 588 * for the section of the memory we are removing. Caller needs to make 589 * sure that pages are marked reserved and zones are adjust properly by 590 * calling offline_pages(). 591 */ 592 void __remove_pages(unsigned long pfn, unsigned long nr_pages, 593 struct vmem_altmap *altmap) 594 { 595 const unsigned long end_pfn = pfn + nr_pages; 596 unsigned long cur_nr_pages; 597 598 if (check_pfn_span(pfn, nr_pages)) { 599 WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); 600 return; 601 } 602 603 for (; pfn < end_pfn; pfn += cur_nr_pages) { 604 cond_resched(); 605 /* Select all remaining pages up to the next section boundary */ 606 cur_nr_pages = min(end_pfn - pfn, 607 SECTION_ALIGN_UP(pfn + 1) - pfn); 608 sparse_remove_section(pfn, cur_nr_pages, altmap); 609 } 610 } 611 612 int set_online_page_callback(online_page_callback_t callback) 613 { 614 int rc = -EINVAL; 615 616 get_online_mems(); 617 mutex_lock(&online_page_callback_lock); 618 619 if (online_page_callback == generic_online_page) { 620 online_page_callback = callback; 621 rc = 0; 622 } 623 624 mutex_unlock(&online_page_callback_lock); 625 put_online_mems(); 626 627 return rc; 628 } 629 EXPORT_SYMBOL_GPL(set_online_page_callback); 630 631 int restore_online_page_callback(online_page_callback_t callback) 632 { 633 int rc = -EINVAL; 634 635 get_online_mems(); 636 mutex_lock(&online_page_callback_lock); 637 638 if (online_page_callback == callback) { 639 online_page_callback = generic_online_page; 640 rc = 0; 641 } 642 643 mutex_unlock(&online_page_callback_lock); 644 put_online_mems(); 645 646 return rc; 647 } 648 EXPORT_SYMBOL_GPL(restore_online_page_callback); 649 650 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 651 void generic_online_page(struct page *page, unsigned int order) 652 { 653 __free_pages_core(page, order, MEMINIT_HOTPLUG); 654 } 655 EXPORT_SYMBOL_GPL(generic_online_page); 656 657 static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) 658 { 659 const unsigned long end_pfn = start_pfn + nr_pages; 660 unsigned long pfn; 661 662 /* 663 * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might 664 * decide to not expose all pages to the buddy (e.g., expose them 665 * later). We account all pages as being online and belonging to this 666 * zone ("present"). 667 * When using memmap_on_memory, the range might not be aligned to 668 * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect 669 * this and the first chunk to online will be pageblock_nr_pages. 670 */ 671 for (pfn = start_pfn; pfn < end_pfn;) { 672 struct page *page = pfn_to_page(pfn); 673 int order; 674 675 /* 676 * Free to online pages in the largest chunks alignment allows. 677 * 678 * __ffs() behaviour is undefined for 0. start == 0 is 679 * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for 680 * the case. 681 */ 682 if (pfn) 683 order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); 684 else 685 order = MAX_PAGE_ORDER; 686 687 /* 688 * Exposing the page to the buddy by freeing can cause 689 * issues with debug_pagealloc enabled: some archs don't 690 * like double-unmappings. So treat them like any pages that 691 * were allocated from the buddy. 692 */ 693 debug_pagealloc_map_pages(page, 1 << order); 694 (*online_page_callback)(page, order); 695 pfn += (1UL << order); 696 } 697 698 /* mark all involved sections as online */ 699 online_mem_sections(start_pfn, end_pfn); 700 } 701 702 /* check which state of node_states will be changed when online memory */ 703 static void node_states_check_changes_online(unsigned long nr_pages, 704 struct zone *zone, struct memory_notify *arg) 705 { 706 int nid = zone_to_nid(zone); 707 708 arg->status_change_nid = NUMA_NO_NODE; 709 arg->status_change_nid_normal = NUMA_NO_NODE; 710 711 if (!node_state(nid, N_MEMORY)) 712 arg->status_change_nid = nid; 713 if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) 714 arg->status_change_nid_normal = nid; 715 } 716 717 static void node_states_set_node(int node, struct memory_notify *arg) 718 { 719 if (arg->status_change_nid_normal >= 0) 720 node_set_state(node, N_NORMAL_MEMORY); 721 722 if (arg->status_change_nid >= 0) 723 node_set_state(node, N_MEMORY); 724 } 725 726 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, 727 unsigned long nr_pages) 728 { 729 unsigned long old_end_pfn = zone_end_pfn(zone); 730 731 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 732 zone->zone_start_pfn = start_pfn; 733 734 zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; 735 } 736 737 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, 738 unsigned long nr_pages) 739 { 740 unsigned long old_end_pfn = pgdat_end_pfn(pgdat); 741 742 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 743 pgdat->node_start_pfn = start_pfn; 744 745 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; 746 747 } 748 749 #ifdef CONFIG_ZONE_DEVICE 750 static void section_taint_zone_device(unsigned long pfn) 751 { 752 struct mem_section *ms = __pfn_to_section(pfn); 753 754 ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; 755 } 756 #else 757 static inline void section_taint_zone_device(unsigned long pfn) 758 { 759 } 760 #endif 761 762 /* 763 * Associate the pfn range with the given zone, initializing the memmaps 764 * and resizing the pgdat/zone data to span the added pages. After this 765 * call, all affected pages are PageOffline(). 766 * 767 * All aligned pageblocks are initialized to the specified migratetype 768 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related 769 * zone stats (e.g., nr_isolate_pageblock) are touched. 770 */ 771 void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 772 unsigned long nr_pages, 773 struct vmem_altmap *altmap, int migratetype) 774 { 775 struct pglist_data *pgdat = zone->zone_pgdat; 776 int nid = pgdat->node_id; 777 778 clear_zone_contiguous(zone); 779 780 if (zone_is_empty(zone)) 781 init_currently_empty_zone(zone, start_pfn, nr_pages); 782 resize_zone_range(zone, start_pfn, nr_pages); 783 resize_pgdat_range(pgdat, start_pfn, nr_pages); 784 785 /* 786 * Subsection population requires care in pfn_to_online_page(). 787 * Set the taint to enable the slow path detection of 788 * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} 789 * section. 790 */ 791 if (zone_is_zone_device(zone)) { 792 if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) 793 section_taint_zone_device(start_pfn); 794 if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) 795 section_taint_zone_device(start_pfn + nr_pages); 796 } 797 798 /* 799 * TODO now we have a visible range of pages which are not associated 800 * with their zone properly. Not nice but set_pfnblock_flags_mask 801 * expects the zone spans the pfn range. All the pages in the range 802 * are reserved so nobody should be touching them so we should be safe 803 */ 804 memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, 805 MEMINIT_HOTPLUG, altmap, migratetype); 806 807 set_zone_contiguous(zone); 808 } 809 810 struct auto_movable_stats { 811 unsigned long kernel_early_pages; 812 unsigned long movable_pages; 813 }; 814 815 static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, 816 struct zone *zone) 817 { 818 if (zone_idx(zone) == ZONE_MOVABLE) { 819 stats->movable_pages += zone->present_pages; 820 } else { 821 stats->kernel_early_pages += zone->present_early_pages; 822 #ifdef CONFIG_CMA 823 /* 824 * CMA pages (never on hotplugged memory) behave like 825 * ZONE_MOVABLE. 826 */ 827 stats->movable_pages += zone->cma_pages; 828 stats->kernel_early_pages -= zone->cma_pages; 829 #endif /* CONFIG_CMA */ 830 } 831 } 832 struct auto_movable_group_stats { 833 unsigned long movable_pages; 834 unsigned long req_kernel_early_pages; 835 }; 836 837 static int auto_movable_stats_account_group(struct memory_group *group, 838 void *arg) 839 { 840 const int ratio = READ_ONCE(auto_movable_ratio); 841 struct auto_movable_group_stats *stats = arg; 842 long pages; 843 844 /* 845 * We don't support modifying the config while the auto-movable online 846 * policy is already enabled. Just avoid the division by zero below. 847 */ 848 if (!ratio) 849 return 0; 850 851 /* 852 * Calculate how many early kernel pages this group requires to 853 * satisfy the configured zone ratio. 854 */ 855 pages = group->present_movable_pages * 100 / ratio; 856 pages -= group->present_kernel_pages; 857 858 if (pages > 0) 859 stats->req_kernel_early_pages += pages; 860 stats->movable_pages += group->present_movable_pages; 861 return 0; 862 } 863 864 static bool auto_movable_can_online_movable(int nid, struct memory_group *group, 865 unsigned long nr_pages) 866 { 867 unsigned long kernel_early_pages, movable_pages; 868 struct auto_movable_group_stats group_stats = {}; 869 struct auto_movable_stats stats = {}; 870 struct zone *zone; 871 int i; 872 873 /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ 874 if (nid == NUMA_NO_NODE) { 875 /* TODO: cache values */ 876 for_each_populated_zone(zone) 877 auto_movable_stats_account_zone(&stats, zone); 878 } else { 879 for (i = 0; i < MAX_NR_ZONES; i++) { 880 pg_data_t *pgdat = NODE_DATA(nid); 881 882 zone = pgdat->node_zones + i; 883 if (populated_zone(zone)) 884 auto_movable_stats_account_zone(&stats, zone); 885 } 886 } 887 888 kernel_early_pages = stats.kernel_early_pages; 889 movable_pages = stats.movable_pages; 890 891 /* 892 * Kernel memory inside dynamic memory group allows for more MOVABLE 893 * memory within the same group. Remove the effect of all but the 894 * current group from the stats. 895 */ 896 walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, 897 group, &group_stats); 898 if (kernel_early_pages <= group_stats.req_kernel_early_pages) 899 return false; 900 kernel_early_pages -= group_stats.req_kernel_early_pages; 901 movable_pages -= group_stats.movable_pages; 902 903 if (group && group->is_dynamic) 904 kernel_early_pages += group->present_kernel_pages; 905 906 /* 907 * Test if we could online the given number of pages to ZONE_MOVABLE 908 * and still stay in the configured ratio. 909 */ 910 movable_pages += nr_pages; 911 return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; 912 } 913 914 /* 915 * Returns a default kernel memory zone for the given pfn range. 916 * If no kernel zone covers this pfn range it will automatically go 917 * to the ZONE_NORMAL. 918 */ 919 static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, 920 unsigned long nr_pages) 921 { 922 struct pglist_data *pgdat = NODE_DATA(nid); 923 int zid; 924 925 for (zid = 0; zid < ZONE_NORMAL; zid++) { 926 struct zone *zone = &pgdat->node_zones[zid]; 927 928 if (zone_intersects(zone, start_pfn, nr_pages)) 929 return zone; 930 } 931 932 return &pgdat->node_zones[ZONE_NORMAL]; 933 } 934 935 /* 936 * Determine to which zone to online memory dynamically based on user 937 * configuration and system stats. We care about the following ratio: 938 * 939 * MOVABLE : KERNEL 940 * 941 * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in 942 * one of the kernel zones. CMA pages inside one of the kernel zones really 943 * behaves like ZONE_MOVABLE, so we treat them accordingly. 944 * 945 * We don't allow for hotplugged memory in a KERNEL zone to increase the 946 * amount of MOVABLE memory we can have, so we end up with: 947 * 948 * MOVABLE : KERNEL_EARLY 949 * 950 * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze 951 * boot. We base our calculation on KERNEL_EARLY internally, because: 952 * 953 * a) Hotplugged memory in one of the kernel zones can sometimes still get 954 * hotunplugged, especially when hot(un)plugging individual memory blocks. 955 * There is no coordination across memory devices, therefore "automatic" 956 * hotunplugging, as implemented in hypervisors, could result in zone 957 * imbalances. 958 * b) Early/boot memory in one of the kernel zones can usually not get 959 * hotunplugged again (e.g., no firmware interface to unplug, fragmented 960 * with unmovable allocations). While there are corner cases where it might 961 * still work, it is barely relevant in practice. 962 * 963 * Exceptions are dynamic memory groups, which allow for more MOVABLE 964 * memory within the same memory group -- because in that case, there is 965 * coordination within the single memory device managed by a single driver. 966 * 967 * We rely on "present pages" instead of "managed pages", as the latter is 968 * highly unreliable and dynamic in virtualized environments, and does not 969 * consider boot time allocations. For example, memory ballooning adjusts the 970 * managed pages when inflating/deflating the balloon, and balloon compaction 971 * can even migrate inflated pages between zones. 972 * 973 * Using "present pages" is better but some things to keep in mind are: 974 * 975 * a) Some memblock allocations, such as for the crashkernel area, are 976 * effectively unused by the kernel, yet they account to "present pages". 977 * Fortunately, these allocations are comparatively small in relevant setups 978 * (e.g., fraction of system memory). 979 * b) Some hotplugged memory blocks in virtualized environments, esecially 980 * hotplugged by virtio-mem, look like they are completely present, however, 981 * only parts of the memory block are actually currently usable. 982 * "present pages" is an upper limit that can get reached at runtime. As 983 * we base our calculations on KERNEL_EARLY, this is not an issue. 984 */ 985 static struct zone *auto_movable_zone_for_pfn(int nid, 986 struct memory_group *group, 987 unsigned long pfn, 988 unsigned long nr_pages) 989 { 990 unsigned long online_pages = 0, max_pages, end_pfn; 991 struct page *page; 992 993 if (!auto_movable_ratio) 994 goto kernel_zone; 995 996 if (group && !group->is_dynamic) { 997 max_pages = group->s.max_pages; 998 online_pages = group->present_movable_pages; 999 1000 /* If anything is !MOVABLE online the rest !MOVABLE. */ 1001 if (group->present_kernel_pages) 1002 goto kernel_zone; 1003 } else if (!group || group->d.unit_pages == nr_pages) { 1004 max_pages = nr_pages; 1005 } else { 1006 max_pages = group->d.unit_pages; 1007 /* 1008 * Take a look at all online sections in the current unit. 1009 * We can safely assume that all pages within a section belong 1010 * to the same zone, because dynamic memory groups only deal 1011 * with hotplugged memory. 1012 */ 1013 pfn = ALIGN_DOWN(pfn, group->d.unit_pages); 1014 end_pfn = pfn + group->d.unit_pages; 1015 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1016 page = pfn_to_online_page(pfn); 1017 if (!page) 1018 continue; 1019 /* If anything is !MOVABLE online the rest !MOVABLE. */ 1020 if (!is_zone_movable_page(page)) 1021 goto kernel_zone; 1022 online_pages += PAGES_PER_SECTION; 1023 } 1024 } 1025 1026 /* 1027 * Online MOVABLE if we could *currently* online all remaining parts 1028 * MOVABLE. We expect to (add+) online them immediately next, so if 1029 * nobody interferes, all will be MOVABLE if possible. 1030 */ 1031 nr_pages = max_pages - online_pages; 1032 if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) 1033 goto kernel_zone; 1034 1035 #ifdef CONFIG_NUMA 1036 if (auto_movable_numa_aware && 1037 !auto_movable_can_online_movable(nid, group, nr_pages)) 1038 goto kernel_zone; 1039 #endif /* CONFIG_NUMA */ 1040 1041 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1042 kernel_zone: 1043 return default_kernel_zone_for_pfn(nid, pfn, nr_pages); 1044 } 1045 1046 static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, 1047 unsigned long nr_pages) 1048 { 1049 struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, 1050 nr_pages); 1051 struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1052 bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); 1053 bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); 1054 1055 /* 1056 * We inherit the existing zone in a simple case where zones do not 1057 * overlap in the given range 1058 */ 1059 if (in_kernel ^ in_movable) 1060 return (in_kernel) ? kernel_zone : movable_zone; 1061 1062 /* 1063 * If the range doesn't belong to any zone or two zones overlap in the 1064 * given range then we use movable zone only if movable_node is 1065 * enabled because we always online to a kernel zone by default. 1066 */ 1067 return movable_node_enabled ? movable_zone : kernel_zone; 1068 } 1069 1070 struct zone *zone_for_pfn_range(int online_type, int nid, 1071 struct memory_group *group, unsigned long start_pfn, 1072 unsigned long nr_pages) 1073 { 1074 if (online_type == MMOP_ONLINE_KERNEL) 1075 return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); 1076 1077 if (online_type == MMOP_ONLINE_MOVABLE) 1078 return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; 1079 1080 if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) 1081 return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); 1082 1083 return default_zone_for_pfn(nid, start_pfn, nr_pages); 1084 } 1085 1086 /* 1087 * This function should only be called by memory_block_{online,offline}, 1088 * and {online,offline}_pages. 1089 */ 1090 void adjust_present_page_count(struct page *page, struct memory_group *group, 1091 long nr_pages) 1092 { 1093 struct zone *zone = page_zone(page); 1094 const bool movable = zone_idx(zone) == ZONE_MOVABLE; 1095 1096 /* 1097 * We only support onlining/offlining/adding/removing of complete 1098 * memory blocks; therefore, either all is either early or hotplugged. 1099 */ 1100 if (early_section(__pfn_to_section(page_to_pfn(page)))) 1101 zone->present_early_pages += nr_pages; 1102 zone->present_pages += nr_pages; 1103 zone->zone_pgdat->node_present_pages += nr_pages; 1104 1105 if (group && movable) 1106 group->present_movable_pages += nr_pages; 1107 else if (group && !movable) 1108 group->present_kernel_pages += nr_pages; 1109 } 1110 1111 int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, 1112 struct zone *zone, bool mhp_off_inaccessible) 1113 { 1114 unsigned long end_pfn = pfn + nr_pages; 1115 int ret, i; 1116 1117 ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 1118 if (ret) 1119 return ret; 1120 1121 /* 1122 * Memory block is accessible at this stage and hence poison the struct 1123 * pages now. If the memory block is accessible during memory hotplug 1124 * addition phase, then page poisining is already performed in 1125 * sparse_add_section(). 1126 */ 1127 if (mhp_off_inaccessible) 1128 page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); 1129 1130 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); 1131 1132 for (i = 0; i < nr_pages; i++) { 1133 struct page *page = pfn_to_page(pfn + i); 1134 1135 __ClearPageOffline(page); 1136 SetPageVmemmapSelfHosted(page); 1137 } 1138 1139 /* 1140 * It might be that the vmemmap_pages fully span sections. If that is 1141 * the case, mark those sections online here as otherwise they will be 1142 * left offline. 1143 */ 1144 if (nr_pages >= PAGES_PER_SECTION) 1145 online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 1146 1147 return ret; 1148 } 1149 1150 void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) 1151 { 1152 unsigned long end_pfn = pfn + nr_pages; 1153 1154 /* 1155 * It might be that the vmemmap_pages fully span sections. If that is 1156 * the case, mark those sections offline here as otherwise they will be 1157 * left online. 1158 */ 1159 if (nr_pages >= PAGES_PER_SECTION) 1160 offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); 1161 1162 /* 1163 * The pages associated with this vmemmap have been offlined, so 1164 * we can reset its state here. 1165 */ 1166 remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); 1167 kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); 1168 } 1169 1170 /* 1171 * Must be called with mem_hotplug_lock in write mode. 1172 */ 1173 int online_pages(unsigned long pfn, unsigned long nr_pages, 1174 struct zone *zone, struct memory_group *group) 1175 { 1176 unsigned long flags; 1177 int need_zonelists_rebuild = 0; 1178 const int nid = zone_to_nid(zone); 1179 int ret; 1180 struct memory_notify arg; 1181 1182 /* 1183 * {on,off}lining is constrained to full memory sections (or more 1184 * precisely to memory blocks from the user space POV). 1185 * memmap_on_memory is an exception because it reserves initial part 1186 * of the physical memory space for vmemmaps. That space is pageblock 1187 * aligned. 1188 */ 1189 if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || 1190 !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) 1191 return -EINVAL; 1192 1193 1194 /* associate pfn range with the zone */ 1195 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); 1196 1197 arg.start_pfn = pfn; 1198 arg.nr_pages = nr_pages; 1199 node_states_check_changes_online(nr_pages, zone, &arg); 1200 1201 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1202 ret = notifier_to_errno(ret); 1203 if (ret) 1204 goto failed_addition; 1205 1206 /* 1207 * Fixup the number of isolated pageblocks before marking the sections 1208 * onlining, such that undo_isolate_page_range() works correctly. 1209 */ 1210 spin_lock_irqsave(&zone->lock, flags); 1211 zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; 1212 spin_unlock_irqrestore(&zone->lock, flags); 1213 1214 /* 1215 * If this zone is not populated, then it is not in zonelist. 1216 * This means the page allocator ignores this zone. 1217 * So, zonelist must be updated after online. 1218 */ 1219 if (!populated_zone(zone)) { 1220 need_zonelists_rebuild = 1; 1221 setup_zone_pageset(zone); 1222 } 1223 1224 online_pages_range(pfn, nr_pages); 1225 adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); 1226 1227 node_states_set_node(nid, &arg); 1228 if (need_zonelists_rebuild) 1229 build_all_zonelists(NULL); 1230 1231 /* Basic onlining is complete, allow allocation of onlined pages. */ 1232 undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); 1233 1234 /* 1235 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to 1236 * the tail of the freelist when undoing isolation). Shuffle the whole 1237 * zone to make sure the just onlined pages are properly distributed 1238 * across the whole freelist - to create an initial shuffle. 1239 */ 1240 shuffle_zone(zone); 1241 1242 /* reinitialise watermarks and update pcp limits */ 1243 init_per_zone_wmark_min(); 1244 1245 kswapd_run(nid); 1246 kcompactd_run(nid); 1247 1248 writeback_set_ratelimit(); 1249 1250 memory_notify(MEM_ONLINE, &arg); 1251 return 0; 1252 1253 failed_addition: 1254 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 1255 (unsigned long long) pfn << PAGE_SHIFT, 1256 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 1257 memory_notify(MEM_CANCEL_ONLINE, &arg); 1258 remove_pfn_range_from_zone(zone, pfn, nr_pages); 1259 return ret; 1260 } 1261 1262 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1263 static pg_data_t *hotadd_init_pgdat(int nid) 1264 { 1265 struct pglist_data *pgdat; 1266 1267 /* 1268 * NODE_DATA is preallocated (free_area_init) but its internal 1269 * state is not allocated completely. Add missing pieces. 1270 * Completely offline nodes stay around and they just need 1271 * reintialization. 1272 */ 1273 pgdat = NODE_DATA(nid); 1274 1275 /* init node's zones as empty zones, we don't have any present pages.*/ 1276 free_area_init_core_hotplug(pgdat); 1277 1278 /* 1279 * The node we allocated has no zone fallback lists. For avoiding 1280 * to access not-initialized zonelist, build here. 1281 */ 1282 build_all_zonelists(pgdat); 1283 1284 return pgdat; 1285 } 1286 1287 /* 1288 * __try_online_node - online a node if offlined 1289 * @nid: the node ID 1290 * @set_node_online: Whether we want to online the node 1291 * called by cpu_up() to online a node without onlined memory. 1292 * 1293 * Returns: 1294 * 1 -> a new node has been allocated 1295 * 0 -> the node is already online 1296 * -ENOMEM -> the node could not be allocated 1297 */ 1298 static int __try_online_node(int nid, bool set_node_online) 1299 { 1300 pg_data_t *pgdat; 1301 int ret = 1; 1302 1303 if (node_online(nid)) 1304 return 0; 1305 1306 pgdat = hotadd_init_pgdat(nid); 1307 if (!pgdat) { 1308 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1309 ret = -ENOMEM; 1310 goto out; 1311 } 1312 1313 if (set_node_online) { 1314 node_set_online(nid); 1315 ret = register_one_node(nid); 1316 BUG_ON(ret); 1317 } 1318 out: 1319 return ret; 1320 } 1321 1322 /* 1323 * Users of this function always want to online/register the node 1324 */ 1325 int try_online_node(int nid) 1326 { 1327 int ret; 1328 1329 mem_hotplug_begin(); 1330 ret = __try_online_node(nid, true); 1331 mem_hotplug_done(); 1332 return ret; 1333 } 1334 1335 static int check_hotplug_memory_range(u64 start, u64 size) 1336 { 1337 /* memory range must be block size aligned */ 1338 if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || 1339 !IS_ALIGNED(size, memory_block_size_bytes())) { 1340 pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", 1341 memory_block_size_bytes(), start, size); 1342 return -EINVAL; 1343 } 1344 1345 return 0; 1346 } 1347 1348 static int online_memory_block(struct memory_block *mem, void *arg) 1349 { 1350 mem->online_type = mhp_get_default_online_type(); 1351 return device_online(&mem->dev); 1352 } 1353 1354 #ifndef arch_supports_memmap_on_memory 1355 static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) 1356 { 1357 /* 1358 * As default, we want the vmemmap to span a complete PMD such that we 1359 * can map the vmemmap using a single PMD if supported by the 1360 * architecture. 1361 */ 1362 return IS_ALIGNED(vmemmap_size, PMD_SIZE); 1363 } 1364 #endif 1365 1366 bool mhp_supports_memmap_on_memory(void) 1367 { 1368 unsigned long vmemmap_size = memory_block_memmap_size(); 1369 unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 1370 1371 /* 1372 * Besides having arch support and the feature enabled at runtime, we 1373 * need a few more assumptions to hold true: 1374 * 1375 * a) The vmemmap pages span complete PMDs: We don't want vmemmap code 1376 * to populate memory from the altmap for unrelated parts (i.e., 1377 * other memory blocks) 1378 * 1379 * b) The vmemmap pages (and thereby the pages that will be exposed to 1380 * the buddy) have to cover full pageblocks: memory onlining/offlining 1381 * code requires applicable ranges to be page-aligned, for example, to 1382 * set the migratetypes properly. 1383 * 1384 * TODO: Although we have a check here to make sure that vmemmap pages 1385 * fully populate a PMD, it is not the right place to check for 1386 * this. A much better solution involves improving vmemmap code 1387 * to fallback to base pages when trying to populate vmemmap using 1388 * altmap as an alternative source of memory, and we do not exactly 1389 * populate a single PMD. 1390 */ 1391 if (!mhp_memmap_on_memory()) 1392 return false; 1393 1394 /* 1395 * Make sure the vmemmap allocation is fully contained 1396 * so that we always allocate vmemmap memory from altmap area. 1397 */ 1398 if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) 1399 return false; 1400 1401 /* 1402 * start pfn should be pageblock_nr_pages aligned for correctly 1403 * setting migrate types 1404 */ 1405 if (!pageblock_aligned(memmap_pages)) 1406 return false; 1407 1408 if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) 1409 /* No effective hotplugged memory doesn't make sense. */ 1410 return false; 1411 1412 return arch_supports_memmap_on_memory(vmemmap_size); 1413 } 1414 EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); 1415 1416 static void remove_memory_blocks_and_altmaps(u64 start, u64 size) 1417 { 1418 unsigned long memblock_size = memory_block_size_bytes(); 1419 u64 cur_start; 1420 1421 /* 1422 * For memmap_on_memory, the altmaps were added on a per-memblock 1423 * basis; we have to process each individual memory block. 1424 */ 1425 for (cur_start = start; cur_start < start + size; 1426 cur_start += memblock_size) { 1427 struct vmem_altmap *altmap = NULL; 1428 struct memory_block *mem; 1429 1430 mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); 1431 if (WARN_ON_ONCE(!mem)) 1432 continue; 1433 1434 altmap = mem->altmap; 1435 mem->altmap = NULL; 1436 1437 remove_memory_block_devices(cur_start, memblock_size); 1438 1439 arch_remove_memory(cur_start, memblock_size, altmap); 1440 1441 /* Verify that all vmemmap pages have actually been freed. */ 1442 WARN(altmap->alloc, "Altmap not fully unmapped"); 1443 kfree(altmap); 1444 } 1445 } 1446 1447 static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, 1448 u64 start, u64 size, mhp_t mhp_flags) 1449 { 1450 unsigned long memblock_size = memory_block_size_bytes(); 1451 u64 cur_start; 1452 int ret; 1453 1454 for (cur_start = start; cur_start < start + size; 1455 cur_start += memblock_size) { 1456 struct mhp_params params = { .pgprot = 1457 pgprot_mhp(PAGE_KERNEL) }; 1458 struct vmem_altmap mhp_altmap = { 1459 .base_pfn = PHYS_PFN(cur_start), 1460 .end_pfn = PHYS_PFN(cur_start + memblock_size - 1), 1461 }; 1462 1463 mhp_altmap.free = memory_block_memmap_on_memory_pages(); 1464 if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) 1465 mhp_altmap.inaccessible = true; 1466 params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), 1467 GFP_KERNEL); 1468 if (!params.altmap) { 1469 ret = -ENOMEM; 1470 goto out; 1471 } 1472 1473 /* call arch's memory hotadd */ 1474 ret = arch_add_memory(nid, cur_start, memblock_size, ¶ms); 1475 if (ret < 0) { 1476 kfree(params.altmap); 1477 goto out; 1478 } 1479 1480 /* create memory block devices after memory was added */ 1481 ret = create_memory_block_devices(cur_start, memblock_size, 1482 params.altmap, group); 1483 if (ret) { 1484 arch_remove_memory(cur_start, memblock_size, NULL); 1485 kfree(params.altmap); 1486 goto out; 1487 } 1488 } 1489 1490 return 0; 1491 out: 1492 if (ret && cur_start != start) 1493 remove_memory_blocks_and_altmaps(start, cur_start - start); 1494 return ret; 1495 } 1496 1497 /* 1498 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1499 * and online/offline operations (triggered e.g. by sysfs). 1500 * 1501 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG 1502 */ 1503 int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) 1504 { 1505 struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; 1506 enum memblock_flags memblock_flags = MEMBLOCK_NONE; 1507 struct memory_group *group = NULL; 1508 u64 start, size; 1509 bool new_node = false; 1510 int ret; 1511 1512 start = res->start; 1513 size = resource_size(res); 1514 1515 ret = check_hotplug_memory_range(start, size); 1516 if (ret) 1517 return ret; 1518 1519 if (mhp_flags & MHP_NID_IS_MGID) { 1520 group = memory_group_find_by_id(nid); 1521 if (!group) 1522 return -EINVAL; 1523 nid = group->nid; 1524 } 1525 1526 if (!node_possible(nid)) { 1527 WARN(1, "node %d was absent from the node_possible_map\n", nid); 1528 return -EINVAL; 1529 } 1530 1531 mem_hotplug_begin(); 1532 1533 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { 1534 if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) 1535 memblock_flags = MEMBLOCK_DRIVER_MANAGED; 1536 ret = memblock_add_node(start, size, nid, memblock_flags); 1537 if (ret) 1538 goto error_mem_hotplug_end; 1539 } 1540 1541 ret = __try_online_node(nid, false); 1542 if (ret < 0) 1543 goto error; 1544 new_node = ret; 1545 1546 /* 1547 * Self hosted memmap array 1548 */ 1549 if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && 1550 mhp_supports_memmap_on_memory()) { 1551 ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); 1552 if (ret) 1553 goto error; 1554 } else { 1555 ret = arch_add_memory(nid, start, size, ¶ms); 1556 if (ret < 0) 1557 goto error; 1558 1559 /* create memory block devices after memory was added */ 1560 ret = create_memory_block_devices(start, size, NULL, group); 1561 if (ret) { 1562 arch_remove_memory(start, size, params.altmap); 1563 goto error; 1564 } 1565 } 1566 1567 if (new_node) { 1568 /* If sysfs file of new node can't be created, cpu on the node 1569 * can't be hot-added. There is no rollback way now. 1570 * So, check by BUG_ON() to catch it reluctantly.. 1571 * We online node here. We can't roll back from here. 1572 */ 1573 node_set_online(nid); 1574 ret = __register_one_node(nid); 1575 BUG_ON(ret); 1576 } 1577 1578 register_memory_blocks_under_node(nid, PFN_DOWN(start), 1579 PFN_UP(start + size - 1), 1580 MEMINIT_HOTPLUG); 1581 1582 /* create new memmap entry */ 1583 if (!strcmp(res->name, "System RAM")) 1584 firmware_map_add_hotplug(start, start + size, "System RAM"); 1585 1586 /* device_online() will take the lock when calling online_pages() */ 1587 mem_hotplug_done(); 1588 1589 /* 1590 * In case we're allowed to merge the resource, flag it and trigger 1591 * merging now that adding succeeded. 1592 */ 1593 if (mhp_flags & MHP_MERGE_RESOURCE) 1594 merge_system_ram_resource(res); 1595 1596 /* online pages if requested */ 1597 if (mhp_get_default_online_type() != MMOP_OFFLINE) 1598 walk_memory_blocks(start, size, NULL, online_memory_block); 1599 1600 return ret; 1601 error: 1602 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 1603 memblock_remove(start, size); 1604 error_mem_hotplug_end: 1605 mem_hotplug_done(); 1606 return ret; 1607 } 1608 1609 /* requires device_hotplug_lock, see add_memory_resource() */ 1610 int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 1611 { 1612 struct resource *res; 1613 int ret; 1614 1615 res = register_memory_resource(start, size, "System RAM"); 1616 if (IS_ERR(res)) 1617 return PTR_ERR(res); 1618 1619 ret = add_memory_resource(nid, res, mhp_flags); 1620 if (ret < 0) 1621 release_memory_resource(res); 1622 return ret; 1623 } 1624 1625 int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) 1626 { 1627 int rc; 1628 1629 lock_device_hotplug(); 1630 rc = __add_memory(nid, start, size, mhp_flags); 1631 unlock_device_hotplug(); 1632 1633 return rc; 1634 } 1635 EXPORT_SYMBOL_GPL(add_memory); 1636 1637 /* 1638 * Add special, driver-managed memory to the system as system RAM. Such 1639 * memory is not exposed via the raw firmware-provided memmap as system 1640 * RAM, instead, it is detected and added by a driver - during cold boot, 1641 * after a reboot, and after kexec. 1642 * 1643 * Reasons why this memory should not be used for the initial memmap of a 1644 * kexec kernel or for placing kexec images: 1645 * - The booting kernel is in charge of determining how this memory will be 1646 * used (e.g., use persistent memory as system RAM) 1647 * - Coordination with a hypervisor is required before this memory 1648 * can be used (e.g., inaccessible parts). 1649 * 1650 * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided 1651 * memory map") are created. Also, the created memory resource is flagged 1652 * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case 1653 * this memory as well (esp., not place kexec images onto it). 1654 * 1655 * The resource_name (visible via /proc/iomem) has to have the format 1656 * "System RAM ($DRIVER)". 1657 */ 1658 int add_memory_driver_managed(int nid, u64 start, u64 size, 1659 const char *resource_name, mhp_t mhp_flags) 1660 { 1661 struct resource *res; 1662 int rc; 1663 1664 if (!resource_name || 1665 strstr(resource_name, "System RAM (") != resource_name || 1666 resource_name[strlen(resource_name) - 1] != ')') 1667 return -EINVAL; 1668 1669 lock_device_hotplug(); 1670 1671 res = register_memory_resource(start, size, resource_name); 1672 if (IS_ERR(res)) { 1673 rc = PTR_ERR(res); 1674 goto out_unlock; 1675 } 1676 1677 rc = add_memory_resource(nid, res, mhp_flags); 1678 if (rc < 0) 1679 release_memory_resource(res); 1680 1681 out_unlock: 1682 unlock_device_hotplug(); 1683 return rc; 1684 } 1685 EXPORT_SYMBOL_GPL(add_memory_driver_managed); 1686 1687 /* 1688 * Platforms should define arch_get_mappable_range() that provides 1689 * maximum possible addressable physical memory range for which the 1690 * linear mapping could be created. The platform returned address 1691 * range must adhere to these following semantics. 1692 * 1693 * - range.start <= range.end 1694 * - Range includes both end points [range.start..range.end] 1695 * 1696 * There is also a fallback definition provided here, allowing the 1697 * entire possible physical address range in case any platform does 1698 * not define arch_get_mappable_range(). 1699 */ 1700 struct range __weak arch_get_mappable_range(void) 1701 { 1702 struct range mhp_range = { 1703 .start = 0UL, 1704 .end = -1ULL, 1705 }; 1706 return mhp_range; 1707 } 1708 1709 struct range mhp_get_pluggable_range(bool need_mapping) 1710 { 1711 const u64 max_phys = DIRECT_MAP_PHYSMEM_END; 1712 struct range mhp_range; 1713 1714 if (need_mapping) { 1715 mhp_range = arch_get_mappable_range(); 1716 if (mhp_range.start > max_phys) { 1717 mhp_range.start = 0; 1718 mhp_range.end = 0; 1719 } 1720 mhp_range.end = min_t(u64, mhp_range.end, max_phys); 1721 } else { 1722 mhp_range.start = 0; 1723 mhp_range.end = max_phys; 1724 } 1725 return mhp_range; 1726 } 1727 EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); 1728 1729 bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) 1730 { 1731 struct range mhp_range = mhp_get_pluggable_range(need_mapping); 1732 u64 end = start + size; 1733 1734 if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) 1735 return true; 1736 1737 pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", 1738 start, end, mhp_range.start, mhp_range.end); 1739 return false; 1740 } 1741 1742 #ifdef CONFIG_MEMORY_HOTREMOVE 1743 /* 1744 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 1745 * non-lru movable pages and hugepages). Will skip over most unmovable 1746 * pages (esp., pages that can be skipped when offlining), but bail out on 1747 * definitely unmovable pages. 1748 * 1749 * Returns: 1750 * 0 in case a movable page is found and movable_pfn was updated. 1751 * -ENOENT in case no movable page was found. 1752 * -EBUSY in case a definitely unmovable page was found. 1753 */ 1754 static int scan_movable_pages(unsigned long start, unsigned long end, 1755 unsigned long *movable_pfn) 1756 { 1757 unsigned long pfn; 1758 1759 for (pfn = start; pfn < end; pfn++) { 1760 struct page *page; 1761 struct folio *folio; 1762 1763 if (!pfn_valid(pfn)) 1764 continue; 1765 page = pfn_to_page(pfn); 1766 if (PageLRU(page)) 1767 goto found; 1768 if (__PageMovable(page)) 1769 goto found; 1770 1771 /* 1772 * PageOffline() pages that are not marked __PageMovable() and 1773 * have a reference count > 0 (after MEM_GOING_OFFLINE) are 1774 * definitely unmovable. If their reference count would be 0, 1775 * they could at least be skipped when offlining memory. 1776 */ 1777 if (PageOffline(page) && page_count(page)) 1778 return -EBUSY; 1779 1780 if (!PageHuge(page)) 1781 continue; 1782 folio = page_folio(page); 1783 /* 1784 * This test is racy as we hold no reference or lock. The 1785 * hugetlb page could have been free'ed and head is no longer 1786 * a hugetlb page before the following check. In such unlikely 1787 * cases false positives and negatives are possible. Calling 1788 * code must deal with these scenarios. 1789 */ 1790 if (folio_test_hugetlb_migratable(folio)) 1791 goto found; 1792 pfn |= folio_nr_pages(folio) - 1; 1793 } 1794 return -ENOENT; 1795 found: 1796 *movable_pfn = pfn; 1797 return 0; 1798 } 1799 1800 static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1801 { 1802 struct folio *folio; 1803 unsigned long pfn; 1804 LIST_HEAD(source); 1805 static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, 1806 DEFAULT_RATELIMIT_BURST); 1807 1808 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1809 struct page *page; 1810 1811 if (!pfn_valid(pfn)) 1812 continue; 1813 page = pfn_to_page(pfn); 1814 folio = page_folio(page); 1815 1816 /* 1817 * No reference or lock is held on the folio, so it might 1818 * be modified concurrently (e.g. split). As such, 1819 * folio_nr_pages() may read garbage. This is fine as the outer 1820 * loop will revisit the split folio later. 1821 */ 1822 if (folio_test_large(folio)) 1823 pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; 1824 1825 /* 1826 * HWPoison pages have elevated reference counts so the migration would 1827 * fail on them. It also doesn't make any sense to migrate them in the 1828 * first place. Still try to unmap such a page in case it is still mapped 1829 * (keep the unmap as the catch all safety net). 1830 */ 1831 if (folio_test_hwpoison(folio) || 1832 (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { 1833 if (WARN_ON(folio_test_lru(folio))) 1834 folio_isolate_lru(folio); 1835 if (folio_mapped(folio)) 1836 unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); 1837 continue; 1838 } 1839 1840 if (!folio_try_get(folio)) 1841 continue; 1842 1843 if (unlikely(page_folio(page) != folio)) 1844 goto put_folio; 1845 1846 if (!isolate_folio_to_list(folio, &source)) { 1847 if (__ratelimit(&migrate_rs)) { 1848 pr_warn("failed to isolate pfn %lx\n", 1849 page_to_pfn(page)); 1850 dump_page(page, "isolation failed"); 1851 } 1852 } 1853 put_folio: 1854 folio_put(folio); 1855 } 1856 if (!list_empty(&source)) { 1857 nodemask_t nmask = node_states[N_MEMORY]; 1858 struct migration_target_control mtc = { 1859 .nmask = &nmask, 1860 .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1861 .reason = MR_MEMORY_HOTPLUG, 1862 }; 1863 int ret; 1864 1865 /* 1866 * We have checked that migration range is on a single zone so 1867 * we can use the nid of the first page to all the others. 1868 */ 1869 mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru)); 1870 1871 /* 1872 * try to allocate from a different node but reuse this node 1873 * if there are no other online nodes to be used (e.g. we are 1874 * offlining a part of the only existing node) 1875 */ 1876 node_clear(mtc.nid, nmask); 1877 if (nodes_empty(nmask)) 1878 node_set(mtc.nid, nmask); 1879 ret = migrate_pages(&source, alloc_migration_target, NULL, 1880 (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); 1881 if (ret) { 1882 list_for_each_entry(folio, &source, lru) { 1883 if (__ratelimit(&migrate_rs)) { 1884 pr_warn("migrating pfn %lx failed ret:%d\n", 1885 folio_pfn(folio), ret); 1886 dump_page(&folio->page, 1887 "migration failure"); 1888 } 1889 } 1890 putback_movable_pages(&source); 1891 } 1892 } 1893 } 1894 1895 static int __init cmdline_parse_movable_node(char *p) 1896 { 1897 movable_node_enabled = true; 1898 return 0; 1899 } 1900 early_param("movable_node", cmdline_parse_movable_node); 1901 1902 /* check which state of node_states will be changed when offline memory */ 1903 static void node_states_check_changes_offline(unsigned long nr_pages, 1904 struct zone *zone, struct memory_notify *arg) 1905 { 1906 struct pglist_data *pgdat = zone->zone_pgdat; 1907 unsigned long present_pages = 0; 1908 enum zone_type zt; 1909 1910 arg->status_change_nid = NUMA_NO_NODE; 1911 arg->status_change_nid_normal = NUMA_NO_NODE; 1912 1913 /* 1914 * Check whether node_states[N_NORMAL_MEMORY] will be changed. 1915 * If the memory to be offline is within the range 1916 * [0..ZONE_NORMAL], and it is the last present memory there, 1917 * the zones in that range will become empty after the offlining, 1918 * thus we can determine that we need to clear the node from 1919 * node_states[N_NORMAL_MEMORY]. 1920 */ 1921 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1922 present_pages += pgdat->node_zones[zt].present_pages; 1923 if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) 1924 arg->status_change_nid_normal = zone_to_nid(zone); 1925 1926 /* 1927 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM 1928 * does not apply as we don't support 32bit. 1929 * Here we count the possible pages from ZONE_MOVABLE. 1930 * If after having accounted all the pages, we see that the nr_pages 1931 * to be offlined is over or equal to the accounted pages, 1932 * we know that the node will become empty, and so, we can clear 1933 * it for N_MEMORY as well. 1934 */ 1935 present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; 1936 1937 if (nr_pages >= present_pages) 1938 arg->status_change_nid = zone_to_nid(zone); 1939 } 1940 1941 static void node_states_clear_node(int node, struct memory_notify *arg) 1942 { 1943 if (arg->status_change_nid_normal >= 0) 1944 node_clear_state(node, N_NORMAL_MEMORY); 1945 1946 if (arg->status_change_nid >= 0) 1947 node_clear_state(node, N_MEMORY); 1948 } 1949 1950 static int count_system_ram_pages_cb(unsigned long start_pfn, 1951 unsigned long nr_pages, void *data) 1952 { 1953 unsigned long *nr_system_ram_pages = data; 1954 1955 *nr_system_ram_pages += nr_pages; 1956 return 0; 1957 } 1958 1959 /* 1960 * Must be called with mem_hotplug_lock in write mode. 1961 */ 1962 int offline_pages(unsigned long start_pfn, unsigned long nr_pages, 1963 struct zone *zone, struct memory_group *group) 1964 { 1965 const unsigned long end_pfn = start_pfn + nr_pages; 1966 unsigned long pfn, managed_pages, system_ram_pages = 0; 1967 const int node = zone_to_nid(zone); 1968 unsigned long flags; 1969 struct memory_notify arg; 1970 char *reason; 1971 int ret; 1972 1973 /* 1974 * {on,off}lining is constrained to full memory sections (or more 1975 * precisely to memory blocks from the user space POV). 1976 * memmap_on_memory is an exception because it reserves initial part 1977 * of the physical memory space for vmemmaps. That space is pageblock 1978 * aligned. 1979 */ 1980 if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || 1981 !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) 1982 return -EINVAL; 1983 1984 /* 1985 * Don't allow to offline memory blocks that contain holes. 1986 * Consequently, memory blocks with holes can never get onlined 1987 * via the hotplug path - online_pages() - as hotplugged memory has 1988 * no holes. This way, we don't have to worry about memory holes, 1989 * don't need pfn_valid() checks, and can avoid using 1990 * walk_system_ram_range() later. 1991 */ 1992 walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, 1993 count_system_ram_pages_cb); 1994 if (system_ram_pages != nr_pages) { 1995 ret = -EINVAL; 1996 reason = "memory holes"; 1997 goto failed_removal; 1998 } 1999 2000 /* 2001 * We only support offlining of memory blocks managed by a single zone, 2002 * checked by calling code. This is just a sanity check that we might 2003 * want to remove in the future. 2004 */ 2005 if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || 2006 page_zone(pfn_to_page(end_pfn - 1)) != zone)) { 2007 ret = -EINVAL; 2008 reason = "multizone range"; 2009 goto failed_removal; 2010 } 2011 2012 /* 2013 * Disable pcplists so that page isolation cannot race with freeing 2014 * in a way that pages from isolated pageblock are left on pcplists. 2015 */ 2016 zone_pcp_disable(zone); 2017 lru_cache_disable(); 2018 2019 /* set above range as isolated */ 2020 ret = start_isolate_page_range(start_pfn, end_pfn, 2021 MIGRATE_MOVABLE, 2022 MEMORY_OFFLINE | REPORT_FAILURE); 2023 if (ret) { 2024 reason = "failure to isolate range"; 2025 goto failed_removal_pcplists_disabled; 2026 } 2027 2028 arg.start_pfn = start_pfn; 2029 arg.nr_pages = nr_pages; 2030 node_states_check_changes_offline(nr_pages, zone, &arg); 2031 2032 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 2033 ret = notifier_to_errno(ret); 2034 if (ret) { 2035 reason = "notifier failure"; 2036 goto failed_removal_isolated; 2037 } 2038 2039 do { 2040 pfn = start_pfn; 2041 do { 2042 /* 2043 * Historically we always checked for any signal and 2044 * can't limit it to fatal signals without eventually 2045 * breaking user space. 2046 */ 2047 if (signal_pending(current)) { 2048 ret = -EINTR; 2049 reason = "signal backoff"; 2050 goto failed_removal_isolated; 2051 } 2052 2053 cond_resched(); 2054 2055 ret = scan_movable_pages(pfn, end_pfn, &pfn); 2056 if (!ret) { 2057 /* 2058 * TODO: fatal migration failures should bail 2059 * out 2060 */ 2061 do_migrate_range(pfn, end_pfn); 2062 } 2063 } while (!ret); 2064 2065 if (ret != -ENOENT) { 2066 reason = "unmovable page"; 2067 goto failed_removal_isolated; 2068 } 2069 2070 /* 2071 * Dissolve free hugetlb folios in the memory block before doing 2072 * offlining actually in order to make hugetlbfs's object 2073 * counting consistent. 2074 */ 2075 ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); 2076 if (ret) { 2077 reason = "failure to dissolve huge pages"; 2078 goto failed_removal_isolated; 2079 } 2080 2081 ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); 2082 2083 } while (ret); 2084 2085 /* Mark all sections offline and remove free pages from the buddy. */ 2086 managed_pages = __offline_isolated_pages(start_pfn, end_pfn); 2087 pr_debug("Offlined Pages %ld\n", nr_pages); 2088 2089 /* 2090 * The memory sections are marked offline, and the pageblock flags 2091 * effectively stale; nobody should be touching them. Fixup the number 2092 * of isolated pageblocks, memory onlining will properly revert this. 2093 */ 2094 spin_lock_irqsave(&zone->lock, flags); 2095 zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; 2096 spin_unlock_irqrestore(&zone->lock, flags); 2097 2098 lru_cache_enable(); 2099 zone_pcp_enable(zone); 2100 2101 /* removal success */ 2102 adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages); 2103 adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); 2104 2105 /* reinitialise watermarks and update pcp limits */ 2106 init_per_zone_wmark_min(); 2107 2108 /* 2109 * Make sure to mark the node as memory-less before rebuilding the zone 2110 * list. Otherwise this node would still appear in the fallback lists. 2111 */ 2112 node_states_clear_node(node, &arg); 2113 if (!populated_zone(zone)) { 2114 zone_pcp_reset(zone); 2115 build_all_zonelists(NULL); 2116 } 2117 2118 if (arg.status_change_nid >= 0) { 2119 kcompactd_stop(node); 2120 kswapd_stop(node); 2121 } 2122 2123 writeback_set_ratelimit(); 2124 2125 memory_notify(MEM_OFFLINE, &arg); 2126 remove_pfn_range_from_zone(zone, start_pfn, nr_pages); 2127 return 0; 2128 2129 failed_removal_isolated: 2130 /* pushback to free area */ 2131 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 2132 memory_notify(MEM_CANCEL_OFFLINE, &arg); 2133 failed_removal_pcplists_disabled: 2134 lru_cache_enable(); 2135 zone_pcp_enable(zone); 2136 failed_removal: 2137 pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", 2138 (unsigned long long) start_pfn << PAGE_SHIFT, 2139 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, 2140 reason); 2141 return ret; 2142 } 2143 2144 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 2145 { 2146 int *nid = arg; 2147 2148 *nid = mem->nid; 2149 if (unlikely(mem->state != MEM_OFFLINE)) { 2150 phys_addr_t beginpa, endpa; 2151 2152 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 2153 endpa = beginpa + memory_block_size_bytes() - 1; 2154 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 2155 &beginpa, &endpa); 2156 2157 return -EBUSY; 2158 } 2159 return 0; 2160 } 2161 2162 static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg) 2163 { 2164 u64 *num_altmaps = (u64 *)arg; 2165 2166 if (mem->altmap) 2167 *num_altmaps += 1; 2168 2169 return 0; 2170 } 2171 2172 static int check_cpu_on_node(int nid) 2173 { 2174 int cpu; 2175 2176 for_each_present_cpu(cpu) { 2177 if (cpu_to_node(cpu) == nid) 2178 /* 2179 * the cpu on this node isn't removed, and we can't 2180 * offline this node. 2181 */ 2182 return -EBUSY; 2183 } 2184 2185 return 0; 2186 } 2187 2188 static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) 2189 { 2190 int nid = *(int *)arg; 2191 2192 /* 2193 * If a memory block belongs to multiple nodes, the stored nid is not 2194 * reliable. However, such blocks are always online (e.g., cannot get 2195 * offlined) and, therefore, are still spanned by the node. 2196 */ 2197 return mem->nid == nid ? -EEXIST : 0; 2198 } 2199 2200 /** 2201 * try_offline_node 2202 * @nid: the node ID 2203 * 2204 * Offline a node if all memory sections and cpus of the node are removed. 2205 * 2206 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2207 * and online/offline operations before this call. 2208 */ 2209 void try_offline_node(int nid) 2210 { 2211 int rc; 2212 2213 /* 2214 * If the node still spans pages (especially ZONE_DEVICE), don't 2215 * offline it. A node spans memory after move_pfn_range_to_zone(), 2216 * e.g., after the memory block was onlined. 2217 */ 2218 if (node_spanned_pages(nid)) 2219 return; 2220 2221 /* 2222 * Especially offline memory blocks might not be spanned by the 2223 * node. They will get spanned by the node once they get onlined. 2224 * However, they link to the node in sysfs and can get onlined later. 2225 */ 2226 rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); 2227 if (rc) 2228 return; 2229 2230 if (check_cpu_on_node(nid)) 2231 return; 2232 2233 /* 2234 * all memory/cpu of this node are removed, we can offline this 2235 * node now. 2236 */ 2237 node_set_offline(nid); 2238 unregister_one_node(nid); 2239 } 2240 EXPORT_SYMBOL(try_offline_node); 2241 2242 static int memory_blocks_have_altmaps(u64 start, u64 size) 2243 { 2244 u64 num_memblocks = size / memory_block_size_bytes(); 2245 u64 num_altmaps = 0; 2246 2247 if (!mhp_memmap_on_memory()) 2248 return 0; 2249 2250 walk_memory_blocks(start, size, &num_altmaps, 2251 count_memory_range_altmaps_cb); 2252 2253 if (num_altmaps == 0) 2254 return 0; 2255 2256 if (WARN_ON_ONCE(num_memblocks != num_altmaps)) 2257 return -EINVAL; 2258 2259 return 1; 2260 } 2261 2262 static int try_remove_memory(u64 start, u64 size) 2263 { 2264 int rc, nid = NUMA_NO_NODE; 2265 2266 BUG_ON(check_hotplug_memory_range(start, size)); 2267 2268 /* 2269 * All memory blocks must be offlined before removing memory. Check 2270 * whether all memory blocks in question are offline and return error 2271 * if this is not the case. 2272 * 2273 * While at it, determine the nid. Note that if we'd have mixed nodes, 2274 * we'd only try to offline the last determined one -- which is good 2275 * enough for the cases we care about. 2276 */ 2277 rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); 2278 if (rc) 2279 return rc; 2280 2281 /* remove memmap entry */ 2282 firmware_map_remove(start, start + size, "System RAM"); 2283 2284 mem_hotplug_begin(); 2285 2286 rc = memory_blocks_have_altmaps(start, size); 2287 if (rc < 0) { 2288 mem_hotplug_done(); 2289 return rc; 2290 } else if (!rc) { 2291 /* 2292 * Memory block device removal under the device_hotplug_lock is 2293 * a barrier against racing online attempts. 2294 * No altmaps present, do the removal directly 2295 */ 2296 remove_memory_block_devices(start, size); 2297 arch_remove_memory(start, size, NULL); 2298 } else { 2299 /* all memblocks in the range have altmaps */ 2300 remove_memory_blocks_and_altmaps(start, size); 2301 } 2302 2303 if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 2304 memblock_remove(start, size); 2305 2306 release_mem_region_adjustable(start, size); 2307 2308 if (nid != NUMA_NO_NODE) 2309 try_offline_node(nid); 2310 2311 mem_hotplug_done(); 2312 return 0; 2313 } 2314 2315 /** 2316 * __remove_memory - Remove memory if every memory block is offline 2317 * @start: physical address of the region to remove 2318 * @size: size of the region to remove 2319 * 2320 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2321 * and online/offline operations before this call, as required by 2322 * try_offline_node(). 2323 */ 2324 void __remove_memory(u64 start, u64 size) 2325 { 2326 2327 /* 2328 * trigger BUG() if some memory is not offlined prior to calling this 2329 * function 2330 */ 2331 if (try_remove_memory(start, size)) 2332 BUG(); 2333 } 2334 2335 /* 2336 * Remove memory if every memory block is offline, otherwise return -EBUSY is 2337 * some memory is not offline 2338 */ 2339 int remove_memory(u64 start, u64 size) 2340 { 2341 int rc; 2342 2343 lock_device_hotplug(); 2344 rc = try_remove_memory(start, size); 2345 unlock_device_hotplug(); 2346 2347 return rc; 2348 } 2349 EXPORT_SYMBOL_GPL(remove_memory); 2350 2351 static int try_offline_memory_block(struct memory_block *mem, void *arg) 2352 { 2353 uint8_t online_type = MMOP_ONLINE_KERNEL; 2354 uint8_t **online_types = arg; 2355 struct page *page; 2356 int rc; 2357 2358 /* 2359 * Sense the online_type via the zone of the memory block. Offlining 2360 * with multiple zones within one memory block will be rejected 2361 * by offlining code ... so we don't care about that. 2362 */ 2363 page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); 2364 if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) 2365 online_type = MMOP_ONLINE_MOVABLE; 2366 2367 rc = device_offline(&mem->dev); 2368 /* 2369 * Default is MMOP_OFFLINE - change it only if offlining succeeded, 2370 * so try_reonline_memory_block() can do the right thing. 2371 */ 2372 if (!rc) 2373 **online_types = online_type; 2374 2375 (*online_types)++; 2376 /* Ignore if already offline. */ 2377 return rc < 0 ? rc : 0; 2378 } 2379 2380 static int try_reonline_memory_block(struct memory_block *mem, void *arg) 2381 { 2382 uint8_t **online_types = arg; 2383 int rc; 2384 2385 if (**online_types != MMOP_OFFLINE) { 2386 mem->online_type = **online_types; 2387 rc = device_online(&mem->dev); 2388 if (rc < 0) 2389 pr_warn("%s: Failed to re-online memory: %d", 2390 __func__, rc); 2391 } 2392 2393 /* Continue processing all remaining memory blocks. */ 2394 (*online_types)++; 2395 return 0; 2396 } 2397 2398 /* 2399 * Try to offline and remove memory. Might take a long time to finish in case 2400 * memory is still in use. Primarily useful for memory devices that logically 2401 * unplugged all memory (so it's no longer in use) and want to offline + remove 2402 * that memory. 2403 */ 2404 int offline_and_remove_memory(u64 start, u64 size) 2405 { 2406 const unsigned long mb_count = size / memory_block_size_bytes(); 2407 uint8_t *online_types, *tmp; 2408 int rc; 2409 2410 if (!IS_ALIGNED(start, memory_block_size_bytes()) || 2411 !IS_ALIGNED(size, memory_block_size_bytes()) || !size) 2412 return -EINVAL; 2413 2414 /* 2415 * We'll remember the old online type of each memory block, so we can 2416 * try to revert whatever we did when offlining one memory block fails 2417 * after offlining some others succeeded. 2418 */ 2419 online_types = kmalloc_array(mb_count, sizeof(*online_types), 2420 GFP_KERNEL); 2421 if (!online_types) 2422 return -ENOMEM; 2423 /* 2424 * Initialize all states to MMOP_OFFLINE, so when we abort processing in 2425 * try_offline_memory_block(), we'll skip all unprocessed blocks in 2426 * try_reonline_memory_block(). 2427 */ 2428 memset(online_types, MMOP_OFFLINE, mb_count); 2429 2430 lock_device_hotplug(); 2431 2432 tmp = online_types; 2433 rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); 2434 2435 /* 2436 * In case we succeeded to offline all memory, remove it. 2437 * This cannot fail as it cannot get onlined in the meantime. 2438 */ 2439 if (!rc) { 2440 rc = try_remove_memory(start, size); 2441 if (rc) 2442 pr_err("%s: Failed to remove memory: %d", __func__, rc); 2443 } 2444 2445 /* 2446 * Rollback what we did. While memory onlining might theoretically fail 2447 * (nacked by a notifier), it barely ever happens. 2448 */ 2449 if (rc) { 2450 tmp = online_types; 2451 walk_memory_blocks(start, size, &tmp, 2452 try_reonline_memory_block); 2453 } 2454 unlock_device_hotplug(); 2455 2456 kfree(online_types); 2457 return rc; 2458 } 2459 EXPORT_SYMBOL_GPL(offline_and_remove_memory); 2460 #endif /* CONFIG_MEMORY_HOTREMOVE */ 2461