1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/compiler.h> 13 #include <linux/export.h> 14 #include <linux/pagevec.h> 15 #include <linux/writeback.h> 16 #include <linux/slab.h> 17 #include <linux/sysctl.h> 18 #include <linux/cpu.h> 19 #include <linux/memory.h> 20 #include <linux/memremap.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/highmem.h> 23 #include <linux/vmalloc.h> 24 #include <linux/ioport.h> 25 #include <linux/delay.h> 26 #include <linux/migrate.h> 27 #include <linux/page-isolation.h> 28 #include <linux/pfn.h> 29 #include <linux/suspend.h> 30 #include <linux/mm_inline.h> 31 #include <linux/firmware-map.h> 32 #include <linux/stop_machine.h> 33 #include <linux/hugetlb.h> 34 #include <linux/memblock.h> 35 #include <linux/bootmem.h> 36 #include <linux/compaction.h> 37 38 #include <asm/tlbflush.h> 39 40 #include "internal.h" 41 42 /* 43 * online_page_callback contains pointer to current page onlining function. 44 * Initially it is generic_online_page(). If it is required it could be 45 * changed by calling set_online_page_callback() for callback registration 46 * and restore_online_page_callback() for generic callback restore. 47 */ 48 49 static void generic_online_page(struct page *page); 50 51 static online_page_callback_t online_page_callback = generic_online_page; 52 static DEFINE_MUTEX(online_page_callback_lock); 53 54 /* The same as the cpu_hotplug lock, but for memory hotplug. */ 55 static struct { 56 struct task_struct *active_writer; 57 struct mutex lock; /* Synchronizes accesses to refcount, */ 58 /* 59 * Also blocks the new readers during 60 * an ongoing mem hotplug operation. 
61 */ 62 int refcount; 63 64 #ifdef CONFIG_DEBUG_LOCK_ALLOC 65 struct lockdep_map dep_map; 66 #endif 67 } mem_hotplug = { 68 .active_writer = NULL, 69 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), 70 .refcount = 0, 71 #ifdef CONFIG_DEBUG_LOCK_ALLOC 72 .dep_map = {.name = "mem_hotplug.lock" }, 73 #endif 74 }; 75 76 /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ 77 #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) 78 #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 79 #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 80 81 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE 82 bool memhp_auto_online; 83 #else 84 bool memhp_auto_online = true; 85 #endif 86 EXPORT_SYMBOL_GPL(memhp_auto_online); 87 88 static int __init setup_memhp_default_state(char *str) 89 { 90 if (!strcmp(str, "online")) 91 memhp_auto_online = true; 92 else if (!strcmp(str, "offline")) 93 memhp_auto_online = false; 94 95 return 1; 96 } 97 __setup("memhp_default_state=", setup_memhp_default_state); 98 99 void get_online_mems(void) 100 { 101 might_sleep(); 102 if (mem_hotplug.active_writer == current) 103 return; 104 memhp_lock_acquire_read(); 105 mutex_lock(&mem_hotplug.lock); 106 mem_hotplug.refcount++; 107 mutex_unlock(&mem_hotplug.lock); 108 109 } 110 111 void put_online_mems(void) 112 { 113 if (mem_hotplug.active_writer == current) 114 return; 115 mutex_lock(&mem_hotplug.lock); 116 117 if (WARN_ON(!mem_hotplug.refcount)) 118 mem_hotplug.refcount++; /* try to fix things up */ 119 120 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) 121 wake_up_process(mem_hotplug.active_writer); 122 mutex_unlock(&mem_hotplug.lock); 123 memhp_lock_release(); 124 125 } 126 127 void mem_hotplug_begin(void) 128 { 129 mem_hotplug.active_writer = current; 130 131 memhp_lock_acquire(); 132 for (;;) { 133 mutex_lock(&mem_hotplug.lock); 134 if (likely(!mem_hotplug.refcount)) 135 break; 136 
__set_current_state(TASK_UNINTERRUPTIBLE); 137 mutex_unlock(&mem_hotplug.lock); 138 schedule(); 139 } 140 } 141 142 void mem_hotplug_done(void) 143 { 144 mem_hotplug.active_writer = NULL; 145 mutex_unlock(&mem_hotplug.lock); 146 memhp_lock_release(); 147 } 148 149 /* add this memory to iomem resource */ 150 static struct resource *register_memory_resource(u64 start, u64 size) 151 { 152 struct resource *res; 153 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 154 if (!res) 155 return ERR_PTR(-ENOMEM); 156 157 res->name = "System RAM"; 158 res->start = start; 159 res->end = start + size - 1; 160 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 161 if (request_resource(&iomem_resource, res) < 0) { 162 pr_debug("System RAM resource %pR cannot be added\n", res); 163 kfree(res); 164 return ERR_PTR(-EEXIST); 165 } 166 return res; 167 } 168 169 static void release_memory_resource(struct resource *res) 170 { 171 if (!res) 172 return; 173 release_resource(res); 174 kfree(res); 175 return; 176 } 177 178 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 179 void get_page_bootmem(unsigned long info, struct page *page, 180 unsigned long type) 181 { 182 page->lru.next = (struct list_head *) type; 183 SetPagePrivate(page); 184 set_page_private(page, info); 185 page_ref_inc(page); 186 } 187 188 void put_page_bootmem(struct page *page) 189 { 190 unsigned long type; 191 192 type = (unsigned long) page->lru.next; 193 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 194 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 195 196 if (page_ref_dec_return(page) == 1) { 197 ClearPagePrivate(page); 198 set_page_private(page, 0); 199 INIT_LIST_HEAD(&page->lru); 200 free_reserved_page(page); 201 } 202 } 203 204 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 205 #ifndef CONFIG_SPARSEMEM_VMEMMAP 206 static void register_page_bootmem_info_section(unsigned long start_pfn) 207 { 208 unsigned long *usemap, mapsize, section_nr, i; 209 struct mem_section *ms; 210 struct page *page, *memmap; 211 212 section_nr = 
pfn_to_section_nr(start_pfn); 213 ms = __nr_to_section(section_nr); 214 215 /* Get section's memmap address */ 216 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 217 218 /* 219 * Get page for the memmap's phys address 220 * XXX: need more consideration for sparse_vmemmap... 221 */ 222 page = virt_to_page(memmap); 223 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 224 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 225 226 /* remember memmap's page */ 227 for (i = 0; i < mapsize; i++, page++) 228 get_page_bootmem(section_nr, page, SECTION_INFO); 229 230 usemap = __nr_to_section(section_nr)->pageblock_flags; 231 page = virt_to_page(usemap); 232 233 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 234 235 for (i = 0; i < mapsize; i++, page++) 236 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 237 238 } 239 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 240 static void register_page_bootmem_info_section(unsigned long start_pfn) 241 { 242 unsigned long *usemap, mapsize, section_nr, i; 243 struct mem_section *ms; 244 struct page *page, *memmap; 245 246 if (!pfn_valid(start_pfn)) 247 return; 248 249 section_nr = pfn_to_section_nr(start_pfn); 250 ms = __nr_to_section(section_nr); 251 252 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 253 254 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 255 256 usemap = __nr_to_section(section_nr)->pageblock_flags; 257 page = virt_to_page(usemap); 258 259 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 260 261 for (i = 0; i < mapsize; i++, page++) 262 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 263 } 264 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 265 266 void __init register_page_bootmem_info_node(struct pglist_data *pgdat) 267 { 268 unsigned long i, pfn, end_pfn, nr_pages; 269 int node = pgdat->node_id; 270 struct page *page; 271 272 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 273 page = virt_to_page(pgdat); 274 275 for (i = 0; i < nr_pages; i++, 
page++) 276 get_page_bootmem(node, page, NODE_INFO); 277 278 pfn = pgdat->node_start_pfn; 279 end_pfn = pgdat_end_pfn(pgdat); 280 281 /* register section info */ 282 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 283 /* 284 * Some platforms can assign the same pfn to multiple nodes - on 285 * node0 as well as nodeN. To avoid registering a pfn against 286 * multiple nodes we check that this pfn does not already 287 * reside in some other nodes. 288 */ 289 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) 290 register_page_bootmem_info_section(pfn); 291 } 292 } 293 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 294 295 static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, 296 unsigned long end_pfn) 297 { 298 unsigned long old_zone_end_pfn; 299 300 zone_span_writelock(zone); 301 302 old_zone_end_pfn = zone_end_pfn(zone); 303 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 304 zone->zone_start_pfn = start_pfn; 305 306 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 307 zone->zone_start_pfn; 308 309 zone_span_writeunlock(zone); 310 } 311 312 static void resize_zone(struct zone *zone, unsigned long start_pfn, 313 unsigned long end_pfn) 314 { 315 zone_span_writelock(zone); 316 317 if (end_pfn - start_pfn) { 318 zone->zone_start_pfn = start_pfn; 319 zone->spanned_pages = end_pfn - start_pfn; 320 } else { 321 /* 322 * make it consist as free_area_init_core(), 323 * if spanned_pages = 0, then keep start_pfn = 0 324 */ 325 zone->zone_start_pfn = 0; 326 zone->spanned_pages = 0; 327 } 328 329 zone_span_writeunlock(zone); 330 } 331 332 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 333 unsigned long end_pfn) 334 { 335 enum zone_type zid = zone_idx(zone); 336 int nid = zone->zone_pgdat->node_id; 337 unsigned long pfn; 338 339 for (pfn = start_pfn; pfn < end_pfn; pfn++) 340 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 341 } 342 343 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 344 
* alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 345 static int __ref ensure_zone_is_initialized(struct zone *zone, 346 unsigned long start_pfn, unsigned long num_pages) 347 { 348 if (!zone_is_initialized(zone)) 349 return init_currently_empty_zone(zone, start_pfn, num_pages); 350 351 return 0; 352 } 353 354 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 355 unsigned long start_pfn, unsigned long end_pfn) 356 { 357 int ret; 358 unsigned long flags; 359 unsigned long z1_start_pfn; 360 361 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 362 if (ret) 363 return ret; 364 365 pgdat_resize_lock(z1->zone_pgdat, &flags); 366 367 /* can't move pfns which are higher than @z2 */ 368 if (end_pfn > zone_end_pfn(z2)) 369 goto out_fail; 370 /* the move out part must be at the left most of @z2 */ 371 if (start_pfn > z2->zone_start_pfn) 372 goto out_fail; 373 /* must included/overlap */ 374 if (end_pfn <= z2->zone_start_pfn) 375 goto out_fail; 376 377 /* use start_pfn for z1's start_pfn if z1 is empty */ 378 if (!zone_is_empty(z1)) 379 z1_start_pfn = z1->zone_start_pfn; 380 else 381 z1_start_pfn = start_pfn; 382 383 resize_zone(z1, z1_start_pfn, end_pfn); 384 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 385 386 pgdat_resize_unlock(z1->zone_pgdat, &flags); 387 388 fix_zone_id(z1, start_pfn, end_pfn); 389 390 return 0; 391 out_fail: 392 pgdat_resize_unlock(z1->zone_pgdat, &flags); 393 return -1; 394 } 395 396 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 397 unsigned long start_pfn, unsigned long end_pfn) 398 { 399 int ret; 400 unsigned long flags; 401 unsigned long z2_end_pfn; 402 403 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 404 if (ret) 405 return ret; 406 407 pgdat_resize_lock(z1->zone_pgdat, &flags); 408 409 /* can't move pfns which are lower than @z1 */ 410 if (z1->zone_start_pfn > start_pfn) 411 goto out_fail; 412 /* the move out part mast at the right 
most of @z1 */ 413 if (zone_end_pfn(z1) > end_pfn) 414 goto out_fail; 415 /* must included/overlap */ 416 if (start_pfn >= zone_end_pfn(z1)) 417 goto out_fail; 418 419 /* use end_pfn for z2's end_pfn if z2 is empty */ 420 if (!zone_is_empty(z2)) 421 z2_end_pfn = zone_end_pfn(z2); 422 else 423 z2_end_pfn = end_pfn; 424 425 resize_zone(z1, z1->zone_start_pfn, start_pfn); 426 resize_zone(z2, start_pfn, z2_end_pfn); 427 428 pgdat_resize_unlock(z1->zone_pgdat, &flags); 429 430 fix_zone_id(z2, start_pfn, end_pfn); 431 432 return 0; 433 out_fail: 434 pgdat_resize_unlock(z1->zone_pgdat, &flags); 435 return -1; 436 } 437 438 static struct zone * __meminit move_pfn_range(int zone_shift, 439 unsigned long start_pfn, unsigned long end_pfn) 440 { 441 struct zone *zone = page_zone(pfn_to_page(start_pfn)); 442 int ret = 0; 443 444 if (zone_shift < 0) 445 ret = move_pfn_range_left(zone + zone_shift, zone, 446 start_pfn, end_pfn); 447 else if (zone_shift) 448 ret = move_pfn_range_right(zone, zone + zone_shift, 449 start_pfn, end_pfn); 450 451 if (ret) 452 return NULL; 453 454 return zone + zone_shift; 455 } 456 457 static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 458 unsigned long end_pfn) 459 { 460 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 461 462 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 463 pgdat->node_start_pfn = start_pfn; 464 465 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 466 pgdat->node_start_pfn; 467 } 468 469 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 470 { 471 struct pglist_data *pgdat = zone->zone_pgdat; 472 int nr_pages = PAGES_PER_SECTION; 473 int nid = pgdat->node_id; 474 int zone_type; 475 unsigned long flags, pfn; 476 int ret; 477 478 zone_type = zone - pgdat->node_zones; 479 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 480 if (ret) 481 return ret; 482 483 pgdat_resize_lock(zone->zone_pgdat, &flags); 484 
grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 485 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 486 phys_start_pfn + nr_pages); 487 pgdat_resize_unlock(zone->zone_pgdat, &flags); 488 memmap_init_zone(nr_pages, nid, zone_type, 489 phys_start_pfn, MEMMAP_HOTPLUG); 490 491 /* online_page_range is called later and expects pages reserved */ 492 for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { 493 if (!pfn_valid(pfn)) 494 continue; 495 496 SetPageReserved(pfn_to_page(pfn)); 497 } 498 return 0; 499 } 500 501 static int __meminit __add_section(int nid, struct zone *zone, 502 unsigned long phys_start_pfn) 503 { 504 int ret; 505 506 if (pfn_valid(phys_start_pfn)) 507 return -EEXIST; 508 509 ret = sparse_add_one_section(zone, phys_start_pfn); 510 511 if (ret < 0) 512 return ret; 513 514 ret = __add_zone(zone, phys_start_pfn); 515 516 if (ret < 0) 517 return ret; 518 519 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 520 } 521 522 /* 523 * Reasonably generic function for adding memory. It is 524 * expected that archs that support memory hotplug will 525 * call this function after deciding the zone to which to 526 * add the new pages. 
527 */ 528 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 529 unsigned long nr_pages) 530 { 531 unsigned long i; 532 int err = 0; 533 int start_sec, end_sec; 534 struct vmem_altmap *altmap; 535 536 clear_zone_contiguous(zone); 537 538 /* during initialize mem_map, align hot-added range to section */ 539 start_sec = pfn_to_section_nr(phys_start_pfn); 540 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 541 542 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); 543 if (altmap) { 544 /* 545 * Validate altmap is within bounds of the total request 546 */ 547 if (altmap->base_pfn != phys_start_pfn 548 || vmem_altmap_offset(altmap) > nr_pages) { 549 pr_warn_once("memory add fail, invalid altmap\n"); 550 err = -EINVAL; 551 goto out; 552 } 553 altmap->alloc = 0; 554 } 555 556 for (i = start_sec; i <= end_sec; i++) { 557 err = __add_section(nid, zone, section_nr_to_pfn(i)); 558 559 /* 560 * EEXIST is finally dealt with by ioresource collision 561 * check. see add_memory() => register_memory_resource() 562 * Warning will be printed if there is collision. 
563 */ 564 if (err && (err != -EEXIST)) 565 break; 566 err = 0; 567 } 568 vmemmap_populate_print_last(); 569 out: 570 set_zone_contiguous(zone); 571 return err; 572 } 573 EXPORT_SYMBOL_GPL(__add_pages); 574 575 #ifdef CONFIG_MEMORY_HOTREMOVE 576 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 577 static int find_smallest_section_pfn(int nid, struct zone *zone, 578 unsigned long start_pfn, 579 unsigned long end_pfn) 580 { 581 struct mem_section *ms; 582 583 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 584 ms = __pfn_to_section(start_pfn); 585 586 if (unlikely(!valid_section(ms))) 587 continue; 588 589 if (unlikely(pfn_to_nid(start_pfn) != nid)) 590 continue; 591 592 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 593 continue; 594 595 return start_pfn; 596 } 597 598 return 0; 599 } 600 601 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 602 static int find_biggest_section_pfn(int nid, struct zone *zone, 603 unsigned long start_pfn, 604 unsigned long end_pfn) 605 { 606 struct mem_section *ms; 607 unsigned long pfn; 608 609 /* pfn is the end pfn of a memory section. 
*/ 610 pfn = end_pfn - 1; 611 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 612 ms = __pfn_to_section(pfn); 613 614 if (unlikely(!valid_section(ms))) 615 continue; 616 617 if (unlikely(pfn_to_nid(pfn) != nid)) 618 continue; 619 620 if (zone && zone != page_zone(pfn_to_page(pfn))) 621 continue; 622 623 return pfn; 624 } 625 626 return 0; 627 } 628 629 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 630 unsigned long end_pfn) 631 { 632 unsigned long zone_start_pfn = zone->zone_start_pfn; 633 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 634 unsigned long zone_end_pfn = z; 635 unsigned long pfn; 636 struct mem_section *ms; 637 int nid = zone_to_nid(zone); 638 639 zone_span_writelock(zone); 640 if (zone_start_pfn == start_pfn) { 641 /* 642 * If the section is smallest section in the zone, it need 643 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 644 * In this case, we find second smallest valid mem_section 645 * for shrinking zone. 646 */ 647 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 648 zone_end_pfn); 649 if (pfn) { 650 zone->zone_start_pfn = pfn; 651 zone->spanned_pages = zone_end_pfn - pfn; 652 } 653 } else if (zone_end_pfn == end_pfn) { 654 /* 655 * If the section is biggest section in the zone, it need 656 * shrink zone->spanned_pages. 657 * In this case, we find second biggest valid mem_section for 658 * shrinking zone. 659 */ 660 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 661 start_pfn); 662 if (pfn) 663 zone->spanned_pages = pfn - zone_start_pfn + 1; 664 } 665 666 /* 667 * The section is not biggest or smallest mem_section in the zone, it 668 * only creates a hole in the zone. So in this case, we need not 669 * change the zone. But perhaps, the zone has only hole data. Thus 670 * it check the zone has only hole or not. 
671 */ 672 pfn = zone_start_pfn; 673 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 674 ms = __pfn_to_section(pfn); 675 676 if (unlikely(!valid_section(ms))) 677 continue; 678 679 if (page_zone(pfn_to_page(pfn)) != zone) 680 continue; 681 682 /* If the section is current section, it continues the loop */ 683 if (start_pfn == pfn) 684 continue; 685 686 /* If we find valid section, we have nothing to do */ 687 zone_span_writeunlock(zone); 688 return; 689 } 690 691 /* The zone has no valid section */ 692 zone->zone_start_pfn = 0; 693 zone->spanned_pages = 0; 694 zone_span_writeunlock(zone); 695 } 696 697 static void shrink_pgdat_span(struct pglist_data *pgdat, 698 unsigned long start_pfn, unsigned long end_pfn) 699 { 700 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 701 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 702 unsigned long pgdat_end_pfn = p; 703 unsigned long pfn; 704 struct mem_section *ms; 705 int nid = pgdat->node_id; 706 707 if (pgdat_start_pfn == start_pfn) { 708 /* 709 * If the section is smallest section in the pgdat, it need 710 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 711 * In this case, we find second smallest valid mem_section 712 * for shrinking zone. 713 */ 714 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 715 pgdat_end_pfn); 716 if (pfn) { 717 pgdat->node_start_pfn = pfn; 718 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 719 } 720 } else if (pgdat_end_pfn == end_pfn) { 721 /* 722 * If the section is biggest section in the pgdat, it need 723 * shrink pgdat->node_spanned_pages. 724 * In this case, we find second biggest valid mem_section for 725 * shrinking zone. 726 */ 727 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 728 start_pfn); 729 if (pfn) 730 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 731 } 732 733 /* 734 * If the section is not biggest or smallest mem_section in the pgdat, 735 * it only creates a hole in the pgdat. 
So in this case, we need not 736 * change the pgdat. 737 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 738 * has only hole or not. 739 */ 740 pfn = pgdat_start_pfn; 741 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 742 ms = __pfn_to_section(pfn); 743 744 if (unlikely(!valid_section(ms))) 745 continue; 746 747 if (pfn_to_nid(pfn) != nid) 748 continue; 749 750 /* If the section is current section, it continues the loop */ 751 if (start_pfn == pfn) 752 continue; 753 754 /* If we find valid section, we have nothing to do */ 755 return; 756 } 757 758 /* The pgdat has no valid section */ 759 pgdat->node_start_pfn = 0; 760 pgdat->node_spanned_pages = 0; 761 } 762 763 static void __remove_zone(struct zone *zone, unsigned long start_pfn) 764 { 765 struct pglist_data *pgdat = zone->zone_pgdat; 766 int nr_pages = PAGES_PER_SECTION; 767 int zone_type; 768 unsigned long flags; 769 770 zone_type = zone - pgdat->node_zones; 771 772 pgdat_resize_lock(zone->zone_pgdat, &flags); 773 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 774 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 775 pgdat_resize_unlock(zone->zone_pgdat, &flags); 776 } 777 778 static int __remove_section(struct zone *zone, struct mem_section *ms, 779 unsigned long map_offset) 780 { 781 unsigned long start_pfn; 782 int scn_nr; 783 int ret = -EINVAL; 784 785 if (!valid_section(ms)) 786 return ret; 787 788 ret = unregister_memory_section(ms); 789 if (ret) 790 return ret; 791 792 scn_nr = __section_nr(ms); 793 start_pfn = section_nr_to_pfn(scn_nr); 794 __remove_zone(zone, start_pfn); 795 796 sparse_remove_one_section(zone, ms, map_offset); 797 return 0; 798 } 799 800 /** 801 * __remove_pages() - remove sections of pages from a zone 802 * @zone: zone from which pages need to be removed 803 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 804 * @nr_pages: number of pages to remove (must be multiple of section size) 805 * 806 * Generic 
helper function to remove section mappings and sysfs entries 807 * for the section of the memory we are removing. Caller needs to make 808 * sure that pages are marked reserved and zones are adjust properly by 809 * calling offline_pages(). 810 */ 811 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 812 unsigned long nr_pages) 813 { 814 unsigned long i; 815 unsigned long map_offset = 0; 816 int sections_to_remove, ret = 0; 817 818 /* In the ZONE_DEVICE case device driver owns the memory region */ 819 if (is_dev_zone(zone)) { 820 struct page *page = pfn_to_page(phys_start_pfn); 821 struct vmem_altmap *altmap; 822 823 altmap = to_vmem_altmap((unsigned long) page); 824 if (altmap) 825 map_offset = vmem_altmap_offset(altmap); 826 } else { 827 resource_size_t start, size; 828 829 start = phys_start_pfn << PAGE_SHIFT; 830 size = nr_pages * PAGE_SIZE; 831 832 ret = release_mem_region_adjustable(&iomem_resource, start, 833 size); 834 if (ret) { 835 resource_size_t endres = start + size - 1; 836 837 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 838 &start, &endres, ret); 839 } 840 } 841 842 clear_zone_contiguous(zone); 843 844 /* 845 * We can only remove entire sections 846 */ 847 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 848 BUG_ON(nr_pages % PAGES_PER_SECTION); 849 850 sections_to_remove = nr_pages / PAGES_PER_SECTION; 851 for (i = 0; i < sections_to_remove; i++) { 852 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 853 854 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); 855 map_offset = 0; 856 if (ret) 857 break; 858 } 859 860 set_zone_contiguous(zone); 861 862 return ret; 863 } 864 EXPORT_SYMBOL_GPL(__remove_pages); 865 #endif /* CONFIG_MEMORY_HOTREMOVE */ 866 867 int set_online_page_callback(online_page_callback_t callback) 868 { 869 int rc = -EINVAL; 870 871 get_online_mems(); 872 mutex_lock(&online_page_callback_lock); 873 874 if (online_page_callback == generic_online_page) { 875 online_page_callback = 
callback; 876 rc = 0; 877 } 878 879 mutex_unlock(&online_page_callback_lock); 880 put_online_mems(); 881 882 return rc; 883 } 884 EXPORT_SYMBOL_GPL(set_online_page_callback); 885 886 int restore_online_page_callback(online_page_callback_t callback) 887 { 888 int rc = -EINVAL; 889 890 get_online_mems(); 891 mutex_lock(&online_page_callback_lock); 892 893 if (online_page_callback == callback) { 894 online_page_callback = generic_online_page; 895 rc = 0; 896 } 897 898 mutex_unlock(&online_page_callback_lock); 899 put_online_mems(); 900 901 return rc; 902 } 903 EXPORT_SYMBOL_GPL(restore_online_page_callback); 904 905 void __online_page_set_limits(struct page *page) 906 { 907 } 908 EXPORT_SYMBOL_GPL(__online_page_set_limits); 909 910 void __online_page_increment_counters(struct page *page) 911 { 912 adjust_managed_page_count(page, 1); 913 } 914 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 915 916 void __online_page_free(struct page *page) 917 { 918 __free_reserved_page(page); 919 } 920 EXPORT_SYMBOL_GPL(__online_page_free); 921 922 static void generic_online_page(struct page *page) 923 { 924 __online_page_set_limits(page); 925 __online_page_increment_counters(page); 926 __online_page_free(page); 927 } 928 929 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 930 void *arg) 931 { 932 unsigned long i; 933 unsigned long onlined_pages = *(unsigned long *)arg; 934 struct page *page; 935 if (PageReserved(pfn_to_page(start_pfn))) 936 for (i = 0; i < nr_pages; i++) { 937 page = pfn_to_page(start_pfn + i); 938 (*online_page_callback)(page); 939 onlined_pages++; 940 } 941 *(unsigned long *)arg = onlined_pages; 942 return 0; 943 } 944 945 #ifdef CONFIG_MOVABLE_NODE 946 /* 947 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 948 * normal memory. 
949 */ 950 static bool can_online_high_movable(struct zone *zone) 951 { 952 return true; 953 } 954 #else /* CONFIG_MOVABLE_NODE */ 955 /* ensure every online node has NORMAL memory */ 956 static bool can_online_high_movable(struct zone *zone) 957 { 958 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 959 } 960 #endif /* CONFIG_MOVABLE_NODE */ 961 962 /* check which state of node_states will be changed when online memory */ 963 static void node_states_check_changes_online(unsigned long nr_pages, 964 struct zone *zone, struct memory_notify *arg) 965 { 966 int nid = zone_to_nid(zone); 967 enum zone_type zone_last = ZONE_NORMAL; 968 969 /* 970 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 971 * contains nodes which have zones of 0...ZONE_NORMAL, 972 * set zone_last to ZONE_NORMAL. 973 * 974 * If we don't have HIGHMEM nor movable node, 975 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 976 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 977 */ 978 if (N_MEMORY == N_NORMAL_MEMORY) 979 zone_last = ZONE_MOVABLE; 980 981 /* 982 * if the memory to be online is in a zone of 0...zone_last, and 983 * the zones of 0...zone_last don't have memory before online, we will 984 * need to set the node to node_states[N_NORMAL_MEMORY] after 985 * the memory is online. 986 */ 987 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 988 arg->status_change_nid_normal = nid; 989 else 990 arg->status_change_nid_normal = -1; 991 992 #ifdef CONFIG_HIGHMEM 993 /* 994 * If we have movable node, node_states[N_HIGH_MEMORY] 995 * contains nodes which have zones of 0...ZONE_HIGHMEM, 996 * set zone_last to ZONE_HIGHMEM. 997 * 998 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 999 * contains nodes which have zones of 0...ZONE_MOVABLE, 1000 * set zone_last to ZONE_MOVABLE. 
*/
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * if the node don't have memory befor online, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is online.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

/*
 * Apply the node-state changes recorded in @arg to @node.
 *
 * N_NORMAL_MEMORY and N_HIGH_MEMORY are set only when the matching
 * status_change_nid_* field is non-negative; N_MEMORY is set
 * unconditionally.
 */
static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

/*
 * Check whether the range [@pfn, @pfn + @nr_pages) can be moved from the
 * zone it currently belongs to into zone index @target.
 *
 * Moving to a higher zone requires the range to end exactly at the end of
 * its zone; moving to a lower zone requires it to start exactly at the zone
 * start. In both directions every zone strictly between the two must be
 * uninitialized.
 *
 * Returns the signed zone-index distance (@target - current index) when the
 * move is possible, and 0 when it is not.
 *
 * NOTE(review): 0 is also the result when @target equals the current zone,
 * so callers cannot tell "no shift needed" from "shift not possible".
 */
int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
		   enum zone_type target)
{
	struct zone *zone = page_zone(pfn_to_page(pfn));
	enum zone_type idx = zone_idx(zone);
	int i;

	if (idx < target) {
		/* pages must be at end of current zone */
		if (pfn + nr_pages != zone_end_pfn(zone))
			return 0;

		/* no zones in use between current zone and target */
		for (i = idx + 1; i < target; i++)
			/* zone - idx is zone 0 of the same node_zones[] array */
			if (zone_is_initialized(zone - idx + i))
				return 0;
	}

	if (target < idx) {
		/* pages must be at beginning of current zone */
		if (pfn != zone->zone_start_pfn)
			return 0;

		/* no zones in use between current zone and target */
		for (i = target + 1; i < idx; i++)
			if (zone_is_initialized(zone - idx + i))
				return 0;
	}

	return target - idx;
}

/*
 * online_pages - bring the pages in [@pfn, @pfn + @nr_pages) online
 * @pfn:         first pfn of the range
 * @nr_pages:    number of pages in the range
 * @online_type: MMOP_ONLINE_KERNEL / MMOP_ONLINE_MOVABLE request a shift
 *               into ZONE_NORMAL / ZONE_MOVABLE; any other value keeps the
 *               zone shift at 0
 *
 * Sends MEM_GOING_ONLINE, onlines the pages under zonelists_mutex, updates
 * the zone and node present-page counters and the node states, and finally
 * sends MEM_ONLINE. MEM_CANCEL_ONLINE is sent on the failure path.
 *
 * Returns 0 on success, -EINVAL when the zone checks reject the request,
 * or the error produced by the notifier chain / walk_system_ram_range().
 *
 * Must be protected by mem_hotplug_begin()
 */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;
	int zone_shift = 0;

	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL ||
	    online_type == MMOP_ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		return -EINVAL;

	/* zone_can_shift() yields 0 if the range cannot be moved */
	if (online_type == MMOP_ONLINE_KERNEL)
		zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
	else if (online_type == MMOP_ONLINE_MOVABLE)
		zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);

	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
	if (!zone)
		return -EINVAL;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = zone_to_nid(zone);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		/* undo the pcp setup done for a zone that was still empty */
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(nid, &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages) {
		kswapd_run(nid);
		kcompactd_run(nid);
	}

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* Zero present_pages of every zone of @pgdat and the node-wide total. */
static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/*
 * Prepare the pgdat of node @nid for hot-added memory starting at physical
 * address @start.
 *
 * An existing NODE_DATA(nid) is reused after resetting nr_zones,
 * kswapd_order and kswapd_classzone_idx; otherwise a new one is allocated
 * with arch_alloc_nodedata() and published with arch_refresh_nodedata().
 * The node is then initialised with empty zones, its zonelists are built,
 * and the managed/present page counters are reset to zero.
 *
 * Returns the pgdat, or NULL if arch_alloc_nodedata() fails.
 *
 * NOTE(review): the alloc_percpu() result stored in per_cpu_nodestats is
 * not checked here.
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	} else {
		/* Reset the nr_zones, order and classzone_idx before reuse */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_classzone_idx = 0;
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

	/*
	 * The node we allocated has no zone fallback lists. For avoiding
	 * to access not-initialized zonelist, build here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	/*
	 * zone->managed_pages is set to an approximate value in
	 * free_area_init_core(), which will cause
	 * /sys/device/system/node/nodeX/meminfo has wrong data.
	 * So reset it to 0 before any memory is onlined.
	 */
	reset_node_managed_pages(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_present_pages(pgdat);

	return pgdat;
}

/*
 * Undo hotadd_new_pgdat() for node @nid: unpublish the node data, free its
 * per-cpu node statistics and release the pgdat itself.
 *
 * NOTE(review): @pgdat is dereferenced unconditionally, so it must not be
 * NULL.
 */
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
	return;
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
1249 */ 1250 int try_online_node(int nid) 1251 { 1252 pg_data_t *pgdat; 1253 int ret; 1254 1255 if (node_online(nid)) 1256 return 0; 1257 1258 mem_hotplug_begin(); 1259 pgdat = hotadd_new_pgdat(nid, 0); 1260 if (!pgdat) { 1261 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1262 ret = -ENOMEM; 1263 goto out; 1264 } 1265 node_set_online(nid); 1266 ret = register_one_node(nid); 1267 BUG_ON(ret); 1268 1269 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1270 mutex_lock(&zonelists_mutex); 1271 build_all_zonelists(NULL, NULL); 1272 mutex_unlock(&zonelists_mutex); 1273 } 1274 1275 out: 1276 mem_hotplug_done(); 1277 return ret; 1278 } 1279 1280 static int check_hotplug_memory_range(u64 start, u64 size) 1281 { 1282 u64 start_pfn = PFN_DOWN(start); 1283 u64 nr_pages = size >> PAGE_SHIFT; 1284 1285 /* Memory range must be aligned with section */ 1286 if ((start_pfn & ~PAGE_SECTION_MASK) || 1287 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1288 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1289 (unsigned long long)start, 1290 (unsigned long long)size); 1291 return -EINVAL; 1292 } 1293 1294 return 0; 1295 } 1296 1297 /* 1298 * If movable zone has already been setup, newly added memory should be check. 1299 * If its address is higher than movable zone, it should be added as movable. 1300 * Without this check, movable zone may overlap with other zone. 
1301 */ 1302 static int should_add_memory_movable(int nid, u64 start, u64 size) 1303 { 1304 unsigned long start_pfn = start >> PAGE_SHIFT; 1305 pg_data_t *pgdat = NODE_DATA(nid); 1306 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; 1307 1308 if (zone_is_empty(movable_zone)) 1309 return 0; 1310 1311 if (movable_zone->zone_start_pfn <= start_pfn) 1312 return 1; 1313 1314 return 0; 1315 } 1316 1317 int zone_for_memory(int nid, u64 start, u64 size, int zone_default, 1318 bool for_device) 1319 { 1320 #ifdef CONFIG_ZONE_DEVICE 1321 if (for_device) 1322 return ZONE_DEVICE; 1323 #endif 1324 if (should_add_memory_movable(nid, start, size)) 1325 return ZONE_MOVABLE; 1326 1327 return zone_default; 1328 } 1329 1330 static int online_memory_block(struct memory_block *mem, void *arg) 1331 { 1332 return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 1333 } 1334 1335 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1336 int __ref add_memory_resource(int nid, struct resource *res, bool online) 1337 { 1338 u64 start, size; 1339 pg_data_t *pgdat = NULL; 1340 bool new_pgdat; 1341 bool new_node; 1342 int ret; 1343 1344 start = res->start; 1345 size = resource_size(res); 1346 1347 ret = check_hotplug_memory_range(start, size); 1348 if (ret) 1349 return ret; 1350 1351 { /* Stupid hack to suppress address-never-null warning */ 1352 void *p = NODE_DATA(nid); 1353 new_pgdat = !p; 1354 } 1355 1356 mem_hotplug_begin(); 1357 1358 /* 1359 * Add new range to memblock so that when hotadd_new_pgdat() is called 1360 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find 1361 * this new range and calculate total pages correctly. The range will 1362 * be removed at hot-remove time. 
1363 */ 1364 memblock_add_node(start, size, nid); 1365 1366 new_node = !node_online(nid); 1367 if (new_node) { 1368 pgdat = hotadd_new_pgdat(nid, start); 1369 ret = -ENOMEM; 1370 if (!pgdat) 1371 goto error; 1372 } 1373 1374 /* call arch's memory hotadd */ 1375 ret = arch_add_memory(nid, start, size, false); 1376 1377 if (ret < 0) 1378 goto error; 1379 1380 /* we online node here. we can't roll back from here. */ 1381 node_set_online(nid); 1382 1383 if (new_node) { 1384 ret = register_one_node(nid); 1385 /* 1386 * If sysfs file of new node can't create, cpu on the node 1387 * can't be hot-added. There is no rollback way now. 1388 * So, check by BUG_ON() to catch it reluctantly.. 1389 */ 1390 BUG_ON(ret); 1391 } 1392 1393 /* create new memmap entry */ 1394 firmware_map_add_hotplug(start, start + size, "System RAM"); 1395 1396 /* online pages if requested */ 1397 if (online) 1398 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1399 NULL, online_memory_block); 1400 1401 goto out; 1402 1403 error: 1404 /* rollback pgdat allocation and others */ 1405 if (new_pgdat) 1406 rollback_node_hotadd(nid, pgdat); 1407 memblock_remove(start, size); 1408 1409 out: 1410 mem_hotplug_done(); 1411 return ret; 1412 } 1413 EXPORT_SYMBOL_GPL(add_memory_resource); 1414 1415 int __ref add_memory(int nid, u64 start, u64 size) 1416 { 1417 struct resource *res; 1418 int ret; 1419 1420 res = register_memory_resource(start, size); 1421 if (IS_ERR(res)) 1422 return PTR_ERR(res); 1423 1424 ret = add_memory_resource(nid, res, memhp_auto_online); 1425 if (ret < 0) 1426 release_memory_resource(res); 1427 return ret; 1428 } 1429 EXPORT_SYMBOL_GPL(add_memory); 1430 1431 #ifdef CONFIG_MEMORY_HOTREMOVE 1432 /* 1433 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1434 * set and the size of the free page is given by page_order(). Using this, 1435 * the function determines if the pageblock contains only free pages. 
1436 * Due to buddy contraints, a free page at least the size of a pageblock will 1437 * be located at the start of the pageblock 1438 */ 1439 static inline int pageblock_free(struct page *page) 1440 { 1441 return PageBuddy(page) && page_order(page) >= pageblock_order; 1442 } 1443 1444 /* Return the start of the next active pageblock after a given page */ 1445 static struct page *next_active_pageblock(struct page *page) 1446 { 1447 /* Ensure the starting page is pageblock-aligned */ 1448 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1449 1450 /* If the entire pageblock is free, move to the end of free page */ 1451 if (pageblock_free(page)) { 1452 int order; 1453 /* be careful. we don't have locks, page_order can be changed.*/ 1454 order = page_order(page); 1455 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1456 return page + (1 << order); 1457 } 1458 1459 return page + pageblock_nr_pages; 1460 } 1461 1462 /* Checks if this range of memory is likely to be hot-removable. */ 1463 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1464 { 1465 struct page *page = pfn_to_page(start_pfn); 1466 struct page *end_page = page + nr_pages; 1467 1468 /* Check the starting page of each pageblock within the range */ 1469 for (; page < end_page; page = next_active_pageblock(page)) { 1470 if (!is_pageblock_removable_nolock(page)) 1471 return false; 1472 cond_resched(); 1473 } 1474 1475 /* All pageblocks in the memory block are likely to be hot-removable */ 1476 return true; 1477 } 1478 1479 /* 1480 * Confirm all pages in a range [start, end) is belongs to the same zone. 
1481 */ 1482 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1483 { 1484 unsigned long pfn, sec_end_pfn; 1485 struct zone *zone = NULL; 1486 struct page *page; 1487 int i; 1488 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); 1489 pfn < end_pfn; 1490 pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { 1491 /* Make sure the memory section is present first */ 1492 if (!present_section_nr(pfn_to_section_nr(pfn))) 1493 continue; 1494 for (; pfn < sec_end_pfn && pfn < end_pfn; 1495 pfn += MAX_ORDER_NR_PAGES) { 1496 i = 0; 1497 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1498 while ((i < MAX_ORDER_NR_PAGES) && 1499 !pfn_valid_within(pfn + i)) 1500 i++; 1501 if (i == MAX_ORDER_NR_PAGES) 1502 continue; 1503 page = pfn_to_page(pfn + i); 1504 if (zone && page_zone(page) != zone) 1505 return 0; 1506 zone = page_zone(page); 1507 } 1508 } 1509 return 1; 1510 } 1511 1512 /* 1513 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages 1514 * and hugepages). We scan pfn because it's much easier than scanning over 1515 * linked list. This function returns the pfn of the first found movable 1516 * page if it's found, otherwise 0. 
1517 */ 1518 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1519 { 1520 unsigned long pfn; 1521 struct page *page; 1522 for (pfn = start; pfn < end; pfn++) { 1523 if (pfn_valid(pfn)) { 1524 page = pfn_to_page(pfn); 1525 if (PageLRU(page)) 1526 return pfn; 1527 if (PageHuge(page)) { 1528 if (page_huge_active(page)) 1529 return pfn; 1530 else 1531 pfn = round_up(pfn + 1, 1532 1 << compound_order(page)) - 1; 1533 } 1534 } 1535 } 1536 return 0; 1537 } 1538 1539 static struct page *new_node_page(struct page *page, unsigned long private, 1540 int **result) 1541 { 1542 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 1543 int nid = page_to_nid(page); 1544 nodemask_t nmask = node_states[N_MEMORY]; 1545 struct page *new_page = NULL; 1546 1547 /* 1548 * TODO: allocate a destination hugepage from a nearest neighbor node, 1549 * accordance with memory policy of the user process if possible. For 1550 * now as a simple work-around, we use the next node for destination. 1551 */ 1552 if (PageHuge(page)) 1553 return alloc_huge_page_node(page_hstate(compound_head(page)), 1554 next_node_in(nid, nmask)); 1555 1556 node_clear(nid, nmask); 1557 1558 if (PageHighMem(page) 1559 || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) 1560 gfp_mask |= __GFP_HIGHMEM; 1561 1562 if (!nodes_empty(nmask)) 1563 new_page = __alloc_pages_nodemask(gfp_mask, 0, 1564 node_zonelist(nid, gfp_mask), &nmask); 1565 if (!new_page) 1566 new_page = __alloc_pages(gfp_mask, 0, 1567 node_zonelist(nid, gfp_mask)); 1568 1569 return new_page; 1570 } 1571 1572 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1573 static int 1574 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1575 { 1576 unsigned long pfn; 1577 struct page *page; 1578 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1579 int not_managed = 0; 1580 int ret = 0; 1581 LIST_HEAD(source); 1582 1583 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1584 if (!pfn_valid(pfn)) 1585 continue; 1586 page = pfn_to_page(pfn); 1587 
1588 if (PageHuge(page)) { 1589 struct page *head = compound_head(page); 1590 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1591 if (compound_order(head) > PFN_SECTION_SHIFT) { 1592 ret = -EBUSY; 1593 break; 1594 } 1595 if (isolate_huge_page(page, &source)) 1596 move_pages -= 1 << compound_order(head); 1597 continue; 1598 } 1599 1600 if (!get_page_unless_zero(page)) 1601 continue; 1602 /* 1603 * We can skip free pages. And we can only deal with pages on 1604 * LRU. 1605 */ 1606 ret = isolate_lru_page(page); 1607 if (!ret) { /* Success */ 1608 put_page(page); 1609 list_add_tail(&page->lru, &source); 1610 move_pages--; 1611 inc_node_page_state(page, NR_ISOLATED_ANON + 1612 page_is_file_cache(page)); 1613 1614 } else { 1615 #ifdef CONFIG_DEBUG_VM 1616 pr_alert("removing pfn %lx from LRU failed\n", pfn); 1617 dump_page(page, "failed to remove from LRU"); 1618 #endif 1619 put_page(page); 1620 /* Because we don't have big zone->lock. we should 1621 check this again here. */ 1622 if (page_count(page)) { 1623 not_managed++; 1624 ret = -EBUSY; 1625 break; 1626 } 1627 } 1628 } 1629 if (!list_empty(&source)) { 1630 if (not_managed) { 1631 putback_movable_pages(&source); 1632 goto out; 1633 } 1634 1635 /* Allocate a new page from the nearest neighbor node */ 1636 ret = migrate_pages(&source, new_node_page, NULL, 0, 1637 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1638 if (ret) 1639 putback_movable_pages(&source); 1640 } 1641 out: 1642 return ret; 1643 } 1644 1645 /* 1646 * remove from free_area[] and mark all as Reserved. 
1647 */ 1648 static int 1649 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1650 void *data) 1651 { 1652 __offline_isolated_pages(start, start + nr_pages); 1653 return 0; 1654 } 1655 1656 static void 1657 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1658 { 1659 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1660 offline_isolated_pages_cb); 1661 } 1662 1663 /* 1664 * Check all pages in range, recoreded as memory resource, are isolated. 1665 */ 1666 static int 1667 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1668 void *data) 1669 { 1670 int ret; 1671 long offlined = *(long *)data; 1672 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1673 offlined = nr_pages; 1674 if (!ret) 1675 *(long *)data += offlined; 1676 return ret; 1677 } 1678 1679 static long 1680 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1681 { 1682 long offlined = 0; 1683 int ret; 1684 1685 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1686 check_pages_isolated_cb); 1687 if (ret < 0) 1688 offlined = (long)ret; 1689 return offlined; 1690 } 1691 1692 #ifdef CONFIG_MOVABLE_NODE 1693 /* 1694 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1695 * normal memory. 
*/
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/*
 * ensure the node has NORMAL memory if it is still online
 *
 * Returns true when zones 0..ZONE_NORMAL of the node hold more than
 * @nr_pages present pages, or when the zones above ZONE_NORMAL (up to
 * ZONE_MOVABLE) hold no present pages at all.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	/* zt continues at the first zone above ZONE_NORMAL */
	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/*
 * Handler for the "movable_node" early parameter. With
 * CONFIG_MOVABLE_NODE it switches memblock to bottom-up allocation and
 * sets movable_node_enabled; otherwise it only warns. Always returns 0.
 */
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
	/*
	 * Memory used by the kernel cannot be hot-removed because Linux
	 * cannot migrate the kernel pages. When memory hotplug is
	 * enabled, we should prevent memblock from allocating memory
	 * for the kernel.
	 *
	 * ACPI SRAT records all hotpluggable memory ranges. But before
	 * SRAT is parsed, we don't know about it.
	 *
	 * The kernel image is loaded into memory at very early time. We
	 * cannot prevent this anyway. So on NUMA system, we set any
	 * node the kernel resides in as un-hotpluggable.
	 *
	 * Since on modern servers, one node could have double-digit
	 * gigabytes memory, we can assume the memory around the kernel
	 * image is also un-hotpluggable. So before SRAT is parsed, just
	 * allocate memory near the kernel image to try the best to keep
	 * the kernel away from hotpluggable memory.
	 */
	memblock_set_bottom_up(true);
	movable_node_enabled = true;
#else
	pr_warn("movable_node option not supported\n");
#endif
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/*
 * check which state of node_states will be changed when offline memory
 *
 * Fills arg->status_change_nid_normal, arg->status_change_nid_high and
 * arg->status_change_nid with the node id when offlining @nr_pages from
 * @zone would empty the corresponding group of zones, and with -1
 * otherwise. present_pages and zt are carried from one check to the next,
 * so each check covers all zones from 0 up to its zone_last.
 */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offline, thus we can determine we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_MEMORY] will be changed
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine we will need to clear the node from
	 * node_states[N_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

/*
 * Clear the node states of @node that @arg marks as changing.
 *
 * N_HIGH_MEMORY and N_MEMORY are skipped when they are the same enum value
 * as the state handled just before them.
 */
static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

/*
 * __offline_pages - offline the pages in [@start_pfn, @end_pfn)
 * @start_pfn: first pfn, must be pageblock aligned
 * @end_pfn:   pfn one past the end, must be pageblock aligned
 * @timeout:   maximum time in jiffies to keep retrying migration
 *
 * The range is isolated, MEM_GOING_OFFLINE is sent, and movable pages are
 * migrated away in a retry loop bounded by @timeout, pending signals and
 * retry_max failed migration attempts. Once everything is isolated the
 * pages are taken offline, the zone/node accounting and node states are
 * updated and MEM_OFFLINE is sent.
 *
 * Returns 0 on success. On failure returns -EINVAL (alignment, zone or
 * can_offline_normal() checks), -EAGAIN (timeout), -EINTR (signal), -EBUSY
 * (pages still not isolated), or the error from a helper; after isolation
 * has started, failures also send MEM_CANCEL_OFFLINE and undo the
 * isolation.
 */
static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. .*/
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		return -EINVAL;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	/* ret is preset before each check so failed_removal returns it */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages(zone);
	}

	/* always rescan from start_pfn; 0 means no movable page was found */
	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			/* only negative results consume the retry budget */
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages(zone);
	/*
	 * dissolve free hugepages in the memory block before doing offlining
	 * actually in order to make hugetlbfs's object counting consistent.
	 */
	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
	if (ret)
		goto failed_removal;
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	pr_info("Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	/* a zone left without pages is dropped from the zonelists */
	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	/* the node lost its last memory: stop its per-node daemons */
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	return ret;
}

/*
 * Offline @nr_pages pages starting at @start_pfn, retrying for at most
 * 120 seconds. Returns the result of __offline_pages().
 *
 * Must be protected by mem_hotplug_begin()
 */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in range
 * [start_pfn, end_pfn) and calls func once for every memory block that
 * contains at least one of them.
 *
 * Returns the first non-zero value returned by func (the walk stops
 * there), or 0 if func never returned non-zero.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		/* the previous block is passed along as the lookup hint */
		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	/* drop the reference still held on the last block found */
	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * walk_memory_range() callback: returns non-zero, after logging the
 * physical range of the block, when @mem is not offlined.
 */
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

/* Returns -EBUSY if any present cpu still maps to @pgdat's node, else 0. */
static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

/*
 * Clear the cpu-to-node mapping of every possible cpu that maps to
 * @pgdat's node. Does nothing without CONFIG_ACPI_NUMA.
 */
static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

/*
 * Returns the error from check_cpu_on_node() if the node still has a
 * present cpu; otherwise unmaps the node's cpus and returns 0.
 */
static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
2110 * 2111 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2112 * and online/offline operations before this call. 2113 */ 2114 void try_offline_node(int nid) 2115 { 2116 pg_data_t *pgdat = NODE_DATA(nid); 2117 unsigned long start_pfn = pgdat->node_start_pfn; 2118 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 2119 unsigned long pfn; 2120 2121 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2122 unsigned long section_nr = pfn_to_section_nr(pfn); 2123 2124 if (!present_section_nr(section_nr)) 2125 continue; 2126 2127 if (pfn_to_nid(pfn) != nid) 2128 continue; 2129 2130 /* 2131 * some memory sections of this node are not removed, and we 2132 * can't offline node now. 2133 */ 2134 return; 2135 } 2136 2137 if (check_and_unmap_cpu_on_node(pgdat)) 2138 return; 2139 2140 /* 2141 * all memory/cpu of this node are removed, we can offline this 2142 * node now. 2143 */ 2144 node_set_offline(nid); 2145 unregister_one_node(nid); 2146 } 2147 EXPORT_SYMBOL(try_offline_node); 2148 2149 /** 2150 * remove_memory 2151 * 2152 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2153 * and online/offline operations before this call, as required by 2154 * try_offline_node(). 2155 */ 2156 void __ref remove_memory(int nid, u64 start, u64 size) 2157 { 2158 int ret; 2159 2160 BUG_ON(check_hotplug_memory_range(start, size)); 2161 2162 mem_hotplug_begin(); 2163 2164 /* 2165 * All memory blocks must be offlined before removing memory. Check 2166 * whether all memory blocks in question are offline and trigger a BUG() 2167 * if this is not the case. 
2168 */ 2169 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 2170 check_memblock_offlined_cb); 2171 if (ret) 2172 BUG(); 2173 2174 /* remove memmap entry */ 2175 firmware_map_remove(start, start + size, "System RAM"); 2176 memblock_free(start, size); 2177 memblock_remove(start, size); 2178 2179 arch_remove_memory(start, size); 2180 2181 try_offline_node(nid); 2182 2183 mem_hotplug_done(); 2184 } 2185 EXPORT_SYMBOL_GPL(remove_memory); 2186 #endif /* CONFIG_MEMORY_HOTREMOVE */ 2187