1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/compiler.h> 13 #include <linux/export.h> 14 #include <linux/pagevec.h> 15 #include <linux/writeback.h> 16 #include <linux/slab.h> 17 #include <linux/sysctl.h> 18 #include <linux/cpu.h> 19 #include <linux/memory.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/highmem.h> 22 #include <linux/vmalloc.h> 23 #include <linux/ioport.h> 24 #include <linux/delay.h> 25 #include <linux/migrate.h> 26 #include <linux/page-isolation.h> 27 #include <linux/pfn.h> 28 #include <linux/suspend.h> 29 #include <linux/mm_inline.h> 30 #include <linux/firmware-map.h> 31 #include <linux/stop_machine.h> 32 #include <linux/hugetlb.h> 33 #include <linux/memblock.h> 34 #include <linux/bootmem.h> 35 36 #include <asm/tlbflush.h> 37 38 #include "internal.h" 39 40 /* 41 * online_page_callback contains pointer to current page onlining function. 42 * Initially it is generic_online_page(). If it is required it could be 43 * changed by calling set_online_page_callback() for callback registration 44 * and restore_online_page_callback() for generic callback restore. 45 */ 46 47 static void generic_online_page(struct page *page); 48 49 static online_page_callback_t online_page_callback = generic_online_page; 50 static DEFINE_MUTEX(online_page_callback_lock); 51 52 /* The same as the cpu_hotplug lock, but for memory hotplug. */ 53 static struct { 54 struct task_struct *active_writer; 55 struct mutex lock; /* Synchronizes accesses to refcount, */ 56 /* 57 * Also blocks the new readers during 58 * an ongoing mem hotplug operation. 59 */ 60 int refcount; 61 62 #ifdef CONFIG_DEBUG_LOCK_ALLOC 63 struct lockdep_map dep_map; 64 #endif 65 } mem_hotplug = { 66 .active_writer = NULL, 67 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), 68 .refcount = 0, 69 #ifdef CONFIG_DEBUG_LOCK_ALLOC 70 .dep_map = {.name = "mem_hotplug.lock" }, 71 #endif 72 }; 73 74 /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ 75 #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) 76 #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 77 #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 78 79 void get_online_mems(void) 80 { 81 might_sleep(); 82 if (mem_hotplug.active_writer == current) 83 return; 84 memhp_lock_acquire_read(); 85 mutex_lock(&mem_hotplug.lock); 86 mem_hotplug.refcount++; 87 mutex_unlock(&mem_hotplug.lock); 88 89 } 90 91 void put_online_mems(void) 92 { 93 if (mem_hotplug.active_writer == current) 94 return; 95 mutex_lock(&mem_hotplug.lock); 96 97 if (WARN_ON(!mem_hotplug.refcount)) 98 mem_hotplug.refcount++; /* try to fix things up */ 99 100 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) 101 wake_up_process(mem_hotplug.active_writer); 102 mutex_unlock(&mem_hotplug.lock); 103 memhp_lock_release(); 104 105 } 106 107 void mem_hotplug_begin(void) 108 { 109 mem_hotplug.active_writer = current; 110 111 memhp_lock_acquire(); 112 for (;;) { 113 mutex_lock(&mem_hotplug.lock); 114 if (likely(!mem_hotplug.refcount)) 115 break; 116 __set_current_state(TASK_UNINTERRUPTIBLE); 117 mutex_unlock(&mem_hotplug.lock); 118 schedule(); 119 } 120 } 121 122 void mem_hotplug_done(void) 123 { 124 mem_hotplug.active_writer = NULL; 125 mutex_unlock(&mem_hotplug.lock); 126 memhp_lock_release(); 127 } 128 129 /* add this memory to iomem resource */ 130 static struct resource *register_memory_resource(u64 start, u64 size) 131 { 132 struct resource *res; 133 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 134 BUG_ON(!res); 135 136 res->name = "System RAM"; 137 res->start = start; 138 res->end = start + size - 1; 139 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 140 if (request_resource(&iomem_resource, res) < 0) { 141 pr_debug("System RAM resource %pR cannot be added\n", res); 142 kfree(res); 143 res = NULL; 144 } 145 return res; 146 } 147 148 static void release_memory_resource(struct resource *res) 149 { 150 if (!res) 151 return; 152 release_resource(res); 153 kfree(res); 154 return; 155 } 156 157 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 158 void get_page_bootmem(unsigned long info, struct page *page, 159 unsigned long type) 160 { 161 page->lru.next = (struct list_head *) type; 162 SetPagePrivate(page); 163 set_page_private(page, info); 164 atomic_inc(&page->_count); 165 } 166 167 void put_page_bootmem(struct page *page) 168 { 169 unsigned long type; 170 171 type = (unsigned long) page->lru.next; 172 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 173 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 174 175 if (atomic_dec_return(&page->_count) == 1) { 176 ClearPagePrivate(page); 177 set_page_private(page, 0); 178 INIT_LIST_HEAD(&page->lru); 179 free_reserved_page(page); 180 } 181 } 182 183 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 184 #ifndef CONFIG_SPARSEMEM_VMEMMAP 185 static void register_page_bootmem_info_section(unsigned long start_pfn) 186 { 187 unsigned long *usemap, mapsize, section_nr, i; 188 struct mem_section *ms; 189 struct page *page, *memmap; 190 191 section_nr = pfn_to_section_nr(start_pfn); 192 ms = __nr_to_section(section_nr); 193 194 /* Get section's memmap address */ 195 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 196 197 /* 198 * Get page for the memmap's phys address 199 * XXX: need more consideration for sparse_vmemmap... 200 */ 201 page = virt_to_page(memmap); 202 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 203 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 204 205 /* remember memmap's page */ 206 for (i = 0; i < mapsize; i++, page++) 207 get_page_bootmem(section_nr, page, SECTION_INFO); 208 209 usemap = __nr_to_section(section_nr)->pageblock_flags; 210 page = virt_to_page(usemap); 211 212 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 213 214 for (i = 0; i < mapsize; i++, page++) 215 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 216 217 } 218 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 219 static void register_page_bootmem_info_section(unsigned long start_pfn) 220 { 221 unsigned long *usemap, mapsize, section_nr, i; 222 struct mem_section *ms; 223 struct page *page, *memmap; 224 225 if (!pfn_valid(start_pfn)) 226 return; 227 228 section_nr = pfn_to_section_nr(start_pfn); 229 ms = __nr_to_section(section_nr); 230 231 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 232 233 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 234 235 usemap = __nr_to_section(section_nr)->pageblock_flags; 236 page = virt_to_page(usemap); 237 238 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 239 240 for (i = 0; i < mapsize; i++, page++) 241 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 242 } 243 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 244 245 void register_page_bootmem_info_node(struct pglist_data *pgdat) 246 { 247 unsigned long i, pfn, end_pfn, nr_pages; 248 int node = pgdat->node_id; 249 struct page *page; 250 struct zone *zone; 251 252 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 253 page = virt_to_page(pgdat); 254 255 for (i = 0; i < nr_pages; i++, page++) 256 get_page_bootmem(node, page, NODE_INFO); 257 258 zone = &pgdat->node_zones[0]; 259 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 260 if (zone_is_initialized(zone)) { 261 nr_pages = zone->wait_table_hash_nr_entries 262 * sizeof(wait_queue_head_t); 263 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 264 page = virt_to_page(zone->wait_table); 265 266 for (i = 0; i < nr_pages; i++, page++) 267 get_page_bootmem(node, page, NODE_INFO); 268 } 269 } 270 271 pfn = pgdat->node_start_pfn; 272 end_pfn = pgdat_end_pfn(pgdat); 273 274 /* register section info */ 275 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 276 /* 277 * Some platforms can assign the same pfn to multiple nodes - on 278 * node0 as well as nodeN. To avoid registering a pfn against 279 * multiple nodes we check that this pfn does not already 280 * reside in some other nodes. 281 */ 282 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 283 register_page_bootmem_info_section(pfn); 284 } 285 } 286 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 287 288 static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, 289 unsigned long end_pfn) 290 { 291 unsigned long old_zone_end_pfn; 292 293 zone_span_writelock(zone); 294 295 old_zone_end_pfn = zone_end_pfn(zone); 296 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 297 zone->zone_start_pfn = start_pfn; 298 299 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 300 zone->zone_start_pfn; 301 302 zone_span_writeunlock(zone); 303 } 304 305 static void resize_zone(struct zone *zone, unsigned long start_pfn, 306 unsigned long end_pfn) 307 { 308 zone_span_writelock(zone); 309 310 if (end_pfn - start_pfn) { 311 zone->zone_start_pfn = start_pfn; 312 zone->spanned_pages = end_pfn - start_pfn; 313 } else { 314 /* 315 * make it consist as free_area_init_core(), 316 * if spanned_pages = 0, then keep start_pfn = 0 317 */ 318 zone->zone_start_pfn = 0; 319 zone->spanned_pages = 0; 320 } 321 322 zone_span_writeunlock(zone); 323 } 324 325 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 326 unsigned long end_pfn) 327 { 328 enum zone_type zid = zone_idx(zone); 329 int nid = zone->zone_pgdat->node_id; 330 unsigned long pfn; 331 332 for (pfn = start_pfn; pfn < end_pfn; pfn++) 333 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 334 } 335 336 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 337 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 338 static int __ref ensure_zone_is_initialized(struct zone *zone, 339 unsigned long start_pfn, unsigned long num_pages) 340 { 341 if (!zone_is_initialized(zone)) 342 return init_currently_empty_zone(zone, start_pfn, num_pages, 343 MEMMAP_HOTPLUG); 344 return 0; 345 } 346 347 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 348 unsigned long start_pfn, unsigned long end_pfn) 349 { 350 int ret; 351 unsigned long flags; 352 unsigned long z1_start_pfn; 353 354 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 355 if (ret) 356 return ret; 357 358 pgdat_resize_lock(z1->zone_pgdat, &flags); 359 360 /* can't move pfns which are higher than @z2 */ 361 if (end_pfn > zone_end_pfn(z2)) 362 goto out_fail; 363 /* the move out part must be at the left most of @z2 */ 364 if (start_pfn > z2->zone_start_pfn) 365 goto out_fail; 366 /* must included/overlap */ 367 if (end_pfn <= z2->zone_start_pfn) 368 goto out_fail; 369 370 /* use start_pfn for z1's start_pfn if z1 is empty */ 371 if (!zone_is_empty(z1)) 372 z1_start_pfn = z1->zone_start_pfn; 373 else 374 z1_start_pfn = start_pfn; 375 376 resize_zone(z1, z1_start_pfn, end_pfn); 377 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 378 379 pgdat_resize_unlock(z1->zone_pgdat, &flags); 380 381 fix_zone_id(z1, start_pfn, end_pfn); 382 383 return 0; 384 out_fail: 385 pgdat_resize_unlock(z1->zone_pgdat, &flags); 386 return -1; 387 } 388 389 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 390 unsigned long start_pfn, unsigned long end_pfn) 391 { 392 int ret; 393 unsigned long flags; 394 unsigned long z2_end_pfn; 395 396 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 397 if (ret) 398 return ret; 399 400 pgdat_resize_lock(z1->zone_pgdat, &flags); 401 402 /* can't move pfns which are lower than @z1 */ 403 if (z1->zone_start_pfn > start_pfn) 404 goto out_fail; 405 /* the move out part mast at the right most of @z1 */ 406 if (zone_end_pfn(z1) > end_pfn) 407 goto out_fail; 408 /* must included/overlap */ 409 if (start_pfn >= zone_end_pfn(z1)) 410 goto out_fail; 411 412 /* use end_pfn for z2's end_pfn if z2 is empty */ 413 if (!zone_is_empty(z2)) 414 z2_end_pfn = zone_end_pfn(z2); 415 else 416 z2_end_pfn = end_pfn; 417 418 resize_zone(z1, z1->zone_start_pfn, start_pfn); 419 resize_zone(z2, start_pfn, z2_end_pfn); 420 421 pgdat_resize_unlock(z1->zone_pgdat, &flags); 422 423 fix_zone_id(z2, start_pfn, end_pfn); 424 425 return 0; 426 out_fail: 427 pgdat_resize_unlock(z1->zone_pgdat, &flags); 428 return -1; 429 } 430 431 static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 432 unsigned long end_pfn) 433 { 434 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 435 436 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 437 pgdat->node_start_pfn = start_pfn; 438 439 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 440 pgdat->node_start_pfn; 441 } 442 443 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 444 { 445 struct pglist_data *pgdat = zone->zone_pgdat; 446 int nr_pages = PAGES_PER_SECTION; 447 int nid = pgdat->node_id; 448 int zone_type; 449 unsigned long flags; 450 int ret; 451 452 zone_type = zone - pgdat->node_zones; 453 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 454 if (ret) 455 return ret; 456 457 pgdat_resize_lock(zone->zone_pgdat, &flags); 458 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 459 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 460 phys_start_pfn + nr_pages); 461 pgdat_resize_unlock(zone->zone_pgdat, &flags); 462 memmap_init_zone(nr_pages, nid, zone_type, 463 phys_start_pfn, MEMMAP_HOTPLUG); 464 return 0; 465 } 466 467 static int __meminit __add_section(int nid, struct zone *zone, 468 unsigned long phys_start_pfn) 469 { 470 int ret; 471 472 if (pfn_valid(phys_start_pfn)) 473 return -EEXIST; 474 475 ret = sparse_add_one_section(zone, phys_start_pfn); 476 477 if (ret < 0) 478 return ret; 479 480 ret = __add_zone(zone, phys_start_pfn); 481 482 if (ret < 0) 483 return ret; 484 485 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 486 } 487 488 /* 489 * Reasonably generic function for adding memory. It is 490 * expected that archs that support memory hotplug will 491 * call this function after deciding the zone to which to 492 * add the new pages. 493 */ 494 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 495 unsigned long nr_pages) 496 { 497 unsigned long i; 498 int err = 0; 499 int start_sec, end_sec; 500 /* during initialize mem_map, align hot-added range to section */ 501 start_sec = pfn_to_section_nr(phys_start_pfn); 502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 503 504 for (i = start_sec; i <= end_sec; i++) { 505 err = __add_section(nid, zone, section_nr_to_pfn(i)); 506 507 /* 508 * EEXIST is finally dealt with by ioresource collision 509 * check. see add_memory() => register_memory_resource() 510 * Warning will be printed if there is collision. 511 */ 512 if (err && (err != -EEXIST)) 513 break; 514 err = 0; 515 } 516 vmemmap_populate_print_last(); 517 518 return err; 519 } 520 EXPORT_SYMBOL_GPL(__add_pages); 521 522 #ifdef CONFIG_MEMORY_HOTREMOVE 523 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 524 static int find_smallest_section_pfn(int nid, struct zone *zone, 525 unsigned long start_pfn, 526 unsigned long end_pfn) 527 { 528 struct mem_section *ms; 529 530 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 531 ms = __pfn_to_section(start_pfn); 532 533 if (unlikely(!valid_section(ms))) 534 continue; 535 536 if (unlikely(pfn_to_nid(start_pfn) != nid)) 537 continue; 538 539 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 540 continue; 541 542 return start_pfn; 543 } 544 545 return 0; 546 } 547 548 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 549 static int find_biggest_section_pfn(int nid, struct zone *zone, 550 unsigned long start_pfn, 551 unsigned long end_pfn) 552 { 553 struct mem_section *ms; 554 unsigned long pfn; 555 556 /* pfn is the end pfn of a memory section. */ 557 pfn = end_pfn - 1; 558 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 559 ms = __pfn_to_section(pfn); 560 561 if (unlikely(!valid_section(ms))) 562 continue; 563 564 if (unlikely(pfn_to_nid(pfn) != nid)) 565 continue; 566 567 if (zone && zone != page_zone(pfn_to_page(pfn))) 568 continue; 569 570 return pfn; 571 } 572 573 return 0; 574 } 575 576 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 577 unsigned long end_pfn) 578 { 579 unsigned long zone_start_pfn = zone->zone_start_pfn; 580 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 581 unsigned long zone_end_pfn = z; 582 unsigned long pfn; 583 struct mem_section *ms; 584 int nid = zone_to_nid(zone); 585 586 zone_span_writelock(zone); 587 if (zone_start_pfn == start_pfn) { 588 /* 589 * If the section is smallest section in the zone, it need 590 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 591 * In this case, we find second smallest valid mem_section 592 * for shrinking zone. 593 */ 594 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 595 zone_end_pfn); 596 if (pfn) { 597 zone->zone_start_pfn = pfn; 598 zone->spanned_pages = zone_end_pfn - pfn; 599 } 600 } else if (zone_end_pfn == end_pfn) { 601 /* 602 * If the section is biggest section in the zone, it need 603 * shrink zone->spanned_pages. 604 * In this case, we find second biggest valid mem_section for 605 * shrinking zone. 606 */ 607 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 608 start_pfn); 609 if (pfn) 610 zone->spanned_pages = pfn - zone_start_pfn + 1; 611 } 612 613 /* 614 * The section is not biggest or smallest mem_section in the zone, it 615 * only creates a hole in the zone. So in this case, we need not 616 * change the zone. But perhaps, the zone has only hole data. Thus 617 * it check the zone has only hole or not. 618 */ 619 pfn = zone_start_pfn; 620 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 621 ms = __pfn_to_section(pfn); 622 623 if (unlikely(!valid_section(ms))) 624 continue; 625 626 if (page_zone(pfn_to_page(pfn)) != zone) 627 continue; 628 629 /* If the section is current section, it continues the loop */ 630 if (start_pfn == pfn) 631 continue; 632 633 /* If we find valid section, we have nothing to do */ 634 zone_span_writeunlock(zone); 635 return; 636 } 637 638 /* The zone has no valid section */ 639 zone->zone_start_pfn = 0; 640 zone->spanned_pages = 0; 641 zone_span_writeunlock(zone); 642 } 643 644 static void shrink_pgdat_span(struct pglist_data *pgdat, 645 unsigned long start_pfn, unsigned long end_pfn) 646 { 647 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 648 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 649 unsigned long pgdat_end_pfn = p; 650 unsigned long pfn; 651 struct mem_section *ms; 652 int nid = pgdat->node_id; 653 654 if (pgdat_start_pfn == start_pfn) { 655 /* 656 * If the section is smallest section in the pgdat, it need 657 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 658 * In this case, we find second smallest valid mem_section 659 * for shrinking zone. 660 */ 661 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 662 pgdat_end_pfn); 663 if (pfn) { 664 pgdat->node_start_pfn = pfn; 665 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 666 } 667 } else if (pgdat_end_pfn == end_pfn) { 668 /* 669 * If the section is biggest section in the pgdat, it need 670 * shrink pgdat->node_spanned_pages. 671 * In this case, we find second biggest valid mem_section for 672 * shrinking zone. 673 */ 674 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 675 start_pfn); 676 if (pfn) 677 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 678 } 679 680 /* 681 * If the section is not biggest or smallest mem_section in the pgdat, 682 * it only creates a hole in the pgdat. So in this case, we need not 683 * change the pgdat. 684 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 685 * has only hole or not. 686 */ 687 pfn = pgdat_start_pfn; 688 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 689 ms = __pfn_to_section(pfn); 690 691 if (unlikely(!valid_section(ms))) 692 continue; 693 694 if (pfn_to_nid(pfn) != nid) 695 continue; 696 697 /* If the section is current section, it continues the loop */ 698 if (start_pfn == pfn) 699 continue; 700 701 /* If we find valid section, we have nothing to do */ 702 return; 703 } 704 705 /* The pgdat has no valid section */ 706 pgdat->node_start_pfn = 0; 707 pgdat->node_spanned_pages = 0; 708 } 709 710 static void __remove_zone(struct zone *zone, unsigned long start_pfn) 711 { 712 struct pglist_data *pgdat = zone->zone_pgdat; 713 int nr_pages = PAGES_PER_SECTION; 714 int zone_type; 715 unsigned long flags; 716 717 zone_type = zone - pgdat->node_zones; 718 719 pgdat_resize_lock(zone->zone_pgdat, &flags); 720 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 721 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 722 pgdat_resize_unlock(zone->zone_pgdat, &flags); 723 } 724 725 static int __remove_section(struct zone *zone, struct mem_section *ms) 726 { 727 unsigned long start_pfn; 728 int scn_nr; 729 int ret = -EINVAL; 730 731 if (!valid_section(ms)) 732 return ret; 733 734 ret = unregister_memory_section(ms); 735 if (ret) 736 return ret; 737 738 scn_nr = __section_nr(ms); 739 start_pfn = section_nr_to_pfn(scn_nr); 740 __remove_zone(zone, start_pfn); 741 742 sparse_remove_one_section(zone, ms); 743 return 0; 744 } 745 746 /** 747 * __remove_pages() - remove sections of pages from a zone 748 * @zone: zone from which pages need to be removed 749 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 750 * @nr_pages: number of pages to remove (must be multiple of section size) 751 * 752 * Generic helper function to remove section mappings and sysfs entries 753 * for the section of the memory we are removing. Caller needs to make 754 * sure that pages are marked reserved and zones are adjust properly by 755 * calling offline_pages(). 756 */ 757 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 758 unsigned long nr_pages) 759 { 760 unsigned long i; 761 int sections_to_remove; 762 resource_size_t start, size; 763 int ret = 0; 764 765 /* 766 * We can only remove entire sections 767 */ 768 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 769 BUG_ON(nr_pages % PAGES_PER_SECTION); 770 771 start = phys_start_pfn << PAGE_SHIFT; 772 size = nr_pages * PAGE_SIZE; 773 ret = release_mem_region_adjustable(&iomem_resource, start, size); 774 if (ret) { 775 resource_size_t endres = start + size - 1; 776 777 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 778 &start, &endres, ret); 779 } 780 781 sections_to_remove = nr_pages / PAGES_PER_SECTION; 782 for (i = 0; i < sections_to_remove; i++) { 783 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 784 ret = __remove_section(zone, __pfn_to_section(pfn)); 785 if (ret) 786 break; 787 } 788 return ret; 789 } 790 EXPORT_SYMBOL_GPL(__remove_pages); 791 #endif /* CONFIG_MEMORY_HOTREMOVE */ 792 793 int set_online_page_callback(online_page_callback_t callback) 794 { 795 int rc = -EINVAL; 796 797 get_online_mems(); 798 mutex_lock(&online_page_callback_lock); 799 800 if (online_page_callback == generic_online_page) { 801 online_page_callback = callback; 802 rc = 0; 803 } 804 805 mutex_unlock(&online_page_callback_lock); 806 put_online_mems(); 807 808 return rc; 809 } 810 EXPORT_SYMBOL_GPL(set_online_page_callback); 811 812 int restore_online_page_callback(online_page_callback_t callback) 813 { 814 int rc = -EINVAL; 815 816 get_online_mems(); 817 mutex_lock(&online_page_callback_lock); 818 819 if (online_page_callback == callback) { 820 online_page_callback = generic_online_page; 821 rc = 0; 822 } 823 824 mutex_unlock(&online_page_callback_lock); 825 put_online_mems(); 826 827 return rc; 828 } 829 EXPORT_SYMBOL_GPL(restore_online_page_callback); 830 831 void __online_page_set_limits(struct page *page) 832 { 833 } 834 EXPORT_SYMBOL_GPL(__online_page_set_limits); 835 836 void __online_page_increment_counters(struct page *page) 837 { 838 adjust_managed_page_count(page, 1); 839 } 840 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 841 842 void __online_page_free(struct page *page) 843 { 844 __free_reserved_page(page); 845 } 846 EXPORT_SYMBOL_GPL(__online_page_free); 847 848 static void generic_online_page(struct page *page) 849 { 850 __online_page_set_limits(page); 851 __online_page_increment_counters(page); 852 __online_page_free(page); 853 } 854 855 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 856 void *arg) 857 { 858 unsigned long i; 859 unsigned long onlined_pages = *(unsigned long *)arg; 860 struct page *page; 861 if (PageReserved(pfn_to_page(start_pfn))) 862 for (i = 0; i < nr_pages; i++) { 863 page = pfn_to_page(start_pfn + i); 864 (*online_page_callback)(page); 865 onlined_pages++; 866 } 867 *(unsigned long *)arg = onlined_pages; 868 return 0; 869 } 870 871 #ifdef CONFIG_MOVABLE_NODE 872 /* 873 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 874 * normal memory. 875 */ 876 static bool can_online_high_movable(struct zone *zone) 877 { 878 return true; 879 } 880 #else /* CONFIG_MOVABLE_NODE */ 881 /* ensure every online node has NORMAL memory */ 882 static bool can_online_high_movable(struct zone *zone) 883 { 884 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 885 } 886 #endif /* CONFIG_MOVABLE_NODE */ 887 888 /* check which state of node_states will be changed when online memory */ 889 static void node_states_check_changes_online(unsigned long nr_pages, 890 struct zone *zone, struct memory_notify *arg) 891 { 892 int nid = zone_to_nid(zone); 893 enum zone_type zone_last = ZONE_NORMAL; 894 895 /* 896 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 897 * contains nodes which have zones of 0...ZONE_NORMAL, 898 * set zone_last to ZONE_NORMAL. 899 * 900 * If we don't have HIGHMEM nor movable node, 901 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 902 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 903 */ 904 if (N_MEMORY == N_NORMAL_MEMORY) 905 zone_last = ZONE_MOVABLE; 906 907 /* 908 * if the memory to be online is in a zone of 0...zone_last, and 909 * the zones of 0...zone_last don't have memory before online, we will 910 * need to set the node to node_states[N_NORMAL_MEMORY] after 911 * the memory is online. 912 */ 913 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 914 arg->status_change_nid_normal = nid; 915 else 916 arg->status_change_nid_normal = -1; 917 918 #ifdef CONFIG_HIGHMEM 919 /* 920 * If we have movable node, node_states[N_HIGH_MEMORY] 921 * contains nodes which have zones of 0...ZONE_HIGHMEM, 922 * set zone_last to ZONE_HIGHMEM. 923 * 924 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 925 * contains nodes which have zones of 0...ZONE_MOVABLE, 926 * set zone_last to ZONE_MOVABLE. 927 */ 928 zone_last = ZONE_HIGHMEM; 929 if (N_MEMORY == N_HIGH_MEMORY) 930 zone_last = ZONE_MOVABLE; 931 932 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 933 arg->status_change_nid_high = nid; 934 else 935 arg->status_change_nid_high = -1; 936 #else 937 arg->status_change_nid_high = arg->status_change_nid_normal; 938 #endif 939 940 /* 941 * if the node don't have memory befor online, we will need to 942 * set the node to node_states[N_MEMORY] after the memory 943 * is online. 944 */ 945 if (!node_state(nid, N_MEMORY)) 946 arg->status_change_nid = nid; 947 else 948 arg->status_change_nid = -1; 949 } 950 951 static void node_states_set_node(int node, struct memory_notify *arg) 952 { 953 if (arg->status_change_nid_normal >= 0) 954 node_set_state(node, N_NORMAL_MEMORY); 955 956 if (arg->status_change_nid_high >= 0) 957 node_set_state(node, N_HIGH_MEMORY); 958 959 node_set_state(node, N_MEMORY); 960 } 961 962 963 /* Must be protected by mem_hotplug_begin() */ 964 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 965 { 966 unsigned long flags; 967 unsigned long onlined_pages = 0; 968 struct zone *zone; 969 int need_zonelists_rebuild = 0; 970 int nid; 971 int ret; 972 struct memory_notify arg; 973 974 /* 975 * This doesn't need a lock to do pfn_to_page(). 976 * The section can't be removed here because of the 977 * memory_block->state_mutex. 978 */ 979 zone = page_zone(pfn_to_page(pfn)); 980 981 if ((zone_idx(zone) > ZONE_NORMAL || 982 online_type == MMOP_ONLINE_MOVABLE) && 983 !can_online_high_movable(zone)) 984 return -EINVAL; 985 986 if (online_type == MMOP_ONLINE_KERNEL && 987 zone_idx(zone) == ZONE_MOVABLE) { 988 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 989 return -EINVAL; 990 } 991 if (online_type == MMOP_ONLINE_MOVABLE && 992 zone_idx(zone) == ZONE_MOVABLE - 1) { 993 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 994 return -EINVAL; 995 } 996 997 /* Previous code may changed the zone of the pfn range */ 998 zone = page_zone(pfn_to_page(pfn)); 999 1000 arg.start_pfn = pfn; 1001 arg.nr_pages = nr_pages; 1002 node_states_check_changes_online(nr_pages, zone, &arg); 1003 1004 nid = pfn_to_nid(pfn); 1005 1006 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1007 ret = notifier_to_errno(ret); 1008 if (ret) { 1009 memory_notify(MEM_CANCEL_ONLINE, &arg); 1010 return ret; 1011 } 1012 /* 1013 * If this zone is not populated, then it is not in zonelist. 1014 * This means the page allocator ignores this zone. 1015 * So, zonelist must be updated after online. 1016 */ 1017 mutex_lock(&zonelists_mutex); 1018 if (!populated_zone(zone)) { 1019 need_zonelists_rebuild = 1; 1020 build_all_zonelists(NULL, zone); 1021 } 1022 1023 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 1024 online_pages_range); 1025 if (ret) { 1026 if (need_zonelists_rebuild) 1027 zone_pcp_reset(zone); 1028 mutex_unlock(&zonelists_mutex); 1029 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 1030 (unsigned long long) pfn << PAGE_SHIFT, 1031 (((unsigned long long) pfn + nr_pages) 1032 << PAGE_SHIFT) - 1); 1033 memory_notify(MEM_CANCEL_ONLINE, &arg); 1034 return ret; 1035 } 1036 1037 zone->present_pages += onlined_pages; 1038 1039 pgdat_resize_lock(zone->zone_pgdat, &flags); 1040 zone->zone_pgdat->node_present_pages += onlined_pages; 1041 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1042 1043 if (onlined_pages) { 1044 node_states_set_node(zone_to_nid(zone), &arg); 1045 if (need_zonelists_rebuild) 1046 build_all_zonelists(NULL, NULL); 1047 else 1048 zone_pcp_update(zone); 1049 } 1050 1051 mutex_unlock(&zonelists_mutex); 1052 1053 init_per_zone_wmark_min(); 1054 1055 if (onlined_pages) 1056 kswapd_run(zone_to_nid(zone)); 1057 1058 vm_total_pages = nr_free_pagecache_pages(); 1059 1060 writeback_set_ratelimit(); 1061 1062 if (onlined_pages) 1063 memory_notify(MEM_ONLINE, &arg); 1064 return 0; 1065 } 1066 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1067 1068 static void reset_node_present_pages(pg_data_t *pgdat) 1069 { 1070 struct zone *z; 1071 1072 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1073 z->present_pages = 0; 1074 1075 pgdat->node_present_pages = 0; 1076 } 1077 1078 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1079 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1080 { 1081 struct pglist_data *pgdat; 1082 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1083 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1084 unsigned long start_pfn = PFN_DOWN(start); 1085 1086 pgdat = NODE_DATA(nid); 1087 if (!pgdat) { 1088 pgdat = arch_alloc_nodedata(nid); 1089 if (!pgdat) 1090 return NULL; 1091 1092 arch_refresh_nodedata(nid, pgdat); 1093 } else { 1094 /* Reset the nr_zones and classzone_idx to 0 before reuse */ 1095 pgdat->nr_zones = 0; 1096 pgdat->classzone_idx = 0; 1097 } 1098 1099 /* we can use NODE_DATA(nid) from here */ 1100 1101 /* init node's zones as empty zones, we don't have any present pages.*/ 1102 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1103 1104 /* 1105 * The node we allocated has no zone fallback lists. For avoiding 1106 * to access not-initialized zonelist, build here. 1107 */ 1108 mutex_lock(&zonelists_mutex); 1109 build_all_zonelists(pgdat, NULL); 1110 mutex_unlock(&zonelists_mutex); 1111 1112 /* 1113 * zone->managed_pages is set to an approximate value in 1114 * free_area_init_core(), which will cause 1115 * /sys/device/system/node/nodeX/meminfo has wrong data. 1116 * So reset it to 0 before any memory is onlined. 1117 */ 1118 reset_node_managed_pages(pgdat); 1119 1120 /* 1121 * When memory is hot-added, all the memory is in offline state. So 1122 * clear all zones' present_pages because they will be updated in 1123 * online_pages() and offline_pages(). 1124 */ 1125 reset_node_present_pages(pgdat); 1126 1127 return pgdat; 1128 } 1129 1130 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1131 { 1132 arch_refresh_nodedata(nid, NULL); 1133 arch_free_nodedata(pgdat); 1134 return; 1135 } 1136 1137 1138 /** 1139 * try_online_node - online a node if offlined 1140 * 1141 * called by cpu_up() to online a node without onlined memory. 1142 */ 1143 int try_online_node(int nid) 1144 { 1145 pg_data_t *pgdat; 1146 int ret; 1147 1148 if (node_online(nid)) 1149 return 0; 1150 1151 mem_hotplug_begin(); 1152 pgdat = hotadd_new_pgdat(nid, 0); 1153 if (!pgdat) { 1154 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1155 ret = -ENOMEM; 1156 goto out; 1157 } 1158 node_set_online(nid); 1159 ret = register_one_node(nid); 1160 BUG_ON(ret); 1161 1162 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1163 mutex_lock(&zonelists_mutex); 1164 build_all_zonelists(NULL, NULL); 1165 mutex_unlock(&zonelists_mutex); 1166 } 1167 1168 out: 1169 mem_hotplug_done(); 1170 return ret; 1171 } 1172 1173 static int check_hotplug_memory_range(u64 start, u64 size) 1174 { 1175 u64 start_pfn = PFN_DOWN(start); 1176 u64 nr_pages = size >> PAGE_SHIFT; 1177 1178 /* Memory range must be aligned with section */ 1179 if ((start_pfn & ~PAGE_SECTION_MASK) || 1180 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1181 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1182 (unsigned long long)start, 1183 (unsigned long long)size); 1184 return -EINVAL; 1185 } 1186 1187 return 0; 1188 } 1189 1190 /* 1191 * If movable zone has already been setup, newly added memory should be check. 1192 * If its address is higher than movable zone, it should be added as movable. 1193 * Without this check, movable zone may overlap with other zone. 1194 */ 1195 static int should_add_memory_movable(int nid, u64 start, u64 size) 1196 { 1197 unsigned long start_pfn = start >> PAGE_SHIFT; 1198 pg_data_t *pgdat = NODE_DATA(nid); 1199 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; 1200 1201 if (zone_is_empty(movable_zone)) 1202 return 0; 1203 1204 if (movable_zone->zone_start_pfn <= start_pfn) 1205 return 1; 1206 1207 return 0; 1208 } 1209 1210 int zone_for_memory(int nid, u64 start, u64 size, int zone_default) 1211 { 1212 if (should_add_memory_movable(nid, start, size)) 1213 return ZONE_MOVABLE; 1214 1215 return zone_default; 1216 } 1217 1218 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1219 int __ref add_memory(int nid, u64 start, u64 size) 1220 { 1221 pg_data_t *pgdat = NULL; 1222 bool new_pgdat; 1223 bool new_node; 1224 struct resource *res; 1225 int ret; 1226 1227 ret = check_hotplug_memory_range(start, size); 1228 if (ret) 1229 return ret; 1230 1231 res = register_memory_resource(start, size); 1232 ret = -EEXIST; 1233 if (!res) 1234 return ret; 1235 1236 { /* Stupid hack to suppress address-never-null warning */ 1237 void *p = NODE_DATA(nid); 1238 new_pgdat = !p; 1239 } 1240 1241 mem_hotplug_begin(); 1242 1243 new_node = !node_online(nid); 1244 if (new_node) { 1245 pgdat = hotadd_new_pgdat(nid, start); 1246 ret = -ENOMEM; 1247 if (!pgdat) 1248 goto error; 1249 } 1250 1251 /* call arch's memory hotadd */ 1252 ret = arch_add_memory(nid, start, size); 1253 1254 if (ret < 0) 1255 goto error; 1256 1257 /* we online node here. we can't roll back from here. */ 1258 node_set_online(nid); 1259 1260 if (new_node) { 1261 ret = register_one_node(nid); 1262 /* 1263 * If sysfs file of new node can't create, cpu on the node 1264 * can't be hot-added. There is no rollback way now. 1265 * So, check by BUG_ON() to catch it reluctantly.. 1266 */ 1267 BUG_ON(ret); 1268 } 1269 1270 /* create new memmap entry */ 1271 firmware_map_add_hotplug(start, start + size, "System RAM"); 1272 1273 goto out; 1274 1275 error: 1276 /* rollback pgdat allocation and others */ 1277 if (new_pgdat) 1278 rollback_node_hotadd(nid, pgdat); 1279 release_memory_resource(res); 1280 1281 out: 1282 mem_hotplug_done(); 1283 return ret; 1284 } 1285 EXPORT_SYMBOL_GPL(add_memory); 1286 1287 #ifdef CONFIG_MEMORY_HOTREMOVE 1288 /* 1289 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1290 * set and the size of the free page is given by page_order(). Using this, 1291 * the function determines if the pageblock contains only free pages. 1292 * Due to buddy contraints, a free page at least the size of a pageblock will 1293 * be located at the start of the pageblock 1294 */ 1295 static inline int pageblock_free(struct page *page) 1296 { 1297 return PageBuddy(page) && page_order(page) >= pageblock_order; 1298 } 1299 1300 /* Return the start of the next active pageblock after a given page */ 1301 static struct page *next_active_pageblock(struct page *page) 1302 { 1303 /* Ensure the starting page is pageblock-aligned */ 1304 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1305 1306 /* If the entire pageblock is free, move to the end of free page */ 1307 if (pageblock_free(page)) { 1308 int order; 1309 /* be careful. we don't have locks, page_order can be changed.*/ 1310 order = page_order(page); 1311 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1312 return page + (1 << order); 1313 } 1314 1315 return page + pageblock_nr_pages; 1316 } 1317 1318 /* Checks if this range of memory is likely to be hot-removable. */ 1319 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1320 { 1321 struct page *page = pfn_to_page(start_pfn); 1322 struct page *end_page = page + nr_pages; 1323 1324 /* Check the starting page of each pageblock within the range */ 1325 for (; page < end_page; page = next_active_pageblock(page)) { 1326 if (!is_pageblock_removable_nolock(page)) 1327 return 0; 1328 cond_resched(); 1329 } 1330 1331 /* All pageblocks in the memory block are likely to be hot-removable */ 1332 return 1; 1333 } 1334 1335 /* 1336 * Confirm all pages in a range [start, end) is belongs to the same zone. 1337 */ 1338 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1339 { 1340 unsigned long pfn; 1341 struct zone *zone = NULL; 1342 struct page *page; 1343 int i; 1344 for (pfn = start_pfn; 1345 pfn < end_pfn; 1346 pfn += MAX_ORDER_NR_PAGES) { 1347 i = 0; 1348 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1349 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) 1350 i++; 1351 if (i == MAX_ORDER_NR_PAGES) 1352 continue; 1353 page = pfn_to_page(pfn + i); 1354 if (zone && page_zone(page) != zone) 1355 return 0; 1356 zone = page_zone(page); 1357 } 1358 return 1; 1359 } 1360 1361 /* 1362 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages 1363 * and hugepages). We scan pfn because it's much easier than scanning over 1364 * linked list. This function returns the pfn of the first found movable 1365 * page if it's found, otherwise 0. 1366 */ 1367 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1368 { 1369 unsigned long pfn; 1370 struct page *page; 1371 for (pfn = start; pfn < end; pfn++) { 1372 if (pfn_valid(pfn)) { 1373 page = pfn_to_page(pfn); 1374 if (PageLRU(page)) 1375 return pfn; 1376 if (PageHuge(page)) { 1377 if (page_huge_active(page)) 1378 return pfn; 1379 else 1380 pfn = round_up(pfn + 1, 1381 1 << compound_order(page)) - 1; 1382 } 1383 } 1384 } 1385 return 0; 1386 } 1387 1388 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1389 static int 1390 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1391 { 1392 unsigned long pfn; 1393 struct page *page; 1394 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1395 int not_managed = 0; 1396 int ret = 0; 1397 LIST_HEAD(source); 1398 1399 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1400 if (!pfn_valid(pfn)) 1401 continue; 1402 page = pfn_to_page(pfn); 1403 1404 if (PageHuge(page)) { 1405 struct page *head = compound_head(page); 1406 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1407 if (compound_order(head) > PFN_SECTION_SHIFT) { 1408 ret = -EBUSY; 1409 break; 1410 } 1411 if (isolate_huge_page(page, &source)) 1412 move_pages -= 1 << compound_order(head); 1413 continue; 1414 } 1415 1416 if (!get_page_unless_zero(page)) 1417 continue; 1418 /* 1419 * We can skip free pages. And we can only deal with pages on 1420 * LRU. 1421 */ 1422 ret = isolate_lru_page(page); 1423 if (!ret) { /* Success */ 1424 put_page(page); 1425 list_add_tail(&page->lru, &source); 1426 move_pages--; 1427 inc_zone_page_state(page, NR_ISOLATED_ANON + 1428 page_is_file_cache(page)); 1429 1430 } else { 1431 #ifdef CONFIG_DEBUG_VM 1432 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1433 pfn); 1434 dump_page(page, "failed to remove from LRU"); 1435 #endif 1436 put_page(page); 1437 /* Because we don't have big zone->lock. we should 1438 check this again here. */ 1439 if (page_count(page)) { 1440 not_managed++; 1441 ret = -EBUSY; 1442 break; 1443 } 1444 } 1445 } 1446 if (!list_empty(&source)) { 1447 if (not_managed) { 1448 putback_movable_pages(&source); 1449 goto out; 1450 } 1451 1452 /* 1453 * alloc_migrate_target should be improooooved!! 1454 * migrate_pages returns # of failed pages. 1455 */ 1456 ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, 1457 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1458 if (ret) 1459 putback_movable_pages(&source); 1460 } 1461 out: 1462 return ret; 1463 } 1464 1465 /* 1466 * remove from free_area[] and mark all as Reserved. 1467 */ 1468 static int 1469 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1470 void *data) 1471 { 1472 __offline_isolated_pages(start, start + nr_pages); 1473 return 0; 1474 } 1475 1476 static void 1477 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1478 { 1479 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1480 offline_isolated_pages_cb); 1481 } 1482 1483 /* 1484 * Check all pages in range, recoreded as memory resource, are isolated. 1485 */ 1486 static int 1487 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1488 void *data) 1489 { 1490 int ret; 1491 long offlined = *(long *)data; 1492 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1493 offlined = nr_pages; 1494 if (!ret) 1495 *(long *)data += offlined; 1496 return ret; 1497 } 1498 1499 static long 1500 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1501 { 1502 long offlined = 0; 1503 int ret; 1504 1505 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1506 check_pages_isolated_cb); 1507 if (ret < 0) 1508 offlined = (long)ret; 1509 return offlined; 1510 } 1511 1512 #ifdef CONFIG_MOVABLE_NODE 1513 /* 1514 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1515 * normal memory. 1516 */ 1517 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1518 { 1519 return true; 1520 } 1521 #else /* CONFIG_MOVABLE_NODE */ 1522 /* ensure the node has NORMAL memory if it is still online */ 1523 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1524 { 1525 struct pglist_data *pgdat = zone->zone_pgdat; 1526 unsigned long present_pages = 0; 1527 enum zone_type zt; 1528 1529 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1530 present_pages += pgdat->node_zones[zt].present_pages; 1531 1532 if (present_pages > nr_pages) 1533 return true; 1534 1535 present_pages = 0; 1536 for (; zt <= ZONE_MOVABLE; zt++) 1537 present_pages += pgdat->node_zones[zt].present_pages; 1538 1539 /* 1540 * we can't offline the last normal memory until all 1541 * higher memory is offlined. 1542 */ 1543 return present_pages == 0; 1544 } 1545 #endif /* CONFIG_MOVABLE_NODE */ 1546 1547 static int __init cmdline_parse_movable_node(char *p) 1548 { 1549 #ifdef CONFIG_MOVABLE_NODE 1550 /* 1551 * Memory used by the kernel cannot be hot-removed because Linux 1552 * cannot migrate the kernel pages. When memory hotplug is 1553 * enabled, we should prevent memblock from allocating memory 1554 * for the kernel. 1555 * 1556 * ACPI SRAT records all hotpluggable memory ranges. But before 1557 * SRAT is parsed, we don't know about it. 1558 * 1559 * The kernel image is loaded into memory at very early time. We 1560 * cannot prevent this anyway. So on NUMA system, we set any 1561 * node the kernel resides in as un-hotpluggable. 1562 * 1563 * Since on modern servers, one node could have double-digit 1564 * gigabytes memory, we can assume the memory around the kernel 1565 * image is also un-hotpluggable. So before SRAT is parsed, just 1566 * allocate memory near the kernel image to try the best to keep 1567 * the kernel away from hotpluggable memory. 1568 */ 1569 memblock_set_bottom_up(true); 1570 movable_node_enabled = true; 1571 #else 1572 pr_warn("movable_node option not supported\n"); 1573 #endif 1574 return 0; 1575 } 1576 early_param("movable_node", cmdline_parse_movable_node); 1577 1578 /* check which state of node_states will be changed when offline memory */ 1579 static void node_states_check_changes_offline(unsigned long nr_pages, 1580 struct zone *zone, struct memory_notify *arg) 1581 { 1582 struct pglist_data *pgdat = zone->zone_pgdat; 1583 unsigned long present_pages = 0; 1584 enum zone_type zt, zone_last = ZONE_NORMAL; 1585 1586 /* 1587 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1588 * contains nodes which have zones of 0...ZONE_NORMAL, 1589 * set zone_last to ZONE_NORMAL. 1590 * 1591 * If we don't have HIGHMEM nor movable node, 1592 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1593 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1594 */ 1595 if (N_MEMORY == N_NORMAL_MEMORY) 1596 zone_last = ZONE_MOVABLE; 1597 1598 /* 1599 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1600 * If the memory to be offline is in a zone of 0...zone_last, 1601 * and it is the last present memory, 0...zone_last will 1602 * become empty after offline , thus we can determind we will 1603 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1604 */ 1605 for (zt = 0; zt <= zone_last; zt++) 1606 present_pages += pgdat->node_zones[zt].present_pages; 1607 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1608 arg->status_change_nid_normal = zone_to_nid(zone); 1609 else 1610 arg->status_change_nid_normal = -1; 1611 1612 #ifdef CONFIG_HIGHMEM 1613 /* 1614 * If we have movable node, node_states[N_HIGH_MEMORY] 1615 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1616 * set zone_last to ZONE_HIGHMEM. 1617 * 1618 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1619 * contains nodes which have zones of 0...ZONE_MOVABLE, 1620 * set zone_last to ZONE_MOVABLE. 1621 */ 1622 zone_last = ZONE_HIGHMEM; 1623 if (N_MEMORY == N_HIGH_MEMORY) 1624 zone_last = ZONE_MOVABLE; 1625 1626 for (; zt <= zone_last; zt++) 1627 present_pages += pgdat->node_zones[zt].present_pages; 1628 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1629 arg->status_change_nid_high = zone_to_nid(zone); 1630 else 1631 arg->status_change_nid_high = -1; 1632 #else 1633 arg->status_change_nid_high = arg->status_change_nid_normal; 1634 #endif 1635 1636 /* 1637 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1638 */ 1639 zone_last = ZONE_MOVABLE; 1640 1641 /* 1642 * check whether node_states[N_HIGH_MEMORY] will be changed 1643 * If we try to offline the last present @nr_pages from the node, 1644 * we can determind we will need to clear the node from 1645 * node_states[N_HIGH_MEMORY]. 1646 */ 1647 for (; zt <= zone_last; zt++) 1648 present_pages += pgdat->node_zones[zt].present_pages; 1649 if (nr_pages >= present_pages) 1650 arg->status_change_nid = zone_to_nid(zone); 1651 else 1652 arg->status_change_nid = -1; 1653 } 1654 1655 static void node_states_clear_node(int node, struct memory_notify *arg) 1656 { 1657 if (arg->status_change_nid_normal >= 0) 1658 node_clear_state(node, N_NORMAL_MEMORY); 1659 1660 if ((N_MEMORY != N_NORMAL_MEMORY) && 1661 (arg->status_change_nid_high >= 0)) 1662 node_clear_state(node, N_HIGH_MEMORY); 1663 1664 if ((N_MEMORY != N_HIGH_MEMORY) && 1665 (arg->status_change_nid >= 0)) 1666 node_clear_state(node, N_MEMORY); 1667 } 1668 1669 static int __ref __offline_pages(unsigned long start_pfn, 1670 unsigned long end_pfn, unsigned long timeout) 1671 { 1672 unsigned long pfn, nr_pages, expire; 1673 long offlined_pages; 1674 int ret, drain, retry_max, node; 1675 unsigned long flags; 1676 struct zone *zone; 1677 struct memory_notify arg; 1678 1679 /* at least, alignment against pageblock is necessary */ 1680 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1681 return -EINVAL; 1682 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1683 return -EINVAL; 1684 /* This makes hotplug much easier...and readable. 1685 we assume this for now. .*/ 1686 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1687 return -EINVAL; 1688 1689 zone = page_zone(pfn_to_page(start_pfn)); 1690 node = zone_to_nid(zone); 1691 nr_pages = end_pfn - start_pfn; 1692 1693 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1694 return -EINVAL; 1695 1696 /* set above range as isolated */ 1697 ret = start_isolate_page_range(start_pfn, end_pfn, 1698 MIGRATE_MOVABLE, true); 1699 if (ret) 1700 return ret; 1701 1702 arg.start_pfn = start_pfn; 1703 arg.nr_pages = nr_pages; 1704 node_states_check_changes_offline(nr_pages, zone, &arg); 1705 1706 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1707 ret = notifier_to_errno(ret); 1708 if (ret) 1709 goto failed_removal; 1710 1711 pfn = start_pfn; 1712 expire = jiffies + timeout; 1713 drain = 0; 1714 retry_max = 5; 1715 repeat: 1716 /* start memory hot removal */ 1717 ret = -EAGAIN; 1718 if (time_after(jiffies, expire)) 1719 goto failed_removal; 1720 ret = -EINTR; 1721 if (signal_pending(current)) 1722 goto failed_removal; 1723 ret = 0; 1724 if (drain) { 1725 lru_add_drain_all(); 1726 cond_resched(); 1727 drain_all_pages(zone); 1728 } 1729 1730 pfn = scan_movable_pages(start_pfn, end_pfn); 1731 if (pfn) { /* We have movable pages */ 1732 ret = do_migrate_range(pfn, end_pfn); 1733 if (!ret) { 1734 drain = 1; 1735 goto repeat; 1736 } else { 1737 if (ret < 0) 1738 if (--retry_max == 0) 1739 goto failed_removal; 1740 yield(); 1741 drain = 1; 1742 goto repeat; 1743 } 1744 } 1745 /* drain all zone's lru pagevec, this is asynchronous... */ 1746 lru_add_drain_all(); 1747 yield(); 1748 /* drain pcp pages, this is synchronous. */ 1749 drain_all_pages(zone); 1750 /* 1751 * dissolve free hugepages in the memory block before doing offlining 1752 * actually in order to make hugetlbfs's object counting consistent. 1753 */ 1754 dissolve_free_huge_pages(start_pfn, end_pfn); 1755 /* check again */ 1756 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1757 if (offlined_pages < 0) { 1758 ret = -EBUSY; 1759 goto failed_removal; 1760 } 1761 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1762 /* Ok, all of our target is isolated. 1763 We cannot do rollback at this point. */ 1764 offline_isolated_pages(start_pfn, end_pfn); 1765 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1766 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1767 /* removal success */ 1768 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1769 zone->present_pages -= offlined_pages; 1770 1771 pgdat_resize_lock(zone->zone_pgdat, &flags); 1772 zone->zone_pgdat->node_present_pages -= offlined_pages; 1773 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1774 1775 init_per_zone_wmark_min(); 1776 1777 if (!populated_zone(zone)) { 1778 zone_pcp_reset(zone); 1779 mutex_lock(&zonelists_mutex); 1780 build_all_zonelists(NULL, NULL); 1781 mutex_unlock(&zonelists_mutex); 1782 } else 1783 zone_pcp_update(zone); 1784 1785 node_states_clear_node(node, &arg); 1786 if (arg.status_change_nid >= 0) 1787 kswapd_stop(node); 1788 1789 vm_total_pages = nr_free_pagecache_pages(); 1790 writeback_set_ratelimit(); 1791 1792 memory_notify(MEM_OFFLINE, &arg); 1793 return 0; 1794 1795 failed_removal: 1796 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", 1797 (unsigned long long) start_pfn << PAGE_SHIFT, 1798 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1799 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1800 /* pushback to free area */ 1801 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1802 return ret; 1803 } 1804 1805 /* Must be protected by mem_hotplug_begin() */ 1806 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1807 { 1808 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1809 } 1810 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1811 1812 /** 1813 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1814 * @start_pfn: start pfn of the memory range 1815 * @end_pfn: end pfn of the memory range 1816 * @arg: argument passed to func 1817 * @func: callback for each memory section walked 1818 * 1819 * This function walks through all present mem sections in range 1820 * [start_pfn, end_pfn) and call func on each mem section. 1821 * 1822 * Returns the return value of func. 1823 */ 1824 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1825 void *arg, int (*func)(struct memory_block *, void *)) 1826 { 1827 struct memory_block *mem = NULL; 1828 struct mem_section *section; 1829 unsigned long pfn, section_nr; 1830 int ret; 1831 1832 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1833 section_nr = pfn_to_section_nr(pfn); 1834 if (!present_section_nr(section_nr)) 1835 continue; 1836 1837 section = __nr_to_section(section_nr); 1838 /* same memblock? */ 1839 if (mem) 1840 if ((section_nr >= mem->start_section_nr) && 1841 (section_nr <= mem->end_section_nr)) 1842 continue; 1843 1844 mem = find_memory_block_hinted(section, mem); 1845 if (!mem) 1846 continue; 1847 1848 ret = func(mem, arg); 1849 if (ret) { 1850 kobject_put(&mem->dev.kobj); 1851 return ret; 1852 } 1853 } 1854 1855 if (mem) 1856 kobject_put(&mem->dev.kobj); 1857 1858 return 0; 1859 } 1860 1861 #ifdef CONFIG_MEMORY_HOTREMOVE 1862 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 1863 { 1864 int ret = !is_memblock_offlined(mem); 1865 1866 if (unlikely(ret)) { 1867 phys_addr_t beginpa, endpa; 1868 1869 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1870 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1871 pr_warn("removing memory fails, because memory " 1872 "[%pa-%pa] is onlined\n", 1873 &beginpa, &endpa); 1874 } 1875 1876 return ret; 1877 } 1878 1879 static int check_cpu_on_node(pg_data_t *pgdat) 1880 { 1881 int cpu; 1882 1883 for_each_present_cpu(cpu) { 1884 if (cpu_to_node(cpu) == pgdat->node_id) 1885 /* 1886 * the cpu on this node isn't removed, and we can't 1887 * offline this node. 1888 */ 1889 return -EBUSY; 1890 } 1891 1892 return 0; 1893 } 1894 1895 static void unmap_cpu_on_node(pg_data_t *pgdat) 1896 { 1897 #ifdef CONFIG_ACPI_NUMA 1898 int cpu; 1899 1900 for_each_possible_cpu(cpu) 1901 if (cpu_to_node(cpu) == pgdat->node_id) 1902 numa_clear_node(cpu); 1903 #endif 1904 } 1905 1906 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 1907 { 1908 int ret; 1909 1910 ret = check_cpu_on_node(pgdat); 1911 if (ret) 1912 return ret; 1913 1914 /* 1915 * the node will be offlined when we come here, so we can clear 1916 * the cpu_to_node() now. 1917 */ 1918 1919 unmap_cpu_on_node(pgdat); 1920 return 0; 1921 } 1922 1923 /** 1924 * try_offline_node 1925 * 1926 * Offline a node if all memory sections and cpus of the node are removed. 1927 * 1928 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1929 * and online/offline operations before this call. 1930 */ 1931 void try_offline_node(int nid) 1932 { 1933 pg_data_t *pgdat = NODE_DATA(nid); 1934 unsigned long start_pfn = pgdat->node_start_pfn; 1935 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 1936 unsigned long pfn; 1937 int i; 1938 1939 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1940 unsigned long section_nr = pfn_to_section_nr(pfn); 1941 1942 if (!present_section_nr(section_nr)) 1943 continue; 1944 1945 if (pfn_to_nid(pfn) != nid) 1946 continue; 1947 1948 /* 1949 * some memory sections of this node are not removed, and we 1950 * can't offline node now. 1951 */ 1952 return; 1953 } 1954 1955 if (check_and_unmap_cpu_on_node(pgdat)) 1956 return; 1957 1958 /* 1959 * all memory/cpu of this node are removed, we can offline this 1960 * node now. 1961 */ 1962 node_set_offline(nid); 1963 unregister_one_node(nid); 1964 1965 /* free waittable in each zone */ 1966 for (i = 0; i < MAX_NR_ZONES; i++) { 1967 struct zone *zone = pgdat->node_zones + i; 1968 1969 /* 1970 * wait_table may be allocated from boot memory, 1971 * here only free if it's allocated by vmalloc. 1972 */ 1973 if (is_vmalloc_addr(zone->wait_table)) { 1974 vfree(zone->wait_table); 1975 zone->wait_table = NULL; 1976 } 1977 } 1978 } 1979 EXPORT_SYMBOL(try_offline_node); 1980 1981 /** 1982 * remove_memory 1983 * 1984 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1985 * and online/offline operations before this call, as required by 1986 * try_offline_node(). 1987 */ 1988 void __ref remove_memory(int nid, u64 start, u64 size) 1989 { 1990 int ret; 1991 1992 BUG_ON(check_hotplug_memory_range(start, size)); 1993 1994 mem_hotplug_begin(); 1995 1996 /* 1997 * All memory blocks must be offlined before removing memory. Check 1998 * whether all memory blocks in question are offline and trigger a BUG() 1999 * if this is not the case. 2000 */ 2001 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 2002 check_memblock_offlined_cb); 2003 if (ret) 2004 BUG(); 2005 2006 /* remove memmap entry */ 2007 firmware_map_remove(start, start + size, "System RAM"); 2008 2009 arch_remove_memory(start, size); 2010 2011 try_offline_node(nid); 2012 2013 mem_hotplug_done(); 2014 } 2015 EXPORT_SYMBOL_GPL(remove_memory); 2016 #endif /* CONFIG_MEMORY_HOTREMOVE */ 2017