1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/bootmem.h> 13 #include <linux/compiler.h> 14 #include <linux/export.h> 15 #include <linux/pagevec.h> 16 #include <linux/writeback.h> 17 #include <linux/slab.h> 18 #include <linux/sysctl.h> 19 #include <linux/cpu.h> 20 #include <linux/memory.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/highmem.h> 23 #include <linux/vmalloc.h> 24 #include <linux/ioport.h> 25 #include <linux/delay.h> 26 #include <linux/migrate.h> 27 #include <linux/page-isolation.h> 28 #include <linux/pfn.h> 29 #include <linux/suspend.h> 30 #include <linux/mm_inline.h> 31 #include <linux/firmware-map.h> 32 33 #include <asm/tlbflush.h> 34 35 #include "internal.h" 36 37 /* 38 * online_page_callback contains pointer to current page onlining function. 39 * Initially it is generic_online_page(). If it is required it could be 40 * changed by calling set_online_page_callback() for callback registration 41 * and restore_online_page_callback() for generic callback restore. 42 */ 43 44 static void generic_online_page(struct page *page); 45 46 static online_page_callback_t online_page_callback = generic_online_page; 47 48 DEFINE_MUTEX(mem_hotplug_mutex); 49 50 void lock_memory_hotplug(void) 51 { 52 mutex_lock(&mem_hotplug_mutex); 53 54 /* for exclusive hibernation if CONFIG_HIBERNATION=y */ 55 lock_system_sleep(); 56 } 57 58 void unlock_memory_hotplug(void) 59 { 60 unlock_system_sleep(); 61 mutex_unlock(&mem_hotplug_mutex); 62 } 63 64 65 /* add this memory to iomem resource */ 66 static struct resource *register_memory_resource(u64 start, u64 size) 67 { 68 struct resource *res; 69 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 70 BUG_ON(!res); 71 72 res->name = "System RAM"; 73 res->start = start; 74 res->end = start + size - 1; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 76 if (request_resource(&iomem_resource, res) < 0) { 77 printk("System RAM resource %pR cannot be added\n", res); 78 kfree(res); 79 res = NULL; 80 } 81 return res; 82 } 83 84 static void release_memory_resource(struct resource *res) 85 { 86 if (!res) 87 return; 88 release_resource(res); 89 kfree(res); 90 return; 91 } 92 93 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 94 #ifndef CONFIG_SPARSEMEM_VMEMMAP 95 static void get_page_bootmem(unsigned long info, struct page *page, 96 unsigned long type) 97 { 98 page->lru.next = (struct list_head *) type; 99 SetPagePrivate(page); 100 set_page_private(page, info); 101 atomic_inc(&page->_count); 102 } 103 104 /* reference to __meminit __free_pages_bootmem is valid 105 * so use __ref to tell modpost not to generate a warning */ 106 void __ref put_page_bootmem(struct page *page) 107 { 108 unsigned long type; 109 110 type = (unsigned long) page->lru.next; 111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 113 114 if (atomic_dec_return(&page->_count) == 1) { 115 ClearPagePrivate(page); 116 set_page_private(page, 0); 117 INIT_LIST_HEAD(&page->lru); 118 __free_pages_bootmem(page, 0); 119 } 120 121 } 122 123 static void register_page_bootmem_info_section(unsigned long start_pfn) 124 { 125 unsigned long *usemap, mapsize, section_nr, i; 126 struct mem_section *ms; 127 struct page *page, *memmap; 128 129 section_nr = pfn_to_section_nr(start_pfn); 130 ms = __nr_to_section(section_nr); 131 132 /* Get section's memmap address */ 133 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 134 135 /* 136 * Get page for the memmap's phys address 137 * XXX: need more consideration for sparse_vmemmap... 138 */ 139 page = virt_to_page(memmap); 140 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 141 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 142 143 /* remember memmap's page */ 144 for (i = 0; i < mapsize; i++, page++) 145 get_page_bootmem(section_nr, page, SECTION_INFO); 146 147 usemap = __nr_to_section(section_nr)->pageblock_flags; 148 page = virt_to_page(usemap); 149 150 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 151 152 for (i = 0; i < mapsize; i++, page++) 153 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 154 155 } 156 157 void register_page_bootmem_info_node(struct pglist_data *pgdat) 158 { 159 unsigned long i, pfn, end_pfn, nr_pages; 160 int node = pgdat->node_id; 161 struct page *page; 162 struct zone *zone; 163 164 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 165 page = virt_to_page(pgdat); 166 167 for (i = 0; i < nr_pages; i++, page++) 168 get_page_bootmem(node, page, NODE_INFO); 169 170 zone = &pgdat->node_zones[0]; 171 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 172 if (zone->wait_table) { 173 nr_pages = zone->wait_table_hash_nr_entries 174 * sizeof(wait_queue_head_t); 175 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 176 page = virt_to_page(zone->wait_table); 177 178 for (i = 0; i < nr_pages; i++, page++) 179 get_page_bootmem(node, page, NODE_INFO); 180 } 181 } 182 183 pfn = pgdat->node_start_pfn; 184 end_pfn = pfn + pgdat->node_spanned_pages; 185 186 /* register_section info */ 187 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 188 /* 189 * Some platforms can assign the same pfn to multiple nodes - on 190 * node0 as well as nodeN. To avoid registering a pfn against 191 * multiple nodes we check that this pfn does not already 192 * reside in some other node. 193 */ 194 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 195 register_page_bootmem_info_section(pfn); 196 } 197 } 198 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 199 200 static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 201 unsigned long end_pfn) 202 { 203 unsigned long old_zone_end_pfn; 204 205 zone_span_writelock(zone); 206 207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 208 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) 209 zone->zone_start_pfn = start_pfn; 210 211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 212 zone->zone_start_pfn; 213 214 zone_span_writeunlock(zone); 215 } 216 217 static void resize_zone(struct zone *zone, unsigned long start_pfn, 218 unsigned long end_pfn) 219 { 220 zone_span_writelock(zone); 221 222 if (end_pfn - start_pfn) { 223 zone->zone_start_pfn = start_pfn; 224 zone->spanned_pages = end_pfn - start_pfn; 225 } else { 226 /* 227 * make it consist as free_area_init_core(), 228 * if spanned_pages = 0, then keep start_pfn = 0 229 */ 230 zone->zone_start_pfn = 0; 231 zone->spanned_pages = 0; 232 } 233 234 zone_span_writeunlock(zone); 235 } 236 237 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 238 unsigned long end_pfn) 239 { 240 enum zone_type zid = zone_idx(zone); 241 int nid = zone->zone_pgdat->node_id; 242 unsigned long pfn; 243 244 for (pfn = start_pfn; pfn < end_pfn; pfn++) 245 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 246 } 247 248 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 249 unsigned long start_pfn, unsigned long end_pfn) 250 { 251 int ret; 252 unsigned long flags; 253 unsigned long z1_start_pfn; 254 255 if (!z1->wait_table) { 256 ret = init_currently_empty_zone(z1, start_pfn, 257 end_pfn - start_pfn, MEMMAP_HOTPLUG); 258 if (ret) 259 return ret; 260 } 261 262 pgdat_resize_lock(z1->zone_pgdat, &flags); 263 264 /* can't move pfns which are higher than @z2 */ 265 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) 266 goto out_fail; 267 /* the move out part mast at the left most of @z2 */ 268 if (start_pfn > z2->zone_start_pfn) 269 goto out_fail; 270 /* must included/overlap */ 271 if (end_pfn <= z2->zone_start_pfn) 272 goto out_fail; 273 274 /* use start_pfn for z1's start_pfn if z1 is empty */ 275 if (z1->spanned_pages) 276 z1_start_pfn = z1->zone_start_pfn; 277 else 278 z1_start_pfn = start_pfn; 279 280 resize_zone(z1, z1_start_pfn, end_pfn); 281 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); 282 283 pgdat_resize_unlock(z1->zone_pgdat, &flags); 284 285 fix_zone_id(z1, start_pfn, end_pfn); 286 287 return 0; 288 out_fail: 289 pgdat_resize_unlock(z1->zone_pgdat, &flags); 290 return -1; 291 } 292 293 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 294 unsigned long start_pfn, unsigned long end_pfn) 295 { 296 int ret; 297 unsigned long flags; 298 unsigned long z2_end_pfn; 299 300 if (!z2->wait_table) { 301 ret = init_currently_empty_zone(z2, start_pfn, 302 end_pfn - start_pfn, MEMMAP_HOTPLUG); 303 if (ret) 304 return ret; 305 } 306 307 pgdat_resize_lock(z1->zone_pgdat, &flags); 308 309 /* can't move pfns which are lower than @z1 */ 310 if (z1->zone_start_pfn > start_pfn) 311 goto out_fail; 312 /* the move out part mast at the right most of @z1 */ 313 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) 314 goto out_fail; 315 /* must included/overlap */ 316 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) 317 goto out_fail; 318 319 /* use end_pfn for z2's end_pfn if z2 is empty */ 320 if (z2->spanned_pages) 321 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; 322 else 323 z2_end_pfn = end_pfn; 324 325 resize_zone(z1, z1->zone_start_pfn, start_pfn); 326 resize_zone(z2, start_pfn, z2_end_pfn); 327 328 pgdat_resize_unlock(z1->zone_pgdat, &flags); 329 330 fix_zone_id(z2, start_pfn, end_pfn); 331 332 return 0; 333 out_fail: 334 pgdat_resize_unlock(z1->zone_pgdat, &flags); 335 return -1; 336 } 337 338 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 339 unsigned long end_pfn) 340 { 341 unsigned long old_pgdat_end_pfn = 342 pgdat->node_start_pfn + pgdat->node_spanned_pages; 343 344 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 345 pgdat->node_start_pfn = start_pfn; 346 347 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 348 pgdat->node_start_pfn; 349 } 350 351 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 352 { 353 struct pglist_data *pgdat = zone->zone_pgdat; 354 int nr_pages = PAGES_PER_SECTION; 355 int nid = pgdat->node_id; 356 int zone_type; 357 unsigned long flags; 358 359 zone_type = zone - pgdat->node_zones; 360 if (!zone->wait_table) { 361 int ret; 362 363 ret = init_currently_empty_zone(zone, phys_start_pfn, 364 nr_pages, MEMMAP_HOTPLUG); 365 if (ret) 366 return ret; 367 } 368 pgdat_resize_lock(zone->zone_pgdat, &flags); 369 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 370 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 371 phys_start_pfn + nr_pages); 372 pgdat_resize_unlock(zone->zone_pgdat, &flags); 373 memmap_init_zone(nr_pages, nid, zone_type, 374 phys_start_pfn, MEMMAP_HOTPLUG); 375 return 0; 376 } 377 378 static int __meminit __add_section(int nid, struct zone *zone, 379 unsigned long phys_start_pfn) 380 { 381 int nr_pages = PAGES_PER_SECTION; 382 int ret; 383 384 if (pfn_valid(phys_start_pfn)) 385 return -EEXIST; 386 387 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); 388 389 if (ret < 0) 390 return ret; 391 392 ret = __add_zone(zone, phys_start_pfn); 393 394 if (ret < 0) 395 return ret; 396 397 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 398 } 399 400 #ifdef CONFIG_SPARSEMEM_VMEMMAP 401 static int __remove_section(struct zone *zone, struct mem_section *ms) 402 { 403 /* 404 * XXX: Freeing memmap with vmemmap is not implement yet. 405 * This should be removed later. 406 */ 407 return -EBUSY; 408 } 409 #else 410 static int __remove_section(struct zone *zone, struct mem_section *ms) 411 { 412 unsigned long flags; 413 struct pglist_data *pgdat = zone->zone_pgdat; 414 int ret = -EINVAL; 415 416 if (!valid_section(ms)) 417 return ret; 418 419 ret = unregister_memory_section(ms); 420 if (ret) 421 return ret; 422 423 pgdat_resize_lock(pgdat, &flags); 424 sparse_remove_one_section(zone, ms); 425 pgdat_resize_unlock(pgdat, &flags); 426 return 0; 427 } 428 #endif 429 430 /* 431 * Reasonably generic function for adding memory. It is 432 * expected that archs that support memory hotplug will 433 * call this function after deciding the zone to which to 434 * add the new pages. 435 */ 436 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 437 unsigned long nr_pages) 438 { 439 unsigned long i; 440 int err = 0; 441 int start_sec, end_sec; 442 /* during initialize mem_map, align hot-added range to section */ 443 start_sec = pfn_to_section_nr(phys_start_pfn); 444 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 445 446 for (i = start_sec; i <= end_sec; i++) { 447 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 448 449 /* 450 * EEXIST is finally dealt with by ioresource collision 451 * check. see add_memory() => register_memory_resource() 452 * Warning will be printed if there is collision. 453 */ 454 if (err && (err != -EEXIST)) 455 break; 456 err = 0; 457 } 458 459 return err; 460 } 461 EXPORT_SYMBOL_GPL(__add_pages); 462 463 /** 464 * __remove_pages() - remove sections of pages from a zone 465 * @zone: zone from which pages need to be removed 466 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 467 * @nr_pages: number of pages to remove (must be multiple of section size) 468 * 469 * Generic helper function to remove section mappings and sysfs entries 470 * for the section of the memory we are removing. Caller needs to make 471 * sure that pages are marked reserved and zones are adjust properly by 472 * calling offline_pages(). 473 */ 474 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 475 unsigned long nr_pages) 476 { 477 unsigned long i, ret = 0; 478 int sections_to_remove; 479 480 /* 481 * We can only remove entire sections 482 */ 483 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 484 BUG_ON(nr_pages % PAGES_PER_SECTION); 485 486 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); 487 488 sections_to_remove = nr_pages / PAGES_PER_SECTION; 489 for (i = 0; i < sections_to_remove; i++) { 490 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 491 ret = __remove_section(zone, __pfn_to_section(pfn)); 492 if (ret) 493 break; 494 } 495 return ret; 496 } 497 EXPORT_SYMBOL_GPL(__remove_pages); 498 499 int set_online_page_callback(online_page_callback_t callback) 500 { 501 int rc = -EINVAL; 502 503 lock_memory_hotplug(); 504 505 if (online_page_callback == generic_online_page) { 506 online_page_callback = callback; 507 rc = 0; 508 } 509 510 unlock_memory_hotplug(); 511 512 return rc; 513 } 514 EXPORT_SYMBOL_GPL(set_online_page_callback); 515 516 int restore_online_page_callback(online_page_callback_t callback) 517 { 518 int rc = -EINVAL; 519 520 lock_memory_hotplug(); 521 522 if (online_page_callback == callback) { 523 online_page_callback = generic_online_page; 524 rc = 0; 525 } 526 527 unlock_memory_hotplug(); 528 529 return rc; 530 } 531 EXPORT_SYMBOL_GPL(restore_online_page_callback); 532 533 void __online_page_set_limits(struct page *page) 534 { 535 unsigned long pfn = page_to_pfn(page); 536 537 if (pfn >= num_physpages) 538 num_physpages = pfn + 1; 539 } 540 EXPORT_SYMBOL_GPL(__online_page_set_limits); 541 542 void __online_page_increment_counters(struct page *page) 543 { 544 totalram_pages++; 545 546 #ifdef CONFIG_HIGHMEM 547 if (PageHighMem(page)) 548 totalhigh_pages++; 549 #endif 550 } 551 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 552 553 void __online_page_free(struct page *page) 554 { 555 ClearPageReserved(page); 556 init_page_count(page); 557 __free_page(page); 558 } 559 EXPORT_SYMBOL_GPL(__online_page_free); 560 561 static void generic_online_page(struct page *page) 562 { 563 __online_page_set_limits(page); 564 __online_page_increment_counters(page); 565 __online_page_free(page); 566 } 567 568 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 569 void *arg) 570 { 571 unsigned long i; 572 unsigned long onlined_pages = *(unsigned long *)arg; 573 struct page *page; 574 if (PageReserved(pfn_to_page(start_pfn))) 575 for (i = 0; i < nr_pages; i++) { 576 page = pfn_to_page(start_pfn + i); 577 (*online_page_callback)(page); 578 onlined_pages++; 579 } 580 *(unsigned long *)arg = onlined_pages; 581 return 0; 582 } 583 584 /* ensure every online node has NORMAL memory */ 585 static bool can_online_high_movable(struct zone *zone) 586 { 587 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 588 } 589 590 /* check which state of node_states will be changed when online memory */ 591 static void node_states_check_changes_online(unsigned long nr_pages, 592 struct zone *zone, struct memory_notify *arg) 593 { 594 int nid = zone_to_nid(zone); 595 enum zone_type zone_last = ZONE_NORMAL; 596 597 /* 598 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 599 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. 600 * 601 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 602 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 603 */ 604 if (N_HIGH_MEMORY == N_NORMAL_MEMORY) 605 zone_last = ZONE_MOVABLE; 606 607 /* 608 * if the memory to be online is in a zone of 0...zone_last, and 609 * the zones of 0...zone_last don't have memory before online, we will 610 * need to set the node to node_states[N_NORMAL_MEMORY] after 611 * the memory is online. 612 */ 613 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 614 arg->status_change_nid_normal = nid; 615 else 616 arg->status_change_nid_normal = -1; 617 618 /* 619 * if the node don't have memory befor online, we will need to 620 * set the node to node_states[N_HIGH_MEMORY] after the memory 621 * is online. 622 */ 623 if (!node_state(nid, N_HIGH_MEMORY)) 624 arg->status_change_nid = nid; 625 else 626 arg->status_change_nid = -1; 627 } 628 629 static void node_states_set_node(int node, struct memory_notify *arg) 630 { 631 if (arg->status_change_nid_normal >= 0) 632 node_set_state(node, N_NORMAL_MEMORY); 633 634 node_set_state(node, N_HIGH_MEMORY); 635 } 636 637 638 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 639 { 640 unsigned long onlined_pages = 0; 641 struct zone *zone; 642 int need_zonelists_rebuild = 0; 643 int nid; 644 int ret; 645 struct memory_notify arg; 646 647 lock_memory_hotplug(); 648 /* 649 * This doesn't need a lock to do pfn_to_page(). 650 * The section can't be removed here because of the 651 * memory_block->state_mutex. 652 */ 653 zone = page_zone(pfn_to_page(pfn)); 654 655 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 656 !can_online_high_movable(zone)) { 657 unlock_memory_hotplug(); 658 return -1; 659 } 660 661 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 662 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 663 unlock_memory_hotplug(); 664 return -1; 665 } 666 } 667 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 668 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 669 unlock_memory_hotplug(); 670 return -1; 671 } 672 } 673 674 /* Previous code may changed the zone of the pfn range */ 675 zone = page_zone(pfn_to_page(pfn)); 676 677 arg.start_pfn = pfn; 678 arg.nr_pages = nr_pages; 679 node_states_check_changes_online(nr_pages, zone, &arg); 680 681 nid = page_to_nid(pfn_to_page(pfn)); 682 683 ret = memory_notify(MEM_GOING_ONLINE, &arg); 684 ret = notifier_to_errno(ret); 685 if (ret) { 686 memory_notify(MEM_CANCEL_ONLINE, &arg); 687 unlock_memory_hotplug(); 688 return ret; 689 } 690 /* 691 * If this zone is not populated, then it is not in zonelist. 692 * This means the page allocator ignores this zone. 693 * So, zonelist must be updated after online. 694 */ 695 mutex_lock(&zonelists_mutex); 696 if (!populated_zone(zone)) { 697 need_zonelists_rebuild = 1; 698 build_all_zonelists(NULL, zone); 699 } 700 701 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 702 online_pages_range); 703 if (ret) { 704 if (need_zonelists_rebuild) 705 zone_pcp_reset(zone); 706 mutex_unlock(&zonelists_mutex); 707 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 708 (unsigned long long) pfn << PAGE_SHIFT, 709 (((unsigned long long) pfn + nr_pages) 710 << PAGE_SHIFT) - 1); 711 memory_notify(MEM_CANCEL_ONLINE, &arg); 712 unlock_memory_hotplug(); 713 return ret; 714 } 715 716 zone->present_pages += onlined_pages; 717 zone->zone_pgdat->node_present_pages += onlined_pages; 718 if (onlined_pages) { 719 node_states_set_node(zone_to_nid(zone), &arg); 720 if (need_zonelists_rebuild) 721 build_all_zonelists(NULL, NULL); 722 else 723 zone_pcp_update(zone); 724 } 725 726 mutex_unlock(&zonelists_mutex); 727 728 init_per_zone_wmark_min(); 729 730 if (onlined_pages) 731 kswapd_run(zone_to_nid(zone)); 732 733 vm_total_pages = nr_free_pagecache_pages(); 734 735 writeback_set_ratelimit(); 736 737 if (onlined_pages) 738 memory_notify(MEM_ONLINE, &arg); 739 unlock_memory_hotplug(); 740 741 return 0; 742 } 743 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 744 745 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 746 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 747 { 748 struct pglist_data *pgdat; 749 unsigned long zones_size[MAX_NR_ZONES] = {0}; 750 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 751 unsigned long start_pfn = start >> PAGE_SHIFT; 752 753 pgdat = arch_alloc_nodedata(nid); 754 if (!pgdat) 755 return NULL; 756 757 arch_refresh_nodedata(nid, pgdat); 758 759 /* we can use NODE_DATA(nid) from here */ 760 761 /* init node's zones as empty zones, we don't have any present pages.*/ 762 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 763 764 /* 765 * The node we allocated has no zone fallback lists. For avoiding 766 * to access not-initialized zonelist, build here. 767 */ 768 mutex_lock(&zonelists_mutex); 769 build_all_zonelists(pgdat, NULL); 770 mutex_unlock(&zonelists_mutex); 771 772 return pgdat; 773 } 774 775 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 776 { 777 arch_refresh_nodedata(nid, NULL); 778 arch_free_nodedata(pgdat); 779 return; 780 } 781 782 783 /* 784 * called by cpu_up() to online a node without onlined memory. 785 */ 786 int mem_online_node(int nid) 787 { 788 pg_data_t *pgdat; 789 int ret; 790 791 lock_memory_hotplug(); 792 pgdat = hotadd_new_pgdat(nid, 0); 793 if (!pgdat) { 794 ret = -ENOMEM; 795 goto out; 796 } 797 node_set_online(nid); 798 ret = register_one_node(nid); 799 BUG_ON(ret); 800 801 out: 802 unlock_memory_hotplug(); 803 return ret; 804 } 805 806 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 807 int __ref add_memory(int nid, u64 start, u64 size) 808 { 809 pg_data_t *pgdat = NULL; 810 int new_pgdat = 0; 811 struct resource *res; 812 int ret; 813 814 lock_memory_hotplug(); 815 816 res = register_memory_resource(start, size); 817 ret = -EEXIST; 818 if (!res) 819 goto out; 820 821 if (!node_online(nid)) { 822 pgdat = hotadd_new_pgdat(nid, start); 823 ret = -ENOMEM; 824 if (!pgdat) 825 goto error; 826 new_pgdat = 1; 827 } 828 829 /* call arch's memory hotadd */ 830 ret = arch_add_memory(nid, start, size); 831 832 if (ret < 0) 833 goto error; 834 835 /* we online node here. we can't roll back from here. */ 836 node_set_online(nid); 837 838 if (new_pgdat) { 839 ret = register_one_node(nid); 840 /* 841 * If sysfs file of new node can't create, cpu on the node 842 * can't be hot-added. There is no rollback way now. 843 * So, check by BUG_ON() to catch it reluctantly.. 844 */ 845 BUG_ON(ret); 846 } 847 848 /* create new memmap entry */ 849 firmware_map_add_hotplug(start, start + size, "System RAM"); 850 851 goto out; 852 853 error: 854 /* rollback pgdat allocation and others */ 855 if (new_pgdat) 856 rollback_node_hotadd(nid, pgdat); 857 if (res) 858 release_memory_resource(res); 859 860 out: 861 unlock_memory_hotplug(); 862 return ret; 863 } 864 EXPORT_SYMBOL_GPL(add_memory); 865 866 #ifdef CONFIG_MEMORY_HOTREMOVE 867 /* 868 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 869 * set and the size of the free page is given by page_order(). Using this, 870 * the function determines if the pageblock contains only free pages. 871 * Due to buddy contraints, a free page at least the size of a pageblock will 872 * be located at the start of the pageblock 873 */ 874 static inline int pageblock_free(struct page *page) 875 { 876 return PageBuddy(page) && page_order(page) >= pageblock_order; 877 } 878 879 /* Return the start of the next active pageblock after a given page */ 880 static struct page *next_active_pageblock(struct page *page) 881 { 882 /* Ensure the starting page is pageblock-aligned */ 883 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 884 885 /* If the entire pageblock is free, move to the end of free page */ 886 if (pageblock_free(page)) { 887 int order; 888 /* be careful. we don't have locks, page_order can be changed.*/ 889 order = page_order(page); 890 if ((order < MAX_ORDER) && (order >= pageblock_order)) 891 return page + (1 << order); 892 } 893 894 return page + pageblock_nr_pages; 895 } 896 897 /* Checks if this range of memory is likely to be hot-removable. */ 898 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 899 { 900 struct page *page = pfn_to_page(start_pfn); 901 struct page *end_page = page + nr_pages; 902 903 /* Check the starting page of each pageblock within the range */ 904 for (; page < end_page; page = next_active_pageblock(page)) { 905 if (!is_pageblock_removable_nolock(page)) 906 return 0; 907 cond_resched(); 908 } 909 910 /* All pageblocks in the memory block are likely to be hot-removable */ 911 return 1; 912 } 913 914 /* 915 * Confirm all pages in a range [start, end) is belongs to the same zone. 916 */ 917 static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 918 { 919 unsigned long pfn; 920 struct zone *zone = NULL; 921 struct page *page; 922 int i; 923 for (pfn = start_pfn; 924 pfn < end_pfn; 925 pfn += MAX_ORDER_NR_PAGES) { 926 i = 0; 927 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 928 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) 929 i++; 930 if (i == MAX_ORDER_NR_PAGES) 931 continue; 932 page = pfn_to_page(pfn + i); 933 if (zone && page_zone(page) != zone) 934 return 0; 935 zone = page_zone(page); 936 } 937 return 1; 938 } 939 940 /* 941 * Scanning pfn is much easier than scanning lru list. 942 * Scan pfn from start to end and Find LRU page. 943 */ 944 static unsigned long scan_lru_pages(unsigned long start, unsigned long end) 945 { 946 unsigned long pfn; 947 struct page *page; 948 for (pfn = start; pfn < end; pfn++) { 949 if (pfn_valid(pfn)) { 950 page = pfn_to_page(pfn); 951 if (PageLRU(page)) 952 return pfn; 953 } 954 } 955 return 0; 956 } 957 958 #define NR_OFFLINE_AT_ONCE_PAGES (256) 959 static int 960 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 961 { 962 unsigned long pfn; 963 struct page *page; 964 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 965 int not_managed = 0; 966 int ret = 0; 967 LIST_HEAD(source); 968 969 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 970 if (!pfn_valid(pfn)) 971 continue; 972 page = pfn_to_page(pfn); 973 if (!get_page_unless_zero(page)) 974 continue; 975 /* 976 * We can skip free pages. And we can only deal with pages on 977 * LRU. 978 */ 979 ret = isolate_lru_page(page); 980 if (!ret) { /* Success */ 981 put_page(page); 982 list_add_tail(&page->lru, &source); 983 move_pages--; 984 inc_zone_page_state(page, NR_ISOLATED_ANON + 985 page_is_file_cache(page)); 986 987 } else { 988 #ifdef CONFIG_DEBUG_VM 989 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 990 pfn); 991 dump_page(page); 992 #endif 993 put_page(page); 994 /* Because we don't have big zone->lock. we should 995 check this again here. */ 996 if (page_count(page)) { 997 not_managed++; 998 ret = -EBUSY; 999 break; 1000 } 1001 } 1002 } 1003 if (!list_empty(&source)) { 1004 if (not_managed) { 1005 putback_lru_pages(&source); 1006 goto out; 1007 } 1008 1009 /* 1010 * alloc_migrate_target should be improooooved!! 1011 * migrate_pages returns # of failed pages. 1012 */ 1013 ret = migrate_pages(&source, alloc_migrate_target, 0, 1014 true, MIGRATE_SYNC); 1015 if (ret) 1016 putback_lru_pages(&source); 1017 } 1018 out: 1019 return ret; 1020 } 1021 1022 /* 1023 * remove from free_area[] and mark all as Reserved. 1024 */ 1025 static int 1026 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1027 void *data) 1028 { 1029 __offline_isolated_pages(start, start + nr_pages); 1030 return 0; 1031 } 1032 1033 static void 1034 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1035 { 1036 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1037 offline_isolated_pages_cb); 1038 } 1039 1040 /* 1041 * Check all pages in range, recoreded as memory resource, are isolated. 1042 */ 1043 static int 1044 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1045 void *data) 1046 { 1047 int ret; 1048 long offlined = *(long *)data; 1049 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1050 offlined = nr_pages; 1051 if (!ret) 1052 *(long *)data += offlined; 1053 return ret; 1054 } 1055 1056 static long 1057 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1058 { 1059 long offlined = 0; 1060 int ret; 1061 1062 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1063 check_pages_isolated_cb); 1064 if (ret < 0) 1065 offlined = (long)ret; 1066 return offlined; 1067 } 1068 1069 /* ensure the node has NORMAL memory if it is still online */ 1070 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1071 { 1072 struct pglist_data *pgdat = zone->zone_pgdat; 1073 unsigned long present_pages = 0; 1074 enum zone_type zt; 1075 1076 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1077 present_pages += pgdat->node_zones[zt].present_pages; 1078 1079 if (present_pages > nr_pages) 1080 return true; 1081 1082 present_pages = 0; 1083 for (; zt <= ZONE_MOVABLE; zt++) 1084 present_pages += pgdat->node_zones[zt].present_pages; 1085 1086 /* 1087 * we can't offline the last normal memory until all 1088 * higher memory is offlined. 1089 */ 1090 return present_pages == 0; 1091 } 1092 1093 /* check which state of node_states will be changed when offline memory */ 1094 static void node_states_check_changes_offline(unsigned long nr_pages, 1095 struct zone *zone, struct memory_notify *arg) 1096 { 1097 struct pglist_data *pgdat = zone->zone_pgdat; 1098 unsigned long present_pages = 0; 1099 enum zone_type zt, zone_last = ZONE_NORMAL; 1100 1101 /* 1102 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 1103 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. 1104 * 1105 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes 1106 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1107 */ 1108 if (N_HIGH_MEMORY == N_NORMAL_MEMORY) 1109 zone_last = ZONE_MOVABLE; 1110 1111 /* 1112 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1113 * If the memory to be offline is in a zone of 0...zone_last, 1114 * and it is the last present memory, 0...zone_last will 1115 * become empty after offline , thus we can determind we will 1116 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1117 */ 1118 for (zt = 0; zt <= zone_last; zt++) 1119 present_pages += pgdat->node_zones[zt].present_pages; 1120 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1121 arg->status_change_nid_normal = zone_to_nid(zone); 1122 else 1123 arg->status_change_nid_normal = -1; 1124 1125 /* 1126 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1127 */ 1128 zone_last = ZONE_MOVABLE; 1129 1130 /* 1131 * check whether node_states[N_HIGH_MEMORY] will be changed 1132 * If we try to offline the last present @nr_pages from the node, 1133 * we can determind we will need to clear the node from 1134 * node_states[N_HIGH_MEMORY]. 1135 */ 1136 for (; zt <= zone_last; zt++) 1137 present_pages += pgdat->node_zones[zt].present_pages; 1138 if (nr_pages >= present_pages) 1139 arg->status_change_nid = zone_to_nid(zone); 1140 else 1141 arg->status_change_nid = -1; 1142 } 1143 1144 static void node_states_clear_node(int node, struct memory_notify *arg) 1145 { 1146 if (arg->status_change_nid_normal >= 0) 1147 node_clear_state(node, N_NORMAL_MEMORY); 1148 1149 if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && 1150 (arg->status_change_nid >= 0)) 1151 node_clear_state(node, N_HIGH_MEMORY); 1152 } 1153 1154 static int __ref __offline_pages(unsigned long start_pfn, 1155 unsigned long end_pfn, unsigned long timeout) 1156 { 1157 unsigned long pfn, nr_pages, expire; 1158 long offlined_pages; 1159 int ret, drain, retry_max, node; 1160 struct zone *zone; 1161 struct memory_notify arg; 1162 1163 BUG_ON(start_pfn >= end_pfn); 1164 /* at least, alignment against pageblock is necessary */ 1165 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1166 return -EINVAL; 1167 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1168 return -EINVAL; 1169 /* This makes hotplug much easier...and readable. 1170 we assume this for now. .*/ 1171 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1172 return -EINVAL; 1173 1174 lock_memory_hotplug(); 1175 1176 zone = page_zone(pfn_to_page(start_pfn)); 1177 node = zone_to_nid(zone); 1178 nr_pages = end_pfn - start_pfn; 1179 1180 ret = -EINVAL; 1181 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1182 goto out; 1183 1184 /* set above range as isolated */ 1185 ret = start_isolate_page_range(start_pfn, end_pfn, 1186 MIGRATE_MOVABLE, true); 1187 if (ret) 1188 goto out; 1189 1190 arg.start_pfn = start_pfn; 1191 arg.nr_pages = nr_pages; 1192 node_states_check_changes_offline(nr_pages, zone, &arg); 1193 1194 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1195 ret = notifier_to_errno(ret); 1196 if (ret) 1197 goto failed_removal; 1198 1199 pfn = start_pfn; 1200 expire = jiffies + timeout; 1201 drain = 0; 1202 retry_max = 5; 1203 repeat: 1204 /* start memory hot removal */ 1205 ret = -EAGAIN; 1206 if (time_after(jiffies, expire)) 1207 goto failed_removal; 1208 ret = -EINTR; 1209 if (signal_pending(current)) 1210 goto failed_removal; 1211 ret = 0; 1212 if (drain) { 1213 lru_add_drain_all(); 1214 cond_resched(); 1215 drain_all_pages(); 1216 } 1217 1218 pfn = scan_lru_pages(start_pfn, end_pfn); 1219 if (pfn) { /* We have page on LRU */ 1220 ret = do_migrate_range(pfn, end_pfn); 1221 if (!ret) { 1222 drain = 1; 1223 goto repeat; 1224 } else { 1225 if (ret < 0) 1226 if (--retry_max == 0) 1227 goto failed_removal; 1228 yield(); 1229 drain = 1; 1230 goto repeat; 1231 } 1232 } 1233 /* drain all zone's lru pagevec, this is asyncronous... */ 1234 lru_add_drain_all(); 1235 yield(); 1236 /* drain pcp pages , this is synchrouns. */ 1237 drain_all_pages(); 1238 /* check again */ 1239 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1240 if (offlined_pages < 0) { 1241 ret = -EBUSY; 1242 goto failed_removal; 1243 } 1244 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1245 /* Ok, all of our target is islaoted. 1246 We cannot do rollback at this point. */ 1247 offline_isolated_pages(start_pfn, end_pfn); 1248 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1249 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1250 /* removal success */ 1251 zone->present_pages -= offlined_pages; 1252 zone->zone_pgdat->node_present_pages -= offlined_pages; 1253 totalram_pages -= offlined_pages; 1254 1255 init_per_zone_wmark_min(); 1256 1257 if (!populated_zone(zone)) { 1258 zone_pcp_reset(zone); 1259 mutex_lock(&zonelists_mutex); 1260 build_all_zonelists(NULL, NULL); 1261 mutex_unlock(&zonelists_mutex); 1262 } else 1263 zone_pcp_update(zone); 1264 1265 node_states_clear_node(node, &arg); 1266 if (arg.status_change_nid >= 0) 1267 kswapd_stop(node); 1268 1269 vm_total_pages = nr_free_pagecache_pages(); 1270 writeback_set_ratelimit(); 1271 1272 memory_notify(MEM_OFFLINE, &arg); 1273 unlock_memory_hotplug(); 1274 return 0; 1275 1276 failed_removal: 1277 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", 1278 (unsigned long long) start_pfn << PAGE_SHIFT, 1279 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1280 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1281 /* pushback to free area */ 1282 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1283 1284 out: 1285 unlock_memory_hotplug(); 1286 return ret; 1287 } 1288 1289 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1290 { 1291 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1292 } 1293 1294 int remove_memory(u64 start, u64 size) 1295 { 1296 struct memory_block *mem = NULL; 1297 struct mem_section *section; 1298 unsigned long start_pfn, end_pfn; 1299 unsigned long pfn, section_nr; 1300 int ret; 1301 1302 start_pfn = PFN_DOWN(start); 1303 end_pfn = start_pfn + PFN_DOWN(size); 1304 1305 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1306 section_nr = pfn_to_section_nr(pfn); 1307 if (!present_section_nr(section_nr)) 1308 continue; 1309 1310 section = __nr_to_section(section_nr); 1311 /* same memblock? */ 1312 if (mem) 1313 if ((section_nr >= mem->start_section_nr) && 1314 (section_nr <= mem->end_section_nr)) 1315 continue; 1316 1317 mem = find_memory_block_hinted(section, mem); 1318 if (!mem) 1319 continue; 1320 1321 ret = offline_memory_block(mem); 1322 if (ret) { 1323 kobject_put(&mem->dev.kobj); 1324 return ret; 1325 } 1326 } 1327 1328 if (mem) 1329 kobject_put(&mem->dev.kobj); 1330 1331 return 0; 1332 } 1333 #else 1334 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1335 { 1336 return -EINVAL; 1337 } 1338 int remove_memory(u64 start, u64 size) 1339 { 1340 return -EINVAL; 1341 } 1342 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1343 EXPORT_SYMBOL_GPL(remove_memory); 1344