// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
        [MMOP_OFFLINE] = "offline",
        [MMOP_ONLINE] = "online",
        [MMOP_ONLINE_KERNEL] = "online_kernel",
        [MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
                if (sysfs_streq(str, online_type_to_str[i]))
                        return i;
        }
        return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
        return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
        return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
        return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
        .name = MEMORY_CLASS_NAME,
        .dev_name = MEMORY_CLASS_NAME,
        .online = memory_subsys_online,
        .offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
        return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
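 * Note: this is the same value that is used as the memory block device id
 * (mem->dev.id).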
 */
static ssize_t phys_index_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long phys_index;

        phys_index = mem->start_section_nr / sections_per_block;

        return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        const char *output;

        /*
         * We can probably put these states in a nice little array
         * so that they're not open-coded
         */
        switch (mem->state) {
        case MEM_ONLINE:
                output = "online";
                break;
        case MEM_OFFLINE:
                output = "offline";
                break;
        case MEM_GOING_OFFLINE:
                output = "going-offline";
                break;
        default:
                WARN_ON(1);
                return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
        }

        return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
        return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
        struct zone *zone;
        int ret;

        zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
                                  start_pfn, nr_pages);

        /*
         * Although vmemmap pages have a different lifecycle than the pages
         * they describe (they remain until the memory is unplugged), doing
         * their initialization and accounting at memory onlining/offlining
         * stage helps to keep the accounting easier to follow - e.g. vmemmap
         * pages belong to the same zone as the memory they describe.
         */
        if (nr_vmemmap_pages) {
                ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
                if (ret)
                        return ret;
        }

        ret = online_pages(start_pfn + nr_vmemmap_pages,
                           nr_pages - nr_vmemmap_pages, zone, mem->group);
        if (ret) {
                if (nr_vmemmap_pages)
                        mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
                return ret;
        }

        /*
         * Account once onlining succeeded. If the zone was unpopulated, it is
         * now already properly populated.
         */
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          nr_vmemmap_pages);

        mem->zone = zone;
        return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
        int ret;

        if (!mem->zone)
                return -EINVAL;

        /*
         * Unaccount before offlining, such that unpopulated zone and kthreads
         * can properly be torn down in offline_pages().
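         * If offline_pages() fails below, the accounting is restored.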
         */
        if (nr_vmemmap_pages)
                adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
                                          -nr_vmemmap_pages);

        ret = offline_pages(start_pfn + nr_vmemmap_pages,
                            nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
        if (ret) {
                /* offline_pages() failed. Account back. */
                if (nr_vmemmap_pages)
                        adjust_present_page_count(pfn_to_page(start_pfn),
                                                  mem->group, nr_vmemmap_pages);
                return ret;
        }

        if (nr_vmemmap_pages)
                mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

        mem->zone = NULL;
        return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
        int ret;

        switch (action) {
        case MEM_ONLINE:
                ret = memory_block_online(mem);
                break;
        case MEM_OFFLINE:
                ret = memory_block_offline(mem);
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
                     "%ld\n", __func__, mem->start_section_nr, action, action);
                ret = -EINVAL;
        }

        return ret;
}

static int memory_block_change_state(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req)
{
        int ret = 0;

        if (mem->state != from_state_req)
                return -EINVAL;

        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;

        ret = memory_block_action(mem, to_state);
        mem->state = ret ? from_state_req : to_state;

        return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);
        int ret;

        if (mem->state == MEM_ONLINE)
                return 0;

        /*
         * When called via device_online() without configuring the online_type,
         * we want to default to MMOP_ONLINE.
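         * (state_store() sets mem->online_type explicitly before calling
         * device_online().)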
         */
        if (mem->online_type == MMOP_OFFLINE)
                mem->online_type = MMOP_ONLINE;

        ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
        mem->online_type = MMOP_OFFLINE;

        return ret;
}

static int memory_subsys_offline(struct device *dev)
{
        struct memory_block *mem = to_memory_block(dev);

        if (mem->state == MEM_OFFLINE)
                return 0;

        return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        const int online_type = mhp_online_type_from_str(buf);
        struct memory_block *mem = to_memory_block(dev);
        int ret;

        if (online_type < 0)
                return -EINVAL;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        switch (online_type) {
        case MMOP_ONLINE_KERNEL:
        case MMOP_ONLINE_MOVABLE:
        case MMOP_ONLINE:
                /* mem->online_type is protected by device_hotplug_lock */
                mem->online_type = online_type;
                ret = device_online(&mem->dev);
                break;
        case MMOP_OFFLINE:
                ret = device_offline(&mem->dev);
                break;
        default:
                ret = -EINVAL; /* should never happen */
        }

        unlock_device_hotplug();

        if (ret < 0)
                return ret;
        if (ret)
                return -EINVAL;

        return count;
}

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

        return sysfs_emit(buf, "%d\n",
                          arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
                              struct memory_group *group,
                              unsigned long start_pfn, unsigned long nr_pages,
                              int online_type, struct zone *default_zone)
{
        struct zone *zone;

        zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
        if (zone == default_zone)
                return 0;

        return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem = to_memory_block(dev);
        unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        struct memory_group *group = mem->group;
        struct zone *default_zone;
        int nid = mem->nid;
        int len = 0;

        /*
         * Check the existing zone. Make sure that we do that only on the
         * online nodes otherwise the page_zone is not reliable
         */
        if (mem->state == MEM_ONLINE) {
                /*
                 * If !mem->zone, the memory block spans multiple zones and
                 * cannot get offlined.
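                 * "none" is reported in that case.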
                 */
                default_zone = mem->zone;
                if (!default_zone)
                        return sysfs_emit(buf, "%s\n", "none");
                len += sysfs_emit_at(buf, len, "%s", default_zone->name);
                goto out;
        }

        default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
                                          start_pfn, nr_pages);

        len += sysfs_emit_at(buf, len, "%s", default_zone->name);
        len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
                                  MMOP_ONLINE_KERNEL, default_zone);
        len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
                                  MMOP_ONLINE_MOVABLE, default_zone);
out:
        len += sysfs_emit_at(buf, len, "\n");
        return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%s\n",
                          online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        const int online_type = mhp_online_type_from_str(buf);

        if (online_type < 0)
                return -EINVAL;

        mhp_default_online_type = online_type;
        return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        u64 phys_addr;
        int nid, ret;
        unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

        ret = kstrtoull(buf, 0, &phys_addr);
        if (ret)
                return ret;

        if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
                return -EINVAL;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        nid = memory_add_physaddr_to_nid(phys_addr);
        ret = __add_memory(nid, phys_addr,
                           MIN_MEMORY_BLOCK_SIZE * sections_per_block,
                           MHP_NONE);

        if (ret)
                goto out;

        ret = count;
out:
        unlock_device_hotplug();
        return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = soft_offline_page(pfn, 0);
        return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes.
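 * The value written is a physical address; it is converted to a PFN
 * before being handed to memory_failure().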
 */
static ssize_t hard_offline_page_store(struct device *dev,
                                       struct device_attribute *attr,
                                       const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (kstrtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = memory_failure(pfn, 0);
        if (ret == -EOPNOTSUPP)
                ret = 0;
        return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
        return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
        struct memory_block *mem;

        mem = xa_load(&memory_blocks, block_id);
        if (mem)
                get_device(&mem->dev);
        return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
        unsigned long block_id = memory_block_id(section_nr);

        return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
        &dev_attr_phys_index.attr,
        &dev_attr_state.attr,
        &dev_attr_phys_device.attr,
        &dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
        &dev_attr_valid_zones.attr,
#endif
        NULL
};

static const struct attribute_group memory_memblk_attr_group = {
        .attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
        &memory_memblk_attr_group,
        NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
        int ret;

        memory->dev.bus = &memory_subsys;
        memory->dev.id = memory->start_section_nr / sections_per_block;
        memory->dev.release = memory_block_release;
        memory->dev.groups = memory_memblk_attr_groups;
        memory->dev.offline = memory->state == MEM_OFFLINE;

        ret = device_register(&memory->dev);
        if (ret) {
                put_device(&memory->dev);
                return ret;
        }
        ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
                              GFP_KERNEL));
        if (ret) {
                put_device(&memory->dev);
                device_unregister(&memory->dev);
        }
        return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
                                                     int nid)
{
        const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
        const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        struct zone *zone, *matching_zone = NULL;
        pg_data_t *pgdat = NODE_DATA(nid);
        int i;

        /*
         * This logic only works for early memory, when the applicable zones
         * already span the memory block. We don't expect overlapping zones on
         * a single node for early memory. So if we're told that some PFNs
         * of a node fall into this memory block, we can assume that all node
         * zones that intersect with the memory block are actually applicable.
         * No need to look at the memmap.
         */
        for (i = 0; i < MAX_NR_ZONES; i++) {
                zone = pgdat->node_zones + i;
                if (!populated_zone(zone))
                        continue;
                if (!zone_intersects(zone, start_pfn, nr_pages))
                        continue;
                if (!matching_zone) {
                        matching_zone = zone;
                        continue;
                }
                /* Spans multiple zones ...
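                 * No single zone applies, so return NULL.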
                 */
                matching_zone = NULL;
                break;
        }
        return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
                          enum meminit_context context)
{
        if (context == MEMINIT_EARLY && mem->nid != nid) {
                /*
                 * For early memory we have to determine the zone when setting
                 * the node id and handle multiple nodes spanning a single
                 * memory block by indicating via zone == NULL that we're not
                 * dealing with a single zone. So if we're setting the node id
                 * the first time, determine if there is a single zone. If we're
                 * setting the node id a second time to a different node,
                 * invalidate the single detected zone.
                 */
                if (mem->nid == NUMA_NO_NODE)
                        mem->zone = early_node_zone_for_memory_block(mem, nid);
                else
                        mem->zone = NULL;
        }

        /*
         * If this memory block spans multiple nodes, we only indicate
         * the last processed node. If we span multiple nodes (not applicable
         * to hotplugged memory), zone == NULL will prohibit memory offlining
         * and consequently unplug.
         */
        mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
                            unsigned long nr_vmemmap_pages,
                            struct memory_group *group)
{
        struct memory_block *mem;
        int ret = 0;

        mem = find_memory_block_by_id(block_id);
        if (mem) {
                put_device(&mem->dev);
                return -EEXIST;
        }
        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
        if (!mem)
                return -ENOMEM;

        mem->start_section_nr = block_id * sections_per_block;
        mem->state = state;
        mem->nid = NUMA_NO_NODE;
        mem->nr_vmemmap_pages = nr_vmemmap_pages;
        INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
        if (state == MEM_ONLINE)
                /*
                 * MEM_ONLINE at this point implies early memory. With NUMA,
                 * we'll determine the zone when setting the node id via
                 * memory_block_add_nid(). Memory hotplug updates the zone
                 * manually when memory onlining/offlining succeeds.
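                 * (see memory_block_online() and memory_block_offline()).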
                 */
                mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

        ret = __add_memory_block(mem);
        if (ret)
                return ret;

        if (group) {
                mem->group = group;
                list_add(&mem->group_next, &group->memory_blocks);
        }

        return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
        int section_count = 0;
        unsigned long nr;

        for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
             nr++)
                if (present_section_nr(nr))
                        section_count++;

        if (section_count == 0)
                return 0;
        return add_memory_block(memory_block_id(base_section_nr),
                                MEM_ONLINE, 0, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
                                    unsigned long nr_vmemmap_pages,
                                    struct memory_group *group)
{
        return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
}

static void remove_memory_block(struct memory_block *memory)
{
        if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
                return;

        WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

        if (memory->group) {
                list_del(&memory->group_next);
                memory->group = NULL;
        }

        /* drop the ref. we got via find_memory_block() */
        put_device(&memory->dev);
        device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
                                unsigned long vmemmap_pages,
                                struct memory_group *group)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return -EINVAL;

        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
                if (ret)
                        break;
        }
        if (ret) {
                end_block_id = block_id;
                for (block_id = start_block_id; block_id != end_block_id;
                     block_id++) {
                        mem = find_memory_block_by_id(block_id);
                        if (WARN_ON_ONCE(!mem))
                                continue;
                        remove_memory_block(mem);
                }
        }
        return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
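 *
 * The node links created for each block are removed as well (see
 * unregister_memory_block_under_nodes()).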
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
        const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
        const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
        struct memory_block *mem;
        unsigned long block_id;

        if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
                         !IS_ALIGNED(size, memory_block_size_bytes())))
                return;

        for (block_id = start_block_id; block_id != end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (WARN_ON_ONCE(!mem))
                        continue;
                unregister_memory_block_under_nodes(mem);
                remove_memory_block(mem);
        }
}

/* return true if the memory block is offlined, otherwise return false */
bool is_memblock_offlined(struct memory_block *mem)
{
        return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
        &dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
        &dev_attr_soft_offline_page.attr,
        &dev_attr_hard_offline_page.attr,
#endif

        &dev_attr_block_size_bytes.attr,
        &dev_attr_auto_online_blocks.attr,
        NULL
};

static const struct attribute_group memory_root_attr_group = {
        .attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
        &memory_root_attr_group,
        NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
        int ret;
        unsigned long block_sz, nr;

        /* Validate the configured memory block size */
        block_sz = memory_block_size_bytes();
        if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
                panic("Memory block size not suitable: 0x%lx\n", block_sz);
        sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

        ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
        if (ret)
                panic("%s() failed to register subsystem: %d\n", __func__, ret);

        /*
         * Create entries for memory sections that were found
         * during boot and have been initialized
         */
        for (nr = 0; nr <= __highest_present_section_nr;
             nr += sections_per_block) {
                ret = add_boot_memory_block(nr);
                if (ret)
                        panic("%s() failed to add memory block: %d\n", __func__,
                              ret);
        }
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
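 *
 * Return: 0 on success, or the first error returned by func().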
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
                       void *arg, walk_memory_blocks_func_t func)
{
        const unsigned long start_block_id = phys_to_block_id(start);
        const unsigned long end_block_id = phys_to_block_id(start + size - 1);
        struct memory_block *mem;
        unsigned long block_id;
        int ret = 0;

        if (!size)
                return 0;

        for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
                mem = find_memory_block_by_id(block_id);
                if (!mem)
                        continue;

                ret = func(mem, arg);
                put_device(&mem->dev);
                if (ret)
                        break;
        }
        return ret;
}

struct for_each_memory_block_cb_data {
        walk_memory_blocks_func_t func;
        void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
        struct memory_block *mem = to_memory_block(dev);
        struct for_each_memory_block_cb_data *cb_data = data;

        return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
        struct for_each_memory_block_cb_data cb_data = {
                .func = func,
                .arg = arg,
        };

        return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
                                for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
        struct memory_group *new_group;
        uint32_t mgid;
        int ret;

        if (!node_possible(group.nid))
                return -EINVAL;

        new_group = kzalloc(sizeof(group), GFP_KERNEL);
        if (!new_group)
                return -ENOMEM;
        *new_group = group;
        INIT_LIST_HEAD(&new_group->memory_blocks);

        ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
                       GFP_KERNEL);
        if (ret) {
                kfree(new_group);
                return ret;
        } else if (group.is_dynamic) {
                xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
        }
        return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
        struct memory_group group = {
                .nid = nid,
                .s = {
                        .max_pages = max_pages,
                },
        };

        if (!max_pages)
                return -EINVAL;
        return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
        struct memory_group group = {
                .nid = nid,
                .is_dynamic = true,
                .d = {
                        .unit_pages = unit_pages,
                },
        };

        if (!unit_pages || !is_power_of_2(unit_pages) ||
            unit_pages < PHYS_PFN(memory_block_size_bytes()))
                return -EINVAL;
        return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
        struct memory_group *group;

        if (mgid < 0)
                return -EINVAL;

        group = xa_load(&memory_groups, mgid);
        if (!group)
                return -EINVAL;
        if (!list_empty(&group->memory_blocks))
                return -EBUSY;
        xa_erase(&memory_groups, mgid);
        kfree(group);
        return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * look up a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
        return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
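 * The walk stops at the first non-zero value returned by func(), which is
 * then returned to the caller.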
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
                               struct memory_group *excluded, void *arg)
{
        struct memory_group *group;
        unsigned long index;
        int ret = 0;

        xa_for_each_marked(&memory_groups, index, group,
                           MEMORY_GROUP_MARK_DYNAMIC) {
                if (group == excluded)
                        continue;
#ifdef CONFIG_NUMA
                if (nid != NUMA_NO_NODE && group->nid != nid)
                        continue;
#endif /* CONFIG_NUMA */
                ret = func(group, arg);
                if (ret)
                        break;
        }
        return ret;
}