// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static const struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}


/* Max block size to be set by memory_block_advise_max_size */
static unsigned long memory_block_advised_size;
static bool memory_block_advised_size_queried;

/**
 * memory_block_advise_max_size() - advise memory hotplug on the max suggested
 *				     block size, usually for alignment.
 * @size: suggestion for maximum block size. Must be a power of 2.
 *
 * Early boot software (pre-allocator init) may advise archs on the max block
 * size. This value can only decrease after initialization, as the intent is
 * to identify the largest supported alignment for all sources.
 *
 * Use of this value is arch-defined, as is min/max block size.
 *
 * Return: 0 on success
 *	   -EINVAL if size is 0 or not a power of 2
 *	   -EBUSY if the value has already been probed
 */
int __init memory_block_advise_max_size(unsigned long size)
{
	if (!size || !is_power_of_2(size))
		return -EINVAL;

	if (memory_block_advised_size_queried)
		return -EBUSY;

	if (memory_block_advised_size)
		memory_block_advised_size = min(memory_block_advised_size, size);
	else
		memory_block_advised_size = size;

	return 0;
}

/**
 * memory_block_advised_max_size() - query advised max hotplug block size.
 *
 * After the first call, the value can never change. Callers looking for the
 * actual block size should use memory_block_size_bytes(). This interface is
 * intended for use by arch init code when initializing the hotplug block size.
 *
 * Return: advised size in bytes, or 0 if never set.
 */
unsigned long memory_block_advised_max_size(void)
{
	memory_block_advised_size_queried = true;
	return memory_block_advised_size;
}

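/*
 * Illustrative sketch (not part of this file): how architecture setup code
 * might combine its preferred block size with the advised maximum when
 * implementing memory_block_size_bytes(). probe_block_size() and the 2 GiB
 * preference are assumptions made for the example, not taken from any
 * particular architecture.
 *
 *	static unsigned long __init probe_block_size(void)
 *	{
 *		unsigned long size = SZ_2G;
 *		unsigned long advised = memory_block_advised_max_size();
 *
 *		// never exceed what early boot software advised ...
 *		if (advised)
 *			size = min(size, advised);
 *		// ... but never go below the sparsemem section size
 *		return max(size, MIN_MEMORY_BLOCK_SIZE);
 *	}
 */
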
unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

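/*
 * Illustrative sketch (not part of this file): a minimal consumer of the
 * notifier chain exported above via register_memory_notifier(). The callback
 * and the example_* helpers are hypothetical placeholders; the actions and
 * the struct memory_notify payload are the ones used by the onlining and
 * offlining paths below.
 *
 *	static int example_mem_callback(struct notifier_block *nb,
 *					unsigned long action, void *data)
 *	{
 *		struct memory_notify *arg = data;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// returning an error here vetoes offlining, e.g. while
 *			// [arg->start_pfn, arg->start_pfn + arg->nr_pages) is
 *			// still in use by the driver
 *			if (example_range_in_use(arg->start_pfn, arg->nr_pages))
 *				return notifier_from_errno(-EBUSY);
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			example_update_accounting(arg->start_pfn, arg->nr_pages);
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_callback,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 *	...
 *	unregister_memory_notifier(&example_mem_nb);
 */
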
/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g., vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	mem_hotplug_begin();
	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto out_notifier;

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
						zone, mem->altmap->inaccessible);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	mem_hotplug_done();
	return ret;
out:
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out_notifier:
	mem_hotplug_done();
	return ret;
}

/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		goto out;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out:
	mem_hotplug_done();
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

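/*
 * Illustrative usage note: the "state" attribute above is what userspace
 * tooling writes to online/offline a block, e.g.
 *
 *	echo online_movable > /sys/devices/system/memory/memory42/state
 *	echo offline        > /sys/devices/system/memory/memory42/state
 *
 * "memory42" is a made-up block number; the accepted values are exactly the
 * strings listed in online_type_to_str[].
 */
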
/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing identification of which memory blocks
 * comprise a storage increment. Since a memory block spans complete storage
 * increments nowadays, this interface is basically unused. Other archs never
 * exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		return sysfs_emit(buf, "%s\n",
				  mem->zone ? mem->zone->name : "none");
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len = sysfs_emit(buf, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

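/*
 * Illustrative example values (machine dependent, not guaranteed): with a
 * 128 MiB block size, block_size_bytes reads back "8000000". For an offline
 * block whose default online zone is ZONE_NORMAL, valid_zones typically
 * reads "Normal Movable"; once the block is online it shows the single zone
 * it sits in, or "none" if it spans multiple zones.
 */
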
/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_get_default_online_type()]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_set_default_online_type(online_type);
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

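/*
 * Usage note for the probe/soft_offline_page/hard_offline_page attributes
 * above (illustrative, values made up): all three take a physical byte
 * address, not a PFN. probe requires the address to be memory-block aligned;
 * the two offline attributes shift the value right by PAGE_SHIFT, so with
 * 4 KiB pages writing 0x200000000 acts on PFN 0x200000.
 */
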
/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

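/*
 * Worked example (sizes assumed, not mandated by this file): with a 128 MiB
 * block size, hot-adding 1 GiB of memory at physical address 0x100000000
 * spans block ids 32..39, so create_memory_block_devices() below instantiates
 * /sys/devices/system/memory/memory32 ... memory39, all initially offline.
 */
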
/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, block_id, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found during boot
	 * and have been initialized. Use @block_id to track the last
	 * handled block and initialize it to an invalid value (ULONG_MAX)
	 * to bypass the block ID matching check for the first present
	 * block so that it can be covered.
	 */
	block_id = ULONG_MAX;
	for_each_present_section_nr(0, nr) {
		if (block_id != ULONG_MAX && memory_block_id(nr) == block_id)
			continue;

		block_id = memory_block_id(nr);
		ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL);
		if (ret) {
			panic("%s() failed to add memory block: %d\n",
			      __func__, ret);
		}
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif
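
/*
 * Note (summarizing behaviour implemented above): the per-block nr_hwpoison
 * counter maintained here is what makes memory_block_online() refuse to
 * online a block containing hwpoisoned pages (-EHWPOISON), and
 * remove_memory_block_devices() subtracts the block's share from the global
 * poisoned-page count when the block devices go away.
 */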