1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Memory subsystem support 4 * 5 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 6 * Dave Hansen <haveblue@us.ibm.com> 7 * 8 * This file provides the necessary infrastructure to represent 9 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 10 * All arch-independent code that assumes MEMORY_HOTPLUG requires 11 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 12 */ 13 14 #include <linux/module.h> 15 #include <linux/init.h> 16 #include <linux/topology.h> 17 #include <linux/capability.h> 18 #include <linux/device.h> 19 #include <linux/memory.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/mm.h> 22 #include <linux/stat.h> 23 #include <linux/slab.h> 24 #include <linux/xarray.h> 25 26 #include <linux/atomic.h> 27 #include <linux/uaccess.h> 28 29 #define MEMORY_CLASS_NAME "memory" 30 31 static const char *const online_type_to_str[] = { 32 [MMOP_OFFLINE] = "offline", 33 [MMOP_ONLINE] = "online", 34 [MMOP_ONLINE_KERNEL] = "online_kernel", 35 [MMOP_ONLINE_MOVABLE] = "online_movable", 36 }; 37 38 int mhp_online_type_from_str(const char *str) 39 { 40 int i; 41 42 for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) { 43 if (sysfs_streq(str, online_type_to_str[i])) 44 return i; 45 } 46 return -EINVAL; 47 } 48 49 #define to_memory_block(dev) container_of(dev, struct memory_block, dev) 50 51 static int sections_per_block; 52 53 static inline unsigned long memory_block_id(unsigned long section_nr) 54 { 55 return section_nr / sections_per_block; 56 } 57 58 static inline unsigned long pfn_to_block_id(unsigned long pfn) 59 { 60 return memory_block_id(pfn_to_section_nr(pfn)); 61 } 62 63 static inline unsigned long phys_to_block_id(unsigned long phys) 64 { 65 return pfn_to_block_id(PFN_DOWN(phys)); 66 } 67 68 static int memory_subsys_online(struct device *dev); 69 static int memory_subsys_offline(struct device *dev); 70 71 static const struct bus_type memory_subsys = { 72 .name = 
MEMORY_CLASS_NAME, 73 .dev_name = MEMORY_CLASS_NAME, 74 .online = memory_subsys_online, 75 .offline = memory_subsys_offline, 76 }; 77 78 /* 79 * Memory blocks are cached in a local radix tree to avoid 80 * a costly linear search for the corresponding device on 81 * the subsystem bus. 82 */ 83 static DEFINE_XARRAY(memory_blocks); 84 85 /* 86 * Memory groups, indexed by memory group id (mgid). 87 */ 88 static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); 89 #define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1 90 91 static BLOCKING_NOTIFIER_HEAD(memory_chain); 92 93 int register_memory_notifier(struct notifier_block *nb) 94 { 95 return blocking_notifier_chain_register(&memory_chain, nb); 96 } 97 EXPORT_SYMBOL(register_memory_notifier); 98 99 void unregister_memory_notifier(struct notifier_block *nb) 100 { 101 blocking_notifier_chain_unregister(&memory_chain, nb); 102 } 103 EXPORT_SYMBOL(unregister_memory_notifier); 104 105 static void memory_block_release(struct device *dev) 106 { 107 struct memory_block *mem = to_memory_block(dev); 108 /* Verify that the altmap is freed */ 109 WARN_ON(mem->altmap); 110 kfree(mem); 111 } 112 113 unsigned long __weak memory_block_size_bytes(void) 114 { 115 return MIN_MEMORY_BLOCK_SIZE; 116 } 117 EXPORT_SYMBOL_GPL(memory_block_size_bytes); 118 119 /* Show the memory block ID, relative to the memory block size */ 120 static ssize_t phys_index_show(struct device *dev, 121 struct device_attribute *attr, char *buf) 122 { 123 struct memory_block *mem = to_memory_block(dev); 124 125 return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr)); 126 } 127 128 /* 129 * Legacy interface that we cannot remove. Always indicate "removable" 130 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic. 131 */ 132 static ssize_t removable_show(struct device *dev, struct device_attribute *attr, 133 char *buf) 134 { 135 return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)); 136 } 137 138 /* 139 * online, offline, going offline, etc. 
*/
/*
 * sysfs read handler for the per-block "state" attribute. Emits "online",
 * "offline" or "going-offline" followed by a newline. Any other value of
 * mem->state triggers a WARN and is reported as "ERROR-UNKNOWN-<state>".
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

/*
 * Run the memory notifier chain with event @val and payload @v and hand back
 * whatever blocking_notifier_call_chain() returned.
 */
int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

/*
 * Number of hwpoisoned pages accounted to a memory block. The real
 * implementation is only built with both CONFIG_MEMORY_FAILURE and
 * CONFIG_MEMORY_HOTPLUG; otherwise the count is always 0.
 */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

/*
 * Must acquire mem_hotplug_lock in write mode.
 *
 * Online all pages of @mem. Refuses with -EHWPOISON if the block has
 * hwpoisoned pages accounted to it. The work is bracketed by
 * mem_hotplug_begin()/mem_hotplug_done(); MEM_PREPARE_ONLINE is sent before
 * anything is touched and MEM_FINISH_OFFLINE is sent if a later step fails.
 * On success mem->zone records the zone the block was onlined to.
 *
 * Returns 0 on success or a negative errno.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g vmemmaps
	 * belong to the same zone as the memory they backed.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	/*
	 * The notifier payload describes the altmap range at the start of the
	 * block and the remaining range that actually gets onlined.
	 */
	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	mem_hotplug_begin();
	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto out_notifier;

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
						zone, mem->altmap->inaccessible);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		/* Undo the memmap-on-memory initialization done above. */
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	mem_hotplug_done();
	return ret;
out:
	/* Pairs with the MEM_PREPARE_ONLINE notification sent above. */
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out_notifier:
	mem_hotplug_done();
	return ret;
}

/*
 * Must acquire mem_hotplug_lock in write mode.
 *
 * Offline all pages of @mem. Fails with -EINVAL when mem->zone is not set.
 * The work is bracketed by mem_hotplug_begin()/mem_hotplug_done(). On success
 * mem->zone is cleared and MEM_FINISH_OFFLINE is sent; on failure the vmemmap
 * page accounting is restored and no notification is sent.
 *
 * Returns 0 on success or a negative errno.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		goto out;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out:
	mem_hotplug_done();
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 *
 * Dispatch @action: MEM_ONLINE onlines the block, MEM_OFFLINE offlines it.
 * Any other value triggers a WARN and yields -EINVAL.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

/*
 * Move @mem from @from_state_req to @to_state. Returns -EINVAL without doing
 * anything if the block is not currently in @from_state_req. While an offline
 * request is in progress the state reads MEM_GOING_OFFLINE. Afterwards the
 * state is @to_state on success and @from_state_req again on failure.
 */
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
/*
 * Bus ->online callback. A block that is already online is left alone and
 * reported as success. mem->online_type is consumed here: it is reset to
 * MMOP_OFFLINE once the attempt has been made, whatever its outcome.
 */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

/*
 * Bus ->offline callback. A block that is already offline is left alone and
 * reported as success.
 */
static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

/*
 * sysfs write handler for the per-block "state" attribute. The written string
 * is parsed with mhp_online_type_from_str(); the online variants go through
 * device_online() and "offline" through device_offline(), both while holding
 * the device hotplug lock.
 *
 * Returns @count on success, -EINVAL for an unknown string or a positive
 * result from the device core, or the negative errno of the failing step.
 */
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	/* A positive result is not a byte count; report it as an error. */
	if (ret)
		return -EINVAL;

	return count;
}

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise
a storage increment. Since a memory block spans complete 421 * storage increments nowadays, this interface is basically unused. Other 422 * archs never exposed != 0. 423 */ 424 static ssize_t phys_device_show(struct device *dev, 425 struct device_attribute *attr, char *buf) 426 { 427 struct memory_block *mem = to_memory_block(dev); 428 unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); 429 430 return sysfs_emit(buf, "%d\n", 431 arch_get_memory_phys_device(start_pfn)); 432 } 433 434 #ifdef CONFIG_MEMORY_HOTREMOVE 435 static int print_allowed_zone(char *buf, int len, int nid, 436 struct memory_group *group, 437 unsigned long start_pfn, unsigned long nr_pages, 438 int online_type, struct zone *default_zone) 439 { 440 struct zone *zone; 441 442 zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages); 443 if (zone == default_zone) 444 return 0; 445 446 return sysfs_emit_at(buf, len, " %s", zone->name); 447 } 448 449 static ssize_t valid_zones_show(struct device *dev, 450 struct device_attribute *attr, char *buf) 451 { 452 struct memory_block *mem = to_memory_block(dev); 453 unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); 454 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 455 struct memory_group *group = mem->group; 456 struct zone *default_zone; 457 int nid = mem->nid; 458 int len = 0; 459 460 /* 461 * Check the existing zone. Make sure that we do that only on the 462 * online nodes otherwise the page_zone is not reliable 463 */ 464 if (mem->state == MEM_ONLINE) { 465 /* 466 * If !mem->zone, the memory block spans multiple zones and 467 * cannot get offlined. 
468 */ 469 default_zone = mem->zone; 470 if (!default_zone) 471 return sysfs_emit(buf, "%s\n", "none"); 472 len += sysfs_emit_at(buf, len, "%s", default_zone->name); 473 goto out; 474 } 475 476 default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group, 477 start_pfn, nr_pages); 478 479 len += sysfs_emit_at(buf, len, "%s", default_zone->name); 480 len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, 481 MMOP_ONLINE_KERNEL, default_zone); 482 len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages, 483 MMOP_ONLINE_MOVABLE, default_zone); 484 out: 485 len += sysfs_emit_at(buf, len, "\n"); 486 return len; 487 } 488 static DEVICE_ATTR_RO(valid_zones); 489 #endif 490 491 static DEVICE_ATTR_RO(phys_index); 492 static DEVICE_ATTR_RW(state); 493 static DEVICE_ATTR_RO(phys_device); 494 static DEVICE_ATTR_RO(removable); 495 496 /* 497 * Show the memory block size (shared by all memory blocks). 498 */ 499 static ssize_t block_size_bytes_show(struct device *dev, 500 struct device_attribute *attr, char *buf) 501 { 502 return sysfs_emit(buf, "%lx\n", memory_block_size_bytes()); 503 } 504 505 static DEVICE_ATTR_RO(block_size_bytes); 506 507 /* 508 * Memory auto online policy. 
509 */ 510 511 static ssize_t auto_online_blocks_show(struct device *dev, 512 struct device_attribute *attr, char *buf) 513 { 514 return sysfs_emit(buf, "%s\n", 515 online_type_to_str[mhp_default_online_type]); 516 } 517 518 static ssize_t auto_online_blocks_store(struct device *dev, 519 struct device_attribute *attr, 520 const char *buf, size_t count) 521 { 522 const int online_type = mhp_online_type_from_str(buf); 523 524 if (online_type < 0) 525 return -EINVAL; 526 527 mhp_default_online_type = online_type; 528 return count; 529 } 530 531 static DEVICE_ATTR_RW(auto_online_blocks); 532 533 #ifdef CONFIG_CRASH_HOTPLUG 534 #include <linux/kexec.h> 535 static ssize_t crash_hotplug_show(struct device *dev, 536 struct device_attribute *attr, char *buf) 537 { 538 return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support()); 539 } 540 static DEVICE_ATTR_RO(crash_hotplug); 541 #endif 542 543 /* 544 * Some architectures will have custom drivers to do this, and 545 * will not need to do it from userspace. The fake hot-add code 546 * as well as ppc64 will do all of their discovery in userspace 547 * and will require this interface. 
548 */ 549 #ifdef CONFIG_ARCH_MEMORY_PROBE 550 static ssize_t probe_store(struct device *dev, struct device_attribute *attr, 551 const char *buf, size_t count) 552 { 553 u64 phys_addr; 554 int nid, ret; 555 unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; 556 557 ret = kstrtoull(buf, 0, &phys_addr); 558 if (ret) 559 return ret; 560 561 if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) 562 return -EINVAL; 563 564 ret = lock_device_hotplug_sysfs(); 565 if (ret) 566 return ret; 567 568 nid = memory_add_physaddr_to_nid(phys_addr); 569 ret = __add_memory(nid, phys_addr, 570 MIN_MEMORY_BLOCK_SIZE * sections_per_block, 571 MHP_NONE); 572 573 if (ret) 574 goto out; 575 576 ret = count; 577 out: 578 unlock_device_hotplug(); 579 return ret; 580 } 581 582 static DEVICE_ATTR_WO(probe); 583 #endif 584 585 #ifdef CONFIG_MEMORY_FAILURE 586 /* 587 * Support for offlining pages of memory 588 */ 589 590 /* Soft offline a page */ 591 static ssize_t soft_offline_page_store(struct device *dev, 592 struct device_attribute *attr, 593 const char *buf, size_t count) 594 { 595 int ret; 596 u64 pfn; 597 if (!capable(CAP_SYS_ADMIN)) 598 return -EPERM; 599 if (kstrtoull(buf, 0, &pfn) < 0) 600 return -EINVAL; 601 pfn >>= PAGE_SHIFT; 602 ret = soft_offline_page(pfn, 0); 603 return ret == 0 ? count : ret; 604 } 605 606 /* Forcibly offline a page, including killing processes. */ 607 static ssize_t hard_offline_page_store(struct device *dev, 608 struct device_attribute *attr, 609 const char *buf, size_t count) 610 { 611 int ret; 612 u64 pfn; 613 if (!capable(CAP_SYS_ADMIN)) 614 return -EPERM; 615 if (kstrtoull(buf, 0, &pfn) < 0) 616 return -EINVAL; 617 pfn >>= PAGE_SHIFT; 618 ret = memory_failure(pfn, MF_SW_SIMULATED); 619 if (ret == -EOPNOTSUPP) 620 ret = 0; 621 return ret ? ret : count; 622 } 623 624 static DEVICE_ATTR_WO(soft_offline_page); 625 static DEVICE_ATTR_WO(hard_offline_page); 626 #endif 627 628 /* See phys_device_show(). 
*/ 629 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 630 { 631 return 0; 632 } 633 634 /* 635 * A reference for the returned memory block device is acquired. 636 * 637 * Called under device_hotplug_lock. 638 */ 639 static struct memory_block *find_memory_block_by_id(unsigned long block_id) 640 { 641 struct memory_block *mem; 642 643 mem = xa_load(&memory_blocks, block_id); 644 if (mem) 645 get_device(&mem->dev); 646 return mem; 647 } 648 649 /* 650 * Called under device_hotplug_lock. 651 */ 652 struct memory_block *find_memory_block(unsigned long section_nr) 653 { 654 unsigned long block_id = memory_block_id(section_nr); 655 656 return find_memory_block_by_id(block_id); 657 } 658 659 static struct attribute *memory_memblk_attrs[] = { 660 &dev_attr_phys_index.attr, 661 &dev_attr_state.attr, 662 &dev_attr_phys_device.attr, 663 &dev_attr_removable.attr, 664 #ifdef CONFIG_MEMORY_HOTREMOVE 665 &dev_attr_valid_zones.attr, 666 #endif 667 NULL 668 }; 669 670 static const struct attribute_group memory_memblk_attr_group = { 671 .attrs = memory_memblk_attrs, 672 }; 673 674 static const struct attribute_group *memory_memblk_attr_groups[] = { 675 &memory_memblk_attr_group, 676 NULL, 677 }; 678 679 static int __add_memory_block(struct memory_block *memory) 680 { 681 int ret; 682 683 memory->dev.bus = &memory_subsys; 684 memory->dev.id = memory->start_section_nr / sections_per_block; 685 memory->dev.release = memory_block_release; 686 memory->dev.groups = memory_memblk_attr_groups; 687 memory->dev.offline = memory->state == MEM_OFFLINE; 688 689 ret = device_register(&memory->dev); 690 if (ret) { 691 put_device(&memory->dev); 692 return ret; 693 } 694 ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory, 695 GFP_KERNEL)); 696 if (ret) 697 device_unregister(&memory->dev); 698 699 return ret; 700 } 701 702 static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, 703 int nid) 704 { 705 const unsigned long start_pfn = 
section_nr_to_pfn(mem->start_section_nr); 706 const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 707 struct zone *zone, *matching_zone = NULL; 708 pg_data_t *pgdat = NODE_DATA(nid); 709 int i; 710 711 /* 712 * This logic only works for early memory, when the applicable zones 713 * already span the memory block. We don't expect overlapping zones on 714 * a single node for early memory. So if we're told that some PFNs 715 * of a node fall into this memory block, we can assume that all node 716 * zones that intersect with the memory block are actually applicable. 717 * No need to look at the memmap. 718 */ 719 for (i = 0; i < MAX_NR_ZONES; i++) { 720 zone = pgdat->node_zones + i; 721 if (!populated_zone(zone)) 722 continue; 723 if (!zone_intersects(zone, start_pfn, nr_pages)) 724 continue; 725 if (!matching_zone) { 726 matching_zone = zone; 727 continue; 728 } 729 /* Spans multiple zones ... */ 730 matching_zone = NULL; 731 break; 732 } 733 return matching_zone; 734 } 735 736 #ifdef CONFIG_NUMA 737 /** 738 * memory_block_add_nid() - Indicate that system RAM falling into this memory 739 * block device (partially) belongs to the given node. 740 * @mem: The memory block device. 741 * @nid: The node id. 742 * @context: The memory initialization context. 743 * 744 * Indicate that system RAM falling into this memory block (partially) belongs 745 * to the given node. If the context indicates ("early") that we are adding the 746 * node during node device subsystem initialization, this will also properly 747 * set/adjust mem->zone based on the zone ranges of the given node. 
748 */ 749 void memory_block_add_nid(struct memory_block *mem, int nid, 750 enum meminit_context context) 751 { 752 if (context == MEMINIT_EARLY && mem->nid != nid) { 753 /* 754 * For early memory we have to determine the zone when setting 755 * the node id and handle multiple nodes spanning a single 756 * memory block by indicate via zone == NULL that we're not 757 * dealing with a single zone. So if we're setting the node id 758 * the first time, determine if there is a single zone. If we're 759 * setting the node id a second time to a different node, 760 * invalidate the single detected zone. 761 */ 762 if (mem->nid == NUMA_NO_NODE) 763 mem->zone = early_node_zone_for_memory_block(mem, nid); 764 else 765 mem->zone = NULL; 766 } 767 768 /* 769 * If this memory block spans multiple nodes, we only indicate 770 * the last processed node. If we span multiple nodes (not applicable 771 * to hotplugged memory), zone == NULL will prohibit memory offlining 772 * and consequently unplug. 773 */ 774 mem->nid = nid; 775 } 776 #endif 777 778 static int add_memory_block(unsigned long block_id, unsigned long state, 779 struct vmem_altmap *altmap, 780 struct memory_group *group) 781 { 782 struct memory_block *mem; 783 int ret = 0; 784 785 mem = find_memory_block_by_id(block_id); 786 if (mem) { 787 put_device(&mem->dev); 788 return -EEXIST; 789 } 790 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 791 if (!mem) 792 return -ENOMEM; 793 794 mem->start_section_nr = block_id * sections_per_block; 795 mem->state = state; 796 mem->nid = NUMA_NO_NODE; 797 mem->altmap = altmap; 798 INIT_LIST_HEAD(&mem->group_next); 799 800 #ifndef CONFIG_NUMA 801 if (state == MEM_ONLINE) 802 /* 803 * MEM_ONLINE at this point implies early memory. With NUMA, 804 * we'll determine the zone when setting the node id via 805 * memory_block_add_nid(). Memory hotplug updated the zone 806 * manually when memory onlining/offlining succeeds. 
807 */ 808 mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE); 809 #endif /* CONFIG_NUMA */ 810 811 ret = __add_memory_block(mem); 812 if (ret) 813 return ret; 814 815 if (group) { 816 mem->group = group; 817 list_add(&mem->group_next, &group->memory_blocks); 818 } 819 820 return 0; 821 } 822 823 static int __init add_boot_memory_block(unsigned long base_section_nr) 824 { 825 int section_count = 0; 826 unsigned long nr; 827 828 for (nr = base_section_nr; nr < base_section_nr + sections_per_block; 829 nr++) 830 if (present_section_nr(nr)) 831 section_count++; 832 833 if (section_count == 0) 834 return 0; 835 return add_memory_block(memory_block_id(base_section_nr), 836 MEM_ONLINE, NULL, NULL); 837 } 838 839 static int add_hotplug_memory_block(unsigned long block_id, 840 struct vmem_altmap *altmap, 841 struct memory_group *group) 842 { 843 return add_memory_block(block_id, MEM_OFFLINE, altmap, group); 844 } 845 846 static void remove_memory_block(struct memory_block *memory) 847 { 848 if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) 849 return; 850 851 WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); 852 853 if (memory->group) { 854 list_del(&memory->group_next); 855 memory->group = NULL; 856 } 857 858 /* drop the ref. we got via find_memory_block() */ 859 put_device(&memory->dev); 860 device_unregister(&memory->dev); 861 } 862 863 /* 864 * Create memory block devices for the given memory area. Start and size 865 * have to be aligned to memory block granularity. Memory block devices 866 * will be initialized as offline. 867 * 868 * Called under device_hotplug_lock. 
869 */ 870 int create_memory_block_devices(unsigned long start, unsigned long size, 871 struct vmem_altmap *altmap, 872 struct memory_group *group) 873 { 874 const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); 875 unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); 876 struct memory_block *mem; 877 unsigned long block_id; 878 int ret = 0; 879 880 if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || 881 !IS_ALIGNED(size, memory_block_size_bytes()))) 882 return -EINVAL; 883 884 for (block_id = start_block_id; block_id != end_block_id; block_id++) { 885 ret = add_hotplug_memory_block(block_id, altmap, group); 886 if (ret) 887 break; 888 } 889 if (ret) { 890 end_block_id = block_id; 891 for (block_id = start_block_id; block_id != end_block_id; 892 block_id++) { 893 mem = find_memory_block_by_id(block_id); 894 if (WARN_ON_ONCE(!mem)) 895 continue; 896 remove_memory_block(mem); 897 } 898 } 899 return ret; 900 } 901 902 /* 903 * Remove memory block devices for the given memory area. Start and size 904 * have to be aligned to memory block granularity. Memory block devices 905 * have to be offline. 906 * 907 * Called under device_hotplug_lock. 
908 */ 909 void remove_memory_block_devices(unsigned long start, unsigned long size) 910 { 911 const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); 912 const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); 913 struct memory_block *mem; 914 unsigned long block_id; 915 916 if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || 917 !IS_ALIGNED(size, memory_block_size_bytes()))) 918 return; 919 920 for (block_id = start_block_id; block_id != end_block_id; block_id++) { 921 mem = find_memory_block_by_id(block_id); 922 if (WARN_ON_ONCE(!mem)) 923 continue; 924 num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem)); 925 unregister_memory_block_under_nodes(mem); 926 remove_memory_block(mem); 927 } 928 } 929 930 static struct attribute *memory_root_attrs[] = { 931 #ifdef CONFIG_ARCH_MEMORY_PROBE 932 &dev_attr_probe.attr, 933 #endif 934 935 #ifdef CONFIG_MEMORY_FAILURE 936 &dev_attr_soft_offline_page.attr, 937 &dev_attr_hard_offline_page.attr, 938 #endif 939 940 &dev_attr_block_size_bytes.attr, 941 &dev_attr_auto_online_blocks.attr, 942 #ifdef CONFIG_CRASH_HOTPLUG 943 &dev_attr_crash_hotplug.attr, 944 #endif 945 NULL 946 }; 947 948 static const struct attribute_group memory_root_attr_group = { 949 .attrs = memory_root_attrs, 950 }; 951 952 static const struct attribute_group *memory_root_attr_groups[] = { 953 &memory_root_attr_group, 954 NULL, 955 }; 956 957 /* 958 * Initialize the sysfs support for memory devices. At the time this function 959 * is called, we cannot have concurrent creation/deletion of memory block 960 * devices, the device_hotplug_lock is not needed. 
961 */ 962 void __init memory_dev_init(void) 963 { 964 int ret; 965 unsigned long block_sz, nr; 966 967 /* Validate the configured memory block size */ 968 block_sz = memory_block_size_bytes(); 969 if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) 970 panic("Memory block size not suitable: 0x%lx\n", block_sz); 971 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 972 973 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); 974 if (ret) 975 panic("%s() failed to register subsystem: %d\n", __func__, ret); 976 977 /* 978 * Create entries for memory sections that were found 979 * during boot and have been initialized 980 */ 981 for (nr = 0; nr <= __highest_present_section_nr; 982 nr += sections_per_block) { 983 ret = add_boot_memory_block(nr); 984 if (ret) 985 panic("%s() failed to add memory block: %d\n", __func__, 986 ret); 987 } 988 } 989 990 /** 991 * walk_memory_blocks - walk through all present memory blocks overlapped 992 * by the range [start, start + size) 993 * 994 * @start: start address of the memory range 995 * @size: size of the memory range 996 * @arg: argument passed to func 997 * @func: callback for each memory section walked 998 * 999 * This function walks through all present memory blocks overlapped by the 1000 * range [start, start + size), calling func on each memory block. 1001 * 1002 * In case func() returns an error, walking is aborted and the error is 1003 * returned. 1004 * 1005 * Called under device_hotplug_lock. 
1006 */ 1007 int walk_memory_blocks(unsigned long start, unsigned long size, 1008 void *arg, walk_memory_blocks_func_t func) 1009 { 1010 const unsigned long start_block_id = phys_to_block_id(start); 1011 const unsigned long end_block_id = phys_to_block_id(start + size - 1); 1012 struct memory_block *mem; 1013 unsigned long block_id; 1014 int ret = 0; 1015 1016 if (!size) 1017 return 0; 1018 1019 for (block_id = start_block_id; block_id <= end_block_id; block_id++) { 1020 mem = find_memory_block_by_id(block_id); 1021 if (!mem) 1022 continue; 1023 1024 ret = func(mem, arg); 1025 put_device(&mem->dev); 1026 if (ret) 1027 break; 1028 } 1029 return ret; 1030 } 1031 1032 struct for_each_memory_block_cb_data { 1033 walk_memory_blocks_func_t func; 1034 void *arg; 1035 }; 1036 1037 static int for_each_memory_block_cb(struct device *dev, void *data) 1038 { 1039 struct memory_block *mem = to_memory_block(dev); 1040 struct for_each_memory_block_cb_data *cb_data = data; 1041 1042 return cb_data->func(mem, cb_data->arg); 1043 } 1044 1045 /** 1046 * for_each_memory_block - walk through all present memory blocks 1047 * 1048 * @arg: argument passed to func 1049 * @func: callback for each memory block walked 1050 * 1051 * This function walks through all present memory blocks, calling func on 1052 * each memory block. 1053 * 1054 * In case func() returns an error, walking is aborted and the error is 1055 * returned. 1056 */ 1057 int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) 1058 { 1059 struct for_each_memory_block_cb_data cb_data = { 1060 .func = func, 1061 .arg = arg, 1062 }; 1063 1064 return bus_for_each_dev(&memory_subsys, NULL, &cb_data, 1065 for_each_memory_block_cb); 1066 } 1067 1068 /* 1069 * This is an internal helper to unify allocation and initialization of 1070 * memory groups. Note that the passed memory group will be copied to a 1071 * dynamically allocated memory group. After this call, the passed 1072 * memory group should no longer be used. 
1073 */ 1074 static int memory_group_register(struct memory_group group) 1075 { 1076 struct memory_group *new_group; 1077 uint32_t mgid; 1078 int ret; 1079 1080 if (!node_possible(group.nid)) 1081 return -EINVAL; 1082 1083 new_group = kzalloc(sizeof(group), GFP_KERNEL); 1084 if (!new_group) 1085 return -ENOMEM; 1086 *new_group = group; 1087 INIT_LIST_HEAD(&new_group->memory_blocks); 1088 1089 ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b, 1090 GFP_KERNEL); 1091 if (ret) { 1092 kfree(new_group); 1093 return ret; 1094 } else if (group.is_dynamic) { 1095 xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC); 1096 } 1097 return mgid; 1098 } 1099 1100 /** 1101 * memory_group_register_static() - Register a static memory group. 1102 * @nid: The node id. 1103 * @max_pages: The maximum number of pages we'll have in this static memory 1104 * group. 1105 * 1106 * Register a new static memory group and return the memory group id. 1107 * All memory in the group belongs to a single unit, such as a DIMM. All 1108 * memory belonging to a static memory group is added in one go to be removed 1109 * in one go -- it's static. 1110 * 1111 * Returns an error if out of memory, if the node id is invalid, if no new 1112 * memory groups can be registered, or if max_pages is invalid (0). Otherwise, 1113 * returns the new memory group id. 1114 */ 1115 int memory_group_register_static(int nid, unsigned long max_pages) 1116 { 1117 struct memory_group group = { 1118 .nid = nid, 1119 .s = { 1120 .max_pages = max_pages, 1121 }, 1122 }; 1123 1124 if (!max_pages) 1125 return -EINVAL; 1126 return memory_group_register(group); 1127 } 1128 EXPORT_SYMBOL_GPL(memory_group_register_static); 1129 1130 /** 1131 * memory_group_register_dynamic() - Register a dynamic memory group. 1132 * @nid: The node id. 1133 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic 1134 * memory group. 
1135 * 1136 * Register a new dynamic memory group and return the memory group id. 1137 * Memory within a dynamic memory group is added/removed dynamically 1138 * in unit_pages. 1139 * 1140 * Returns an error if out of memory, if the node id is invalid, if no new 1141 * memory groups can be registered, or if unit_pages is invalid (0, not a 1142 * power of two, smaller than a single memory block). Otherwise, returns the 1143 * new memory group id. 1144 */ 1145 int memory_group_register_dynamic(int nid, unsigned long unit_pages) 1146 { 1147 struct memory_group group = { 1148 .nid = nid, 1149 .is_dynamic = true, 1150 .d = { 1151 .unit_pages = unit_pages, 1152 }, 1153 }; 1154 1155 if (!unit_pages || !is_power_of_2(unit_pages) || 1156 unit_pages < PHYS_PFN(memory_block_size_bytes())) 1157 return -EINVAL; 1158 return memory_group_register(group); 1159 } 1160 EXPORT_SYMBOL_GPL(memory_group_register_dynamic); 1161 1162 /** 1163 * memory_group_unregister() - Unregister a memory group. 1164 * @mgid: the memory group id 1165 * 1166 * Unregister a memory group. If any memory block still belongs to this 1167 * memory group, unregistering will fail. 1168 * 1169 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some 1170 * memory blocks still belong to this memory group and returns 0 if 1171 * unregistering succeeded. 1172 */ 1173 int memory_group_unregister(int mgid) 1174 { 1175 struct memory_group *group; 1176 1177 if (mgid < 0) 1178 return -EINVAL; 1179 1180 group = xa_load(&memory_groups, mgid); 1181 if (!group) 1182 return -EINVAL; 1183 if (!list_empty(&group->memory_blocks)) 1184 return -EBUSY; 1185 xa_erase(&memory_groups, mgid); 1186 kfree(group); 1187 return 0; 1188 } 1189 EXPORT_SYMBOL_GPL(memory_group_unregister); 1190 1191 /* 1192 * This is an internal helper only to be used in core memory hotplug code to 1193 * lookup a memory group. 
We don't care about locking, as we don't expect a 1194 * memory group to get unregistered while adding memory to it -- because 1195 * the group and the memory is managed by the same driver. 1196 */ 1197 struct memory_group *memory_group_find_by_id(int mgid) 1198 { 1199 return xa_load(&memory_groups, mgid); 1200 } 1201 1202 /* 1203 * This is an internal helper only to be used in core memory hotplug code to 1204 * walk all dynamic memory groups excluding a given memory group, either 1205 * belonging to a specific node, or belonging to any node. 1206 */ 1207 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, 1208 struct memory_group *excluded, void *arg) 1209 { 1210 struct memory_group *group; 1211 unsigned long index; 1212 int ret = 0; 1213 1214 xa_for_each_marked(&memory_groups, index, group, 1215 MEMORY_GROUP_MARK_DYNAMIC) { 1216 if (group == excluded) 1217 continue; 1218 #ifdef CONFIG_NUMA 1219 if (nid != NUMA_NO_NODE && group->nid != nid) 1220 continue; 1221 #endif /* CONFIG_NUMA */ 1222 ret = func(group, arg); 1223 if (ret) 1224 break; 1225 } 1226 return ret; 1227 } 1228 1229 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) 1230 void memblk_nr_poison_inc(unsigned long pfn) 1231 { 1232 const unsigned long block_id = pfn_to_block_id(pfn); 1233 struct memory_block *mem = find_memory_block_by_id(block_id); 1234 1235 if (mem) 1236 atomic_long_inc(&mem->nr_hwpoison); 1237 } 1238 1239 void memblk_nr_poison_sub(unsigned long pfn, long i) 1240 { 1241 const unsigned long block_id = pfn_to_block_id(pfn); 1242 struct memory_block *mem = find_memory_block_by_id(block_id); 1243 1244 if (mem) 1245 atomic_long_sub(i, &mem->nr_hwpoison); 1246 } 1247 1248 static unsigned long memblk_nr_poison(struct memory_block *mem) 1249 { 1250 return atomic_long_read(&mem->nr_hwpoison); 1251 } 1252 #endif 1253