// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local xarray to avoid a costly linear
 * search for the corresponding device on the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
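 *
 * The value shown is the memory block id (start_section_nr / sections_per_block),
 * which register_memory() below also uses as the device id, i.e. the N in the
 * memoryN sysfs directory name.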
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep the accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
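	 * If offline_pages() fails further down, the present-page accounting
	 * is restored again before the error is returned.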
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment.
 * Since a memory block spans complete storage increments nowadays, this
 * interface is basically unused. Other archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *default_zone;
	int len = 0;
	int nid;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * A block that contains more than one zone can not be
		 * offlined. This can happen e.g. for ZONE_DMA and ZONE_DMA32.
		 */
		default_zone = test_pages_in_a_zone(start_pfn,
						    start_pfn + nr_pages);
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	nid = mem->nid;
	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
					  nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.
 * The fake hot-add code as well as ppc64 will do all of their discovery
 * in userspace and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
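 *
 * The returned memory block has its device reference acquired (via
 * find_memory_block_by_id()); the caller is expected to drop it with
 * put_device() when done.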
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	unsigned long block_id = memory_block_id(__section_nr(section));

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret) {
		put_device(&memory->dev);
		device_unregister(&memory->dev);
	}
	return ret;
}

static int init_memory_block(unsigned long block_id, unsigned long state,
			     unsigned long nr_vmemmap_pages,
			     struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	ret = register_memory(mem);

	return ret;
}

static int add_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return init_memory_block(memory_block_id(base_section_nr),
				 MEM_ONLINE, 0, NULL);
}

static void unregister_memory(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
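 *
 * If creating one of the block devices fails, any block devices created so
 * far are unregistered again before the error is returned.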
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
					group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			unregister_memory(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		unregister_memory_block_under_nodes(mem);
		unregister_memory(mem);
	}
}

/* Return true if the memory block is offlined, otherwise return false. */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
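 *
 * Errors here are treated as fatal: the function panics, as the system cannot
 * sensibly continue without the memory subsystem registered.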
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
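 *
 * Returns the newly allocated memory group id on success, or a negative
 * errno on failure.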
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed from this
 *		dynamic memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
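 *
 * Illustrative pairing with the registration helpers above (a sketch only,
 * not taken from an in-tree user):
 *
 *	mgid = memory_group_register_static(nid, max_pages);
 *	if (mgid < 0)
 *		return mgid;
 *	...	hot(un)plug the memory belonging to the group	...
 *	memory_group_unregister(mgid);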
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}