/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = container_of(dev, struct memory_block, dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
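/*
 * Worked example (illustrative numbers only): with 128MB sections
 * (MIN_MEMORY_BLOCK_SIZE) and an architecture overriding
 * memory_block_size_bytes() to return 2GB, memory_dev_init() computes
 * sections_per_block = 2GB / 128MB = 16, so each memoryN sysfs directory
 * covers 16 consecutive sections.
 */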
/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the section of memory is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}
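/*
 * Minimal sketch of a client of the notifier chains above (hypothetical
 * foo_mem_callback; not part of this file).  It would be registered with
 * register_memory_notifier(), and memory_notify() above delivers the
 * MEM_* action plus a struct memory_notify describing the pfn range:
 *
 *	static int foo_mem_callback(struct notifier_block *nb,
 *				    unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			pr_info("range %lx+%lx going offline\n",
 *				mn->start_pfn, mn->nr_pages);
 *			break;
 *		case MEM_ONLINE:
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 */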
/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
	start_pfn = page_to_pfn(first_page);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int __memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret = 0;

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state, online_type);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret;

	mutex_lock(&mem->state_mutex);
	ret = __memory_block_change_state(mem, to_state, from_state_req,
					  online_type);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KERNEL);
	else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_MOVABLE);
	else if (!strncmp(buf, "online", min_t(int, count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KEEP);
	else if (!strncmp(buf, "offline", min_t(int, count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE,
						MEM_ONLINE, -1);

	if (ret)
		return ret;
	return count;
}
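/*
 * Example of driving store_mem_state()/show_mem_state() from userspace
 * (the block number is illustrative):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online > /sys/devices/system/memory/memory32/state
 *
 * "online_kernel" and "online_movable" ask online_pages() to place the
 * block in ZONE_NORMAL or ZONE_MOVABLE respectively; plain "online"
 * (ONLINE_KEEP) keeps the zone the block would naturally fall into.
 */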
/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}

static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif
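/*
 * Example usage of the writable root attributes defined above (addresses
 * are illustrative; both offline files take a physical address, which the
 * store functions shift down to a pfn):
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 *	# echo 0x200000000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x200000000 > /sys/devices/system/memory/hard_offline_page
 *
 * The probe file exists only with CONFIG_ARCH_MEMORY_PROBE, the offline
 * files only with CONFIG_MEMORY_FAILURE, and the address written to probe
 * must be aligned to the memory block size.
 */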
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
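/*
 * Callers of find_memory_block()/find_memory_block_hinted() receive a
 * reference on the embedded device and must drop it when done, as
 * remove_memory_block() below does.  Sketch (error handling elided):
 *
 *	struct memory_block *mem = find_memory_block(section);
 *
 *	if (mem) {
 *		... use mem ...
 *		kobject_put(&mem->dev.kobj);
 *	}
 */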
static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_end_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;

	error = device_register(&memory->dev);
	return error;
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			struct memory_block **mem_p,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = NULL;
	int scn_nr = __section_nr(section);
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	if (context == BOOT) {
		/* same memory block ? */
		if (mem_p && *mem_p)
			if (scn_nr >= (*mem_p)->start_section_nr &&
			    scn_nr <= (*mem_p)->end_section_nr) {
				mem = *mem_p;
				kobject_get(&mem->dev.kobj);
			}
	} else
		mem = find_memory_block(section);

	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else {
		ret = init_memory_block(&mem, section, state);
		/* store memory_block pointer for next loop */
		if (!ret && context == BOOT)
			if (mem_p)
				*mem_p = mem;
	}

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

static int remove_memory_block(unsigned long node_id,
			       struct mem_section *section, int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0)
		unregister_memory(mem);
	else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/*
 * offline one memory block. If the memory block has been offlined, do nothing.
 */
int offline_memory_block(struct memory_block *mem)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);
	if (mem->state != MEM_OFFLINE)
		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
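/*
 * The root attributes above end up directly under the subsystem root,
 * e.g. /sys/devices/system/memory/block_size_bytes, next to the per-block
 * memoryN/ directories created by register_memory().
 */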
/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;
	struct memory_block *mem = NULL;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		/* don't need to reuse memory_block if only one per block */
		err = add_memory_section(0, __nr_to_section(i),
					 (sections_per_block == 1) ? NULL : &mem,
					 MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}