/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
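/*
 * A minimal sketch of how a consumer of the chains above might look.
 * The callback and notifier_block names here are hypothetical, not part
 * of this file; the MEM_* actions, struct memory_notify, NOTIFY_OK and
 * notifier_from_errno() come from linux/memory.h and linux/notifier.h:
 *
 *	static int example_mem_callback(struct notifier_block *self,
 *					unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// veto the offline if mn->start_pfn..mn->nr_pages
 *			// is still in use by returning an errno
 *			return notifier_from_errno(-EBUSY);
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *		default:
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_callback,
 *	};
 *
 *	// in init code: register_memory_notifier(&example_mem_nb);
 */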
/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;

	error = device_register(&memory->dev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}

/*
 * Show the memory block index, i.e. the block's first section number
 * divided by sections_per_block.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable.
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We look up the page once per section
	 * and assume memmap is contiguous within each section.
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}
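/*
 * Worked example of the section/block arithmetic used throughout this
 * file. The concrete numbers are illustrative only: SECTION_SIZE_BITS,
 * and hence MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS), are
 * per-architecture.
 *
 * With 128MB sections (SECTION_SIZE_BITS == 27) and a
 * memory_block_size_bytes() of 2GB:
 *
 *	sections_per_block = 2GB / 128MB = 16
 *	pages per block    = PAGES_PER_SECTION * 16
 *			     (= 524288 with 4K pages)
 *
 * so /sys/devices/system/memory/memoryN spans sections
 * 16*N .. 16*N + 15.
 */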
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
	start_pfn = page_to_pfn(first_page);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn, nr_pages))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
			"%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int __memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret;

	mutex_lock(&mem->state_mutex);
	ret = __memory_block_change_state(mem, to_state, from_state_req);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
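/*
 * store_mem_state() above implements the user-visible hotplug switch,
 * as documented in Documentation/memory-hotplug.txt. Illustrative shell
 * usage (the block number depends on the machine):
 *
 *	# cat /sys/devices/system/memory/memory8/state
 *	online
 *	# echo offline > /sys/devices/system/memory/memory8/state
 *	# cat /sys/devices/system/memory/memory8/state
 *	offline
 */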
/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU (field-replaceable unit),
 * i.e. do these ranges belong to the same physical device,
 * such that if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name) \
	device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name) \
	device_remove_file(&mem->dev, &dev_attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	/* the probed address must be aligned to a memory block */
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	/* the input is a physical address; convert it to a pfn */
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}
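/*
 * Illustrative usage of the soft/hard offline interfaces implemented
 * here (the address below is made up; both files take a *physical
 * address*, which the store handlers shift down to a pfn):
 *
 *	# echo 0x3440000000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x3440000000 > /sys/devices/system/memory/hard_offline_page
 */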
/*
 * Forcibly offline a page, including killing processes.
 */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = device_create_file(memory_subsys.dev_root,
				&dev_attr_soft_offline_page);
	if (!err)
		err = device_create_file(memory_subsys.dev_root,
					&dev_attr_hard_offline_page);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}
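/*
 * A sketch of the hint-passing pattern the function above enables when
 * walking consecutive sections: feeding the previous result back in as
 * the hint lets the driver core resume its search from that device, and
 * find_memory_block_hinted() drops the hint's reference for us. Only
 * the final reference must be dropped by the caller (the
 * do_something_with() helper is hypothetical):
 *
 *	struct memory_block *mem = NULL;
 *	unsigned long nr;
 *
 *	for (nr = start; nr <= end; nr++) {
 *		if (!present_section_nr(nr))
 *			continue;
 *		mem = find_memory_block_hinted(__nr_to_section(nr), mem);
 *		if (mem)
 *			do_something_with(mem);
 *	}
 *	if (mem)
 *		kobject_put(&mem->dev.kobj);
 */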
/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			struct memory_block **mem_p,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = NULL;
	int scn_nr = __section_nr(section);
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	if (context == BOOT) {
		/* same memory block? */
		if (mem_p && *mem_p &&
		    scn_nr >= (*mem_p)->start_section_nr &&
		    scn_nr <= (*mem_p)->end_section_nr) {
			mem = *mem_p;
			kobject_get(&mem->dev.kobj);
		}
	} else
		mem = find_memory_block(section);

	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else {
		ret = init_memory_block(&mem, section, state);
		/* store the memory_block pointer for the next loop iteration */
		if (!ret && context == BOOT && mem_p)
			*mem_p = mem;
	}

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * We need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
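/*
 * register_new_memory()/unregister_memory_section() are the sysfs half
 * of section hotplug. A rough sketch of where they sit in the add path
 * (the exact call chain varies by kernel version and architecture):
 *
 *	add_memory()
 *	  arch_add_memory()
 *	    __add_pages()		// once per section
 *	      sparse_add_one_section()
 *	      register_new_memory()	// creates/extends memoryN in sysfs
 */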
/*
 * Offline one memory block. If the memory block is already offline,
 * do nothing.
 */
int offline_memory_block(struct memory_block *mem)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);
	if (mem->state != MEM_OFFLINE)
		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;
	struct memory_block *mem = NULL;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized.
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		/* don't need to reuse memory_block if only one per block */
		err = add_memory_section(0, __nr_to_section(i),
					 (sections_per_block == 1) ? NULL : &mem,
					 MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
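/*
 * For reference, the sysfs layout this file ends up creating. The probe
 * and *_offline_page entries are config-dependent, and the block number
 * below is illustrative:
 *
 *	/sys/devices/system/memory/block_size_bytes
 *	/sys/devices/system/memory/probe		(CONFIG_ARCH_MEMORY_PROBE)
 *	/sys/devices/system/memory/soft_offline_page	(CONFIG_MEMORY_FAILURE)
 *	/sys/devices/system/memory/hard_offline_page	(CONFIG_MEMORY_FAILURE)
 *	/sys/devices/system/memory/memory0/{phys_index,end_phys_index,
 *					    state,phys_device,removable}
 */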