/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct sysdev_class memory_sysdev_class = {
	.name = MEMORY_CLASS_NAME,
};

static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
{
	return MEMORY_CLASS_NAME;
}

static int memory_uevent(struct kset *kset, struct kobject *obj,
			struct kobj_uevent_env *env)
{
	int retval = 0;

	return retval;
}

static const struct kset_uevent_ops memory_uevent_ops = {
	.name	= memory_uevent_name,
	.uevent	= memory_uevent,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
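/*
 * Illustrative sketch (not part of this driver) of a typical client of
 * register_memory_notifier(). The function and variable names here are
 * hypothetical; the event values and struct memory_notify come from
 * linux/memory.h. Notifications fire from memory_notify() below.
 */
#if 0
static int example_memory_callback(struct notifier_block *self,
				   unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Release any pins on pages in the range
		 * [mn->start_pfn, mn->start_pfn + mn->nr_pages), or veto
		 * the offline by returning notifier_from_errno(-EBUSY).
		 */
		break;
	case MEM_ONLINE:
		/* The new range is now available for allocation. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_memory_nb = {
	.notifier_call = example_memory_callback,
};

/* in the client's init path: register_memory_notifier(&example_memory_nb); */
#endif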
/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = memory->start_section_nr / sections_per_block;

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
	sysdev_unregister(&memory->sysdev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
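/*
 * Illustrative sketch: an architecture that groups several sections into
 * one hot-pluggable unit overrides the __weak default above. The 1 GiB
 * value is an assumption for the example, not a real platform constant;
 * the only hard requirements, enforced by get_memory_block_size(), are
 * that the value is a power of two and at least MIN_MEMORY_BLOCK_SIZE.
 */
#if 0
unsigned long memory_block_size_bytes(void)
{
	return 1UL << 30;	/* 1 GiB blocks: many sections per block */
}
#endif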
/*
 * Show the first physical section index (offset) of this memory block.
 */
static ssize_t show_mem_start_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable.
 */
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We look up the page once per section
	 * and assume memmap is contiguous within each section.
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	unsigned long start_pfn, start_paddr;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

	switch (action) {
	case MEM_ONLINE:
		start_pfn = page_to_pfn(first_page);

		if (!pages_correctly_reserved(start_pfn, nr_pages))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages);
		break;
	case MEM_OFFLINE:
		start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
		ret = remove_memory(start_paddr,
				    nr_pages << PAGE_SHIFT);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret)
		mem->state = from_state_req;
	else
		mem->state = to_state;

out:
	mutex_unlock(&mem->state_mutex);
	return ret;
}

static ssize_t
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
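/*
 * Userspace drives the state machine above through sysfs; the block
 * number below is an example:
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *
 * Writing "offline" succeeds only if every page in the block can be
 * isolated; otherwise the write returns an error and the block is put
 * back in its previous state.
 */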
/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU (field-replaceable unit).
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name) \
	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name) \
	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &attr_block_size_bytes.attr);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &class_attr_probe.attr);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif
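/*
 * Example use of the probe interface (CONFIG_ARCH_MEMORY_PROBE only);
 * the physical address is an example and must be block-aligned:
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 *
 * This adds the memory block starting at that physical address in the
 * offline state; it still has to be onlined through its "state" file.
 */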
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;	/* input is a physical address, not a pfn */
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;	/* input is a physical address, not a pfn */
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
				&class_attr_soft_offline_page.attr);
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
					&class_attr_hard_offline_page.attr);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif
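/*
 * Example (address is hypothetical): both files take a *physical
 * address*, which is converted to a pfn above. Soft offline migrates
 * the page's contents away first; hard offline poisons the page in
 * place and may kill processes that map it.
 *
 *	# echo 0x7f000000 > /sys/devices/system/memory/soft_offline_page
 */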
/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
	int block_id = base_memory_block_id(__section_nr(section));

	kobj = hint ? &hint->sysdev.kobj : NULL;

	/*
	 * This only works because we know that block_id == sysdev->id;
	 * slightly redundant with sysdev_register().
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);

	kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
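/*
 * Illustrative sketch (hypothetical caller): find_memory_block() returns
 * the block with an extra kobject reference, taken by
 * kset_find_obj_hinted() above, which the caller must drop when done,
 * just as add_memory_section() below does.
 */
#if 0
static void example_poke_block(unsigned long section_nr)
{
	struct mem_section *section = __nr_to_section(section_nr);
	struct memory_block *mem = find_memory_block(section);

	if (!mem)
		return;
	/* ... inspect mem->state, mem->start_section_nr, etc. ... */
	kobject_put(&mem->sysdev.kobj);	/* drop the lookup reference */
}
#endif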
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		kobject_put(&mem->sysdev.kobj);
	} else
		ret = init_memory_block(&mem, section, state);

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->sysdev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
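/*
 * For reference, the sysfs layout this file creates (block number is an
 * example; per-class files depend on the config options noted):
 *
 *	/sys/devices/system/memory/block_size_bytes
 *	/sys/devices/system/memory/probe		(CONFIG_ARCH_MEMORY_PROBE)
 *	/sys/devices/system/memory/soft_offline_page	(CONFIG_MEMORY_FAILURE)
 *	/sys/devices/system/memory/hard_offline_page	(CONFIG_MEMORY_FAILURE)
 *	/sys/devices/system/memory/memory0/phys_index
 *	/sys/devices/system/memory/memory0/end_phys_index
 *	/sys/devices/system/memory/memory0/state
 *	/sys/devices/system/memory/memory0/phys_device
 *	/sys/devices/system/memory/memory0/removable
 */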