/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <asm/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"
#define MIN_MEMORY_BLOCK_SIZE	(1 << SECTION_SIZE_BITS)

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct sysdev_class memory_sysdev_class = {
	.name = MEMORY_CLASS_NAME,
};

static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
{
	return MEMORY_CLASS_NAME;
}

static int memory_uevent(struct kset *kset, struct kobject *obj,
			struct kobj_uevent_env *env)
{
	int retval = 0;

	return retval;
}

static const struct kset_uevent_ops memory_uevent_ops = {
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
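/*
 * Example: a minimal memory hotplug notifier (an illustrative sketch,
 * not part of this file; the example_* names are hypothetical).  A
 * subsystem that caches per-pfn state would register a callback
 * roughly like this and receive a struct memory_notify describing the
 * affected range:
 *
 *	static int example_mem_callback(struct notifier_block *self,
 *					unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// drop references to pages in the range
 *			// [mn->start_pfn, mn->start_pfn + mn->nr_pages),
 *			// or veto with notifier_from_errno(-EBUSY)
 *			break;
 *		case MEM_ONLINE:
 *			// the new range is now usable
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_callback,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */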
/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = memory->start_section_nr / sections_per_block;

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
	sysdev_unregister(&memory->sysdev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
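/*
 * Worked example (assuming SECTION_SIZE_BITS == 27, as on x86_64):
 * MIN_MEMORY_BLOCK_SIZE is 1 << 27 = 128 MiB.  If an architecture
 * overrides memory_block_size_bytes() to return 2 GiB, then
 * sections_per_block = 2 GiB / 128 MiB = 16, i.e. each sysfs memory
 * block spans 16 contiguous mem_sections.  The 2 GiB figure is purely
 * illustrative.
 */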
/*
 * Show the block index corresponding to the first physical section
 * in this memory block.
 */
static ssize_t show_mem_start_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	int i;
	unsigned long start_pfn, start_paddr;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

	/*
	 * The probe routines leave the pages reserved, just
	 * as the bootmem code does.  Make sure they're still
	 * that way.
	 */
	if (action == MEM_ONLINE) {
		for (i = 0; i < nr_pages; i++) {
			if (PageReserved(first_page+i))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				phys_index, i);
			return -EBUSY;
		}
	}

	switch (action) {
	case MEM_ONLINE:
		start_pfn = page_to_pfn(first_page);
		ret = online_pages(start_pfn, nr_pages);
		break;
	case MEM_OFFLINE:
		start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
		ret = remove_memory(start_paddr,
				    nr_pages << PAGE_SHIFT);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
			"%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret)
		mem->state = from_state_req;
	else
		mem->state = to_state;

out:
	mutex_unlock(&mem->state_mutex);
	return ret;
}

static ssize_t
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
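/*
 * Example of driving the state machine from userspace (illustrative;
 * the block number 16 is arbitrary):
 *
 *	# cat /sys/devices/system/memory/memory16/state
 *	online
 *	# echo offline > /sys/devices/system/memory/memory16/state
 *
 * The write moves the block through MEM_GOING_OFFLINE to MEM_OFFLINE;
 * if offlining fails, the previous state is restored and the write
 * returns an error.
 */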
/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &attr_block_size_bytes.attr);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;

	phys_addr = simple_strtoull(buf, NULL, 0);

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &class_attr_probe.attr);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif
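/*
 * Example probe usage from userspace (illustrative; the address is
 * arbitrary and must be section-aligned):
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 *
 * This hot-adds one full memory block, one section at a time,
 * starting at physical address 1 GiB.
 */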
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
				&class_attr_soft_offline_page.attr);
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
					&class_attr_hard_offline_page.attr);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif
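/*
 * Example page-offline usage from userspace (illustrative; the address
 * is arbitrary).  Note that despite the variable name, both files take
 * a physical address, which the store functions shift down by
 * PAGE_SHIFT to get a pfn:
 *
 *	# echo 0x200000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x200000 > /sys/devices/system/memory/hard_offline_page
 *
 * Soft offline migrates the page's contents away first; hard offline
 * behaves like a real memory failure and may kill processes.
 */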
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
	int block_id = base_memory_block_id(__section_nr(section));

	kobj = hint ? &hint->sysdev.kobj : NULL;

	/*
	 * This only works because we know that block_id == sysdev->id;
	 * it is slightly redundant with sysdev_register().
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);

	kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		kobject_put(&mem->sysdev.kobj);
	} else
		ret = init_memory_block(&mem, section, state);

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->sysdev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
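/*
 * Worked example of the section-to-block mapping (continuing the
 * illustrative sections_per_block = 16 case from above): section 40
 * belongs to block 40 / 16 = 2, so find_memory_block() on any of
 * sections 32..47 resolves to the sysdev named "memory2", i.e.
 * /sys/devices/system/memory/memory2.
 */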
/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
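/*
 * The resulting sysfs layout (which files are present depends on the
 * config options, as noted):
 *
 *	/sys/devices/system/memory/
 *		block_size_bytes
 *		probe			(CONFIG_ARCH_MEMORY_PROBE)
 *		soft_offline_page	(CONFIG_MEMORY_FAILURE)
 *		hard_offline_page	(CONFIG_MEMORY_FAILURE)
 *		memoryN/
 *			phys_index
 *			end_phys_index
 *			state
 *			phys_device
 *			removable
 */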