// SPDX-License-Identifier: GPL-2.0
/*
 * Basic Node interface support
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
#include <linux/hugetlb.h>
#include <linux/compaction.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/swap.h>
#include <linux/slab.h>

static const struct bus_type node_subsys = {
	.name = "node",
	.dev_name = "node",
};

static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
				  const struct bin_attribute *attr, char *buf,
				  loff_t off, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct node *node_dev = to_node(dev);
	cpumask_var_t mask;
	ssize_t n;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return 0;

	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
	n = cpumap_print_bitmask_to_buf(buf, mask, off, count);
	free_cpumask_var(mask);

	return n;
}

static const BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);

static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
				   const struct bin_attribute *attr, char *buf,
				   loff_t off, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct node *node_dev = to_node(dev);
	cpumask_var_t mask;
	ssize_t n;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return 0;

	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
	n = cpumap_print_list_to_buf(buf, mask, off, count);
	free_cpumask_var(mask);

	return n;
}

static const BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
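
/*
 * Illustration (not part of the file itself): with the "node" subsystem and
 * "node" device name declared above, these binary attributes appear under
 * /sys/devices/system/node/nodeN/.  Reading them from userspace might look
 * like this on a hypothetical 4-CPU single-node box:
 *
 *	$ cat /sys/devices/system/node/node0/cpulist
 *	0-3
 *	$ cat /sys/devices/system/node/node0/cpumap
 *	f
 *
 * Only CPUs that are both in the node and currently online are reported,
 * per the cpumask_and() with cpu_online_mask above.
 */
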
/**
 * struct node_access_nodes - Access class device to hold user visible
 *			      relationships to other nodes.
 * @dev:	Device for this memory access class
 * @list_node:	List element in the node's access list
 * @access:	The access class rank
 * @coord:	Heterogeneous memory performance coordinates
 */
struct node_access_nodes {
	struct device		dev;
	struct list_head	list_node;
	unsigned int		access;
#ifdef CONFIG_HMEM_REPORTING
	struct access_coordinate	coord;
#endif
};
#define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)

static struct attribute *node_init_access_node_attrs[] = {
	NULL,
};

static struct attribute *node_targ_access_node_attrs[] = {
	NULL,
};

static const struct attribute_group initiators = {
	.name	= "initiators",
	.attrs	= node_init_access_node_attrs,
};

static const struct attribute_group targets = {
	.name	= "targets",
	.attrs	= node_targ_access_node_attrs,
};

static const struct attribute_group *node_access_node_groups[] = {
	&initiators,
	&targets,
	NULL,
};

static void node_remove_accesses(struct node *node)
{
	struct node_access_nodes *c, *cnext;

	list_for_each_entry_safe(c, cnext, &node->access_list, list_node) {
		list_del(&c->list_node);
		device_unregister(&c->dev);
	}
}

static void node_access_release(struct device *dev)
{
	kfree(to_access_nodes(dev));
}

static struct node_access_nodes *node_init_node_access(struct node *node,
					enum access_coordinate_class access)
{
	struct node_access_nodes *access_node;
	struct device *dev;

	list_for_each_entry(access_node, &node->access_list, list_node)
		if (access_node->access == access)
			return access_node;

	access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
	if (!access_node)
		return NULL;

	access_node->access = access;
	dev = &access_node->dev;
	dev->parent = &node->dev;
	dev->release = node_access_release;
	dev->groups = node_access_node_groups;
	if (dev_set_name(dev, "access%u", access))
		goto free;

	if (device_register(dev))
		goto free_name;

	pm_runtime_no_callbacks(dev);
	list_add_tail(&access_node->list_node, &node->access_list);
	return access_node;
free_name:
	kfree_const(dev->kobj.name);
free:
	kfree(access_node);
	return NULL;
}

#ifdef CONFIG_HMEM_REPORTING
#define ACCESS_ATTR(property)						\
static ssize_t property##_show(struct device *dev,			\
			       struct device_attribute *attr,		\
			       char *buf)				\
{									\
	return sysfs_emit(buf, "%u\n",					\
			  to_access_nodes(dev)->coord.property);	\
}									\
static DEVICE_ATTR_RO(property)

ACCESS_ATTR(read_bandwidth);
ACCESS_ATTR(read_latency);
ACCESS_ATTR(write_bandwidth);
ACCESS_ATTR(write_latency);

static struct attribute *access_attrs[] = {
	&dev_attr_read_bandwidth.attr,
	&dev_attr_read_latency.attr,
	&dev_attr_write_bandwidth.attr,
	&dev_attr_write_latency.attr,
	NULL,
};

/**
 * node_set_perf_attrs - Set the performance values for given access class
 * @nid: Node identifier to be set
 * @coord: Heterogeneous memory performance coordinates
 * @access: The access class for the given attributes
 */
void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord,
			 enum access_coordinate_class access)
{
	struct node_access_nodes *c;
	struct node *node;
	int i;

	if (WARN_ON_ONCE(!node_online(nid)))
		return;

	node = node_devices[nid];
	c = node_init_node_access(node, access);
	if (!c)
		return;

	c->coord = *coord;
	for (i = 0; access_attrs[i] != NULL; i++) {
		if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i],
					    "initiators")) {
			pr_info("failed to add performance attribute to node %d\n",
				nid);
			break;
		}
	}
}
EXPORT_SYMBOL_GPL(node_set_perf_attrs);
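
/*
 * Illustrative sketch (not taken from an in-tree caller; the numbers are
 * invented): a platform driver that has parsed bandwidth/latency figures
 * for node 1 could publish them for the CPU access class like so.  Units
 * follow struct access_coordinate (latency in nanoseconds, bandwidth in
 * MB/s):
 *
 *	struct access_coordinate coord = {
 *		.read_bandwidth		= 10000,
 *		.write_bandwidth	= 10000,
 *		.read_latency		= 100,
 *		.write_latency		= 100,
 *	};
 *
 *	node_set_perf_attrs(1, &coord, ACCESS_COORDINATE_CPU);
 *
 * The values then show up under node1/accessN/initiators/ as the
 * read_bandwidth, read_latency, write_bandwidth and write_latency files.
 */
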
/**
 * struct node_cache_info - Internal tracking for memory node caches
 * @dev:	Device representing the cache level
 * @node:	List element for tracking in the node
 * @cache_attrs: Attributes for this cache level
 */
struct node_cache_info {
	struct device dev;
	struct list_head node;
	struct node_cache_attrs cache_attrs;
};
#define to_cache_info(device) container_of(device, struct node_cache_info, dev)

#define CACHE_ATTR(name, fmt)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr,		\
			   char *buf)					\
{									\
	return sysfs_emit(buf, fmt "\n",				\
			  to_cache_info(dev)->cache_attrs.name);	\
}									\
static DEVICE_ATTR_RO(name);

CACHE_ATTR(size, "%llu")
CACHE_ATTR(line_size, "%u")
CACHE_ATTR(indexing, "%u")
CACHE_ATTR(write_policy, "%u")
CACHE_ATTR(address_mode, "%#x")

static struct attribute *cache_attrs[] = {
	&dev_attr_indexing.attr,
	&dev_attr_size.attr,
	&dev_attr_line_size.attr,
	&dev_attr_write_policy.attr,
	&dev_attr_address_mode.attr,
	NULL,
};
ATTRIBUTE_GROUPS(cache);

static void node_cache_release(struct device *dev)
{
	kfree(dev);
}

static void node_cacheinfo_release(struct device *dev)
{
	struct node_cache_info *info = to_cache_info(dev);

	kfree(info);
}

static void node_init_cache_dev(struct node *node)
{
	struct device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return;

	device_initialize(dev);
	dev->parent = &node->dev;
	dev->release = node_cache_release;
	if (dev_set_name(dev, "memory_side_cache"))
		goto put_device;

	if (device_add(dev))
		goto put_device;

	pm_runtime_no_callbacks(dev);
	node->cache_dev = dev;
	return;
put_device:
	put_device(dev);
}

/**
 * node_add_cache() - add cache attribute to a memory node
 * @nid: Node identifier that has new cache attributes
 * @cache_attrs: Attributes for the cache being added
 */
void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs)
{
	struct node_cache_info *info;
	struct device *dev;
	struct node *node;

	if (!node_online(nid) || !node_devices[nid])
		return;

	node = node_devices[nid];
	list_for_each_entry(info, &node->cache_attrs, node) {
		if (info->cache_attrs.level == cache_attrs->level) {
			dev_warn(&node->dev,
				 "attempt to add duplicate cache level:%d\n",
				 cache_attrs->level);
			return;
		}
	}

	if (!node->cache_dev)
		node_init_cache_dev(node);
	if (!node->cache_dev)
		return;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return;

	dev = &info->dev;
	device_initialize(dev);
	dev->parent = node->cache_dev;
	dev->release = node_cacheinfo_release;
	dev->groups = cache_groups;
	if (dev_set_name(dev, "index%d", cache_attrs->level))
		goto put_device;

	info->cache_attrs = *cache_attrs;
	if (device_add(dev)) {
		dev_warn(&node->dev, "failed to add cache level:%d\n",
			 cache_attrs->level);
		goto put_device;
	}
	pm_runtime_no_callbacks(dev);
	list_add_tail(&info->node, &node->cache_attrs);
	return;
put_device:
	put_device(dev);
}
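
/*
 * Illustrative sketch (the values are invented): a firmware-table parser
 * describing a 64 MiB direct-mapped, write-back memory-side cache in front
 * of node 2 could report it as
 *
 *	struct node_cache_attrs cache = {
 *		.level		= 1,
 *		.size		= SZ_64M,
 *		.line_size	= 64,
 *		.indexing	= NODE_CACHE_DIRECT_MAP,
 *		.write_policy	= NODE_CACHE_WRITE_BACK,
 *	};
 *
 *	node_add_cache(2, &cache);
 *
 * which creates /sys/devices/system/node/node2/memory_side_cache/index1/
 * carrying the size, line_size, indexing and write_policy attributes above.
 */
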
static void node_remove_caches(struct node *node)
{
	struct node_cache_info *info, *next;

	if (!node->cache_dev)
		return;

	list_for_each_entry_safe(info, next, &node->cache_attrs, node) {
		list_del(&info->node);
		device_unregister(&info->dev);
	}
	device_unregister(node->cache_dev);
}

static void node_init_caches(unsigned int nid)
{
	INIT_LIST_HEAD(&node_devices[nid]->cache_attrs);
}
#else
static void node_init_caches(unsigned int nid) { }
static void node_remove_caches(struct node *node) { }
#endif

#define K(x) ((x) << (PAGE_SHIFT - 10))
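/*
 * K() converts a page count to kilobytes: a page is 2^PAGE_SHIFT bytes and a
 * kilobyte is 2^10 bytes, so shifting left by (PAGE_SHIFT - 10) multiplies by
 * the number of kilobytes per page.  For example, with 4 KiB pages
 * (PAGE_SHIFT == 12), K(x) == x << 2 == x * 4, so 25 pages report as 100 kB.
 */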
static ssize_t node_read_meminfo(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	int len = 0;
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct sysinfo i;
	unsigned long sreclaimable, sunreclaimable;
	unsigned long swapcached = 0;

	si_meminfo_node(&i, nid);
	sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
	sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
#ifdef CONFIG_SWAP
	swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
#endif
	len = sysfs_emit_at(buf, len,
			    "Node %d MemTotal:       %8lu kB\n"
			    "Node %d MemFree:        %8lu kB\n"
			    "Node %d MemUsed:        %8lu kB\n"
			    "Node %d SwapCached:     %8lu kB\n"
			    "Node %d Active:         %8lu kB\n"
			    "Node %d Inactive:       %8lu kB\n"
			    "Node %d Active(anon):   %8lu kB\n"
			    "Node %d Inactive(anon): %8lu kB\n"
			    "Node %d Active(file):   %8lu kB\n"
			    "Node %d Inactive(file): %8lu kB\n"
			    "Node %d Unevictable:    %8lu kB\n"
			    "Node %d Mlocked:        %8lu kB\n",
			    nid, K(i.totalram),
			    nid, K(i.freeram),
			    nid, K(i.totalram - i.freeram),
			    nid, K(swapcached),
			    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
				   node_page_state(pgdat, NR_ACTIVE_FILE)),
			    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
				   node_page_state(pgdat, NR_INACTIVE_FILE)),
			    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
			    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
			    nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
			    nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
			    nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
			    nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));

#ifdef CONFIG_HIGHMEM
	len += sysfs_emit_at(buf, len,
			     "Node %d HighTotal:      %8lu kB\n"
			     "Node %d HighFree:       %8lu kB\n"
			     "Node %d LowTotal:       %8lu kB\n"
			     "Node %d LowFree:        %8lu kB\n",
			     nid, K(i.totalhigh),
			     nid, K(i.freehigh),
			     nid, K(i.totalram - i.totalhigh),
			     nid, K(i.freeram - i.freehigh));
#endif
	len += sysfs_emit_at(buf, len,
			     "Node %d Dirty:          %8lu kB\n"
			     "Node %d Writeback:      %8lu kB\n"
			     "Node %d FilePages:      %8lu kB\n"
			     "Node %d Mapped:         %8lu kB\n"
			     "Node %d AnonPages:      %8lu kB\n"
			     "Node %d Shmem:          %8lu kB\n"
			     "Node %d KernelStack:    %8lu kB\n"
#ifdef CONFIG_SHADOW_CALL_STACK
			     "Node %d ShadowCallStack:%8lu kB\n"
#endif
			     "Node %d PageTables:     %8lu kB\n"
			     "Node %d SecPageTables:  %8lu kB\n"
			     "Node %d NFS_Unstable:   %8lu kB\n"
			     "Node %d Bounce:         %8lu kB\n"
			     "Node %d WritebackTmp:   %8lu kB\n"
			     "Node %d KReclaimable:   %8lu kB\n"
			     "Node %d Slab:           %8lu kB\n"
			     "Node %d SReclaimable:   %8lu kB\n"
			     "Node %d SUnreclaim:     %8lu kB\n"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			     "Node %d AnonHugePages:  %8lu kB\n"
			     "Node %d ShmemHugePages: %8lu kB\n"
			     "Node %d ShmemPmdMapped: %8lu kB\n"
			     "Node %d FileHugePages:  %8lu kB\n"
			     "Node %d FilePmdMapped:  %8lu kB\n"
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
			     "Node %d Unaccepted:     %8lu kB\n"
#endif
			     ,
			     nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
			     nid, K(node_page_state(pgdat, NR_WRITEBACK)),
			     nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
			     nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
			     nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
			     nid, K(i.sharedram),
			     nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
			     nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
			     nid, K(node_page_state(pgdat, NR_PAGETABLE)),
			     nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
			     nid, 0UL,
			     nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
			     nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
			     nid, K(sreclaimable +
				    node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
			     nid, K(sreclaimable + sunreclaimable),
			     nid, K(sreclaimable),
			     nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			     ,
			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
			     ,
			     nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED))
#endif
			    );
	len += hugetlb_report_node_meminfo(buf, len, nid);
	return len;
}

#undef K
static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL);

static ssize_t node_read_numastat(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	fold_vm_numa_events();
	return sysfs_emit(buf,
			  "numa_hit %lu\n"
			  "numa_miss %lu\n"
			  "numa_foreign %lu\n"
			  "interleave_hit %lu\n"
			  "local_node %lu\n"
			  "other_node %lu\n",
			  sum_zone_numa_event_state(dev->id, NUMA_HIT),
			  sum_zone_numa_event_state(dev->id, NUMA_MISS),
			  sum_zone_numa_event_state(dev->id, NUMA_FOREIGN),
			  sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT),
			  sum_zone_numa_event_state(dev->id, NUMA_LOCAL),
			  sum_zone_numa_event_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL);

static ssize_t node_read_vmstat(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int i;
	int len = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		len += sysfs_emit_at(buf, len, "%s %lu\n",
				     zone_stat_name(i),
				     sum_zone_node_page_state(nid, i));

#ifdef CONFIG_NUMA
	fold_vm_numa_events();
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
		len += sysfs_emit_at(buf, len, "%s %lu\n",
				     numa_stat_name(i),
				     sum_zone_numa_event_state(nid, i));

#endif
	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		unsigned long pages = node_page_state_pages(pgdat, i);

		if (vmstat_item_print_in_thp(i))
			pages /= HPAGE_PMD_NR;
		len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
				     pages);
	}

	return len;
}
static DEVICE_ATTR(vmstat, 0444, node_read_vmstat, NULL);
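
/*
 * For reference, reading the numastat attribute above yields one counter per
 * line, e.g. (illustrative numbers):
 *
 *	$ cat /sys/devices/system/node/node0/numastat
 *	numa_hit 512840
 *	numa_miss 0
 *	numa_foreign 0
 *	interleave_hit 7840
 *	local_node 512500
 *	other_node 340
 *
 * fold_vm_numa_events() is called first so that pending per-CPU NUMA event
 * deltas are folded into the zone counters before they are summed.
 */
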
static ssize_t node_read_distance(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	int len = 0;
	int i;

	/*
	 * buf is currently PAGE_SIZE in length and each node needs 4 chars
	 * at the most (distance + space or newline).
	 */
	BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);

	for_each_online_node(i) {
		len += sysfs_emit_at(buf, len, "%s%d",
				     i ? " " : "", node_distance(nid, i));
	}

	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR(distance, 0444, node_read_distance, NULL);

static struct attribute *node_dev_attrs[] = {
	&dev_attr_meminfo.attr,
	&dev_attr_numastat.attr,
	&dev_attr_distance.attr,
	&dev_attr_vmstat.attr,
	NULL
};

static const struct bin_attribute *node_dev_bin_attrs[] = {
	&bin_attr_cpumap,
	&bin_attr_cpulist,
	NULL
};

static const struct attribute_group node_dev_group = {
	.attrs = node_dev_attrs,
	.bin_attrs_new = node_dev_bin_attrs,
};

static const struct attribute_group *node_dev_groups[] = {
	&node_dev_group,
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
	&arch_node_dev_group,
#endif
#ifdef CONFIG_MEMORY_FAILURE
	&memory_failure_attr_group,
#endif
	NULL
};

static void node_device_release(struct device *dev)
{
	kfree(to_node(dev));
}

/*
 * register_node - Setup a sysfs device for a node.
 * @num - Node number to use when creating the device.
 *
 * Initialize and register the node device.
 */
static int register_node(struct node *node, int num)
{
	int error;

	node->dev.id = num;
	node->dev.bus = &node_subsys;
	node->dev.release = node_device_release;
	node->dev.groups = node_dev_groups;
	error = device_register(&node->dev);

	if (error) {
		put_device(&node->dev);
	} else {
		hugetlb_register_node(node);
		compaction_register_node(node);
	}

	return error;
}

/**
 * unregister_node - unregister a node device
 * @node: node going away
 *
 * Unregisters a node device @node.  All the devices on the node must be
 * unregistered before calling this function.
 */
void unregister_node(struct node *node)
{
	hugetlb_unregister_node(node);
	compaction_unregister_node(node);
	node_remove_accesses(node);
	node_remove_caches(node);
	device_unregister(&node->dev);
}

struct node *node_devices[MAX_NUMNODES];

/*
 * register cpu under node
 */
int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	int ret;
	struct device *obj;

	if (!node_online(nid))
		return 0;

	obj = get_cpu_device(cpu);
	if (!obj)
		return 0;

	ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
				&obj->kobj,
				kobject_name(&obj->kobj));
	if (ret)
		return ret;

	return sysfs_create_link(&obj->kobj,
				 &node_devices[nid]->dev.kobj,
				 kobject_name(&node_devices[nid]->dev.kobj));
}

/**
 * register_memory_node_under_compute_node - link memory node to its compute
 *					     node for a given access class.
 * @mem_nid:	Memory node number
 * @cpu_nid:	Cpu node number
 * @access:	Access class to register
 *
 * Description:
 *	For use with platforms that may have separate memory and compute nodes.
 *	This function will export node relationships linking which memory
 *	initiator nodes can access memory targets at a given ranked access
 *	class.
 */
int register_memory_node_under_compute_node(unsigned int mem_nid,
					    unsigned int cpu_nid,
					    enum access_coordinate_class access)
{
	struct node *init_node, *targ_node;
	struct node_access_nodes *initiator, *target;
	int ret;

	if (!node_online(cpu_nid) || !node_online(mem_nid))
		return -ENODEV;

	init_node = node_devices[cpu_nid];
	targ_node = node_devices[mem_nid];
	initiator = node_init_node_access(init_node, access);
	target = node_init_node_access(targ_node, access);
	if (!initiator || !target)
		return -ENOMEM;

	ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
				      &targ_node->dev.kobj,
				      dev_name(&targ_node->dev));
	if (ret)
		return ret;

	ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
				      &init_node->dev.kobj,
				      dev_name(&init_node->dev));
	if (ret)
		goto err;

	return 0;
err:
	sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
				     dev_name(&targ_node->dev));
	return ret;
}
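
/*
 * Worked example (hypothetical topology): on a system where CPU node 0 is
 * the initiator for a CPU-less memory node 1,
 *
 *	register_memory_node_under_compute_node(1, 0, ACCESS_COORDINATE_CPU);
 *
 * cross-links the two nodes' access class devices, roughly:
 *
 *	/sys/devices/system/node/node0/access0/targets/node1
 *	/sys/devices/system/node/node1/access0/initiators/node0
 *
 * (Shown loosely; sysfs computes the actual relative symlink targets.)
 */
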
int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	struct device *obj;

	if (!node_online(nid))
		return 0;

	obj = get_cpu_device(cpu);
	if (!obj)
		return 0;

	sysfs_remove_link(&node_devices[nid]->dev.kobj,
			  kobject_name(&obj->kobj));
	sysfs_remove_link(&obj->kobj,
			  kobject_name(&node_devices[nid]->dev.kobj));

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static int __ref get_nid_for_pfn(unsigned long pfn)
{
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	if (system_state < SYSTEM_RUNNING)
		return early_pfn_to_nid(pfn);
#endif
	return pfn_to_nid(pfn);
}

static void do_register_memory_block_under_node(int nid,
						struct memory_block *mem_blk,
						enum meminit_context context)
{
	int ret;

	memory_block_add_nid(mem_blk, nid, context);

	ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
				       &mem_blk->dev.kobj,
				       kobject_name(&mem_blk->dev.kobj));
	if (ret && ret != -EEXIST)
		dev_err_ratelimited(&node_devices[nid]->dev,
				    "can't create link to %s in sysfs (%d)\n",
				    kobject_name(&mem_blk->dev.kobj), ret);

	ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj,
				       &node_devices[nid]->dev.kobj,
				       kobject_name(&node_devices[nid]->dev.kobj));
	if (ret && ret != -EEXIST)
		dev_err_ratelimited(&mem_blk->dev,
				    "can't create link to %s in sysfs (%d)\n",
				    kobject_name(&node_devices[nid]->dev.kobj),
				    ret);
}

/* register memory section under specified node if it spans that node */
static int register_mem_block_under_node_early(struct memory_block *mem_blk,
					       void *arg)
{
	unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
	unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
	unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
	int nid = *(int *)arg;
	unsigned long pfn;

	for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
		int page_nid;

		/*
		 * The memory block could have several absent sections from the
		 * start; skip the pfn range belonging to an absent section.
		 */
		if (!pfn_in_present_section(pfn)) {
			pfn = round_down(pfn + PAGES_PER_SECTION,
					 PAGES_PER_SECTION) - 1;
			continue;
		}

		/*
		 * We need to check which node a page belongs to only in the
		 * boot case, because nodes' ranges can be interleaved.
		 */
		page_nid = get_nid_for_pfn(pfn);
		if (page_nid < 0)
			continue;
		if (page_nid != nid)
			continue;

		do_register_memory_block_under_node(nid, mem_blk, MEMINIT_EARLY);
		return 0;
	}
	/* mem section does not span the specified node */
	return 0;
}
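
/*
 * A worked example of the absent-section skip above, assuming 4 KiB pages
 * and 128 MiB sections (PAGES_PER_SECTION == 32768, the x86-64 default): if
 * pfn 100 sits in an absent section, round_down(100 + 32768, 32768) - 1 ==
 * 32767, so after the loop's pfn++ the scan resumes at pfn 32768, the first
 * page of the next section.
 */
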
/*
 * During hotplug we know that all pages in the memory block belong to the same
 * node.
 */
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
						 void *arg)
{
	int nid = *(int *)arg;

	do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG);
	return 0;
}

/*
 * Unregister a memory block device under the node it spans. Memory blocks
 * with multiple nodes cannot be offlined and therefore also never be removed.
 */
void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
	if (mem_blk->nid == NUMA_NO_NODE)
		return;

	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
			  kobject_name(&mem_blk->dev.kobj));
	sysfs_remove_link(&mem_blk->dev.kobj,
			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}

void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
				       unsigned long end_pfn,
				       enum meminit_context context)
{
	walk_memory_blocks_func_t func;

	if (context == MEMINIT_HOTPLUG)
		func = register_mem_block_under_node_hotplug;
	else
		func = register_mem_block_under_node_early;

	walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
			   (void *)&nid, func);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

int __register_one_node(int nid)
{
	int error;
	int cpu;
	struct node *node;

	node = kzalloc(sizeof(struct node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	INIT_LIST_HEAD(&node->access_list);
	node_devices[nid] = node;

	error = register_node(node_devices[nid], nid);

	/* link cpu under this node */
	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == nid)
			register_cpu_under_node(cpu, nid);
	}

	node_init_caches(nid);

	return error;
}

void unregister_one_node(int nid)
{
	if (!node_devices[nid])
		return;

	unregister_node(node_devices[nid]);
	node_devices[nid] = NULL;
}

/*
 * node states attributes
 */

struct node_attr {
	struct device_attribute attr;
	enum node_states state;
};

static ssize_t show_node_state(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct node_attr *na = container_of(attr, struct node_attr, attr);

	return sysfs_emit(buf, "%*pbl\n",
			  nodemask_pr_args(&node_states[na->state]));
}

#define _NODE_ATTR(name, state)					\
	{ __ATTR(name, 0444, show_node_state, NULL), state }
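
/*
 * Each _NODE_ATTR() entry below becomes a read-only file at the subsystem
 * root rather than under a particular node.  For instance (illustrative
 * output for a two-node machine where only node 0 has CPUs):
 *
 *	$ cat /sys/devices/system/node/possible
 *	0-1
 *	$ cat /sys/devices/system/node/has_cpu
 *	0
 *
 * The "%*pbl" format in show_node_state() prints the nodemask as such a
 * ranged list.
 */
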
static struct node_attr node_state_attr[] = {
	[N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
	[N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
	[N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
	[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
					   N_GENERIC_INITIATOR),
};

static struct attribute *node_state_attrs[] = {
	&node_state_attr[N_POSSIBLE].attr.attr,
	&node_state_attr[N_ONLINE].attr.attr,
	&node_state_attr[N_NORMAL_MEMORY].attr.attr,
#ifdef CONFIG_HIGHMEM
	&node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
	&node_state_attr[N_MEMORY].attr.attr,
	&node_state_attr[N_CPU].attr.attr,
	&node_state_attr[N_GENERIC_INITIATOR].attr.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = node_state_attrs,
};

static const struct attribute_group *cpu_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

void __init node_dev_init(void)
{
	int ret, i;

	BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
	BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs) - 1 != NR_NODE_STATES);

	ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create all node devices, which will properly link the node
	 * to applicable memory block devices and already created cpu devices.
	 */
	for_each_online_node(i) {
		ret = register_one_node(i);
		if (ret)
			panic("%s() failed to add node: %d\n", __func__, ret);
	}
}
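
/*
 * Putting it together: after node_dev_init() on a hypothetical two-node
 * machine, the hierarchy built by this file looks roughly like
 *
 *	/sys/devices/system/node/possible		(nodemask state files)
 *	/sys/devices/system/node/node0/meminfo		(per-node attributes)
 *	/sys/devices/system/node/node0/cpu0		(register_cpu_under_node() link)
 *	/sys/devices/system/node/node0/access0/		(access class devices)
 *	/sys/devices/system/node/node0/memory_side_cache/index1/
 *
 * with equivalent entries under node1.  Which entries actually exist depends
 * on the kernel config and on what the platform registers.
 */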