// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
#include <linux/sched/sysctl.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. A memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_NUMA_BALANCING
/**
 * folio_use_access_time - check if a folio reuses cpupid for page access time
 * @folio: folio to check
 *
 * The folio's _last_cpupid field is repurposed by memory tiering. In memory
 * tiering mode, the cpupid of a slow memory folio (not toptier memory) is
 * used to record the page access time.
 *
 * Return: true if the folio's _last_cpupid is used to record page access
 * time, false otherwise.
 */
bool folio_use_access_time(struct folio *folio)
{
	return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
	       !node_is_toptier(folio_nid(folio));
}
#endif

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}
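
/*
 * Illustrative sketch, not driver logic: how an abstract distance maps to a
 * tier device. The exact constants live in <linux/memory-tiers.h>; the
 * numbers below assume the current MEMTIER_CHUNK_BITS == 10 and
 * MEMTIER_ADISTANCE_DRAM (4.5 chunks) definitions there.
 *
 *	adistance = MEMTIER_ADISTANCE_DRAM;			// 4.5 chunks
 *	adistance = round_down(adistance, MEMTIER_CHUNK_SIZE);	// 4 chunks
 *	// adistance_start == 4 * MEMTIER_CHUNK_SIZE
 *	// dev.id == adistance >> MEMTIER_CHUNK_BITS == 4, so the default
 *	// DRAM tier shows up as memory_tier4 in sysfs.
 */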

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node, but that
	 * would require another variable in node_demotion[] to record the
	 * last selected target, which may cause cache ping-pong as that
	 * variable changes. Per-CPU state would avoid the caching issue but
	 * is more complicated, so random selection is the better choice for
	 * now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
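
/*
 * Usage sketch (illustrative only): with the Example 1 topology in the
 * node_demotion[] comment above, node_demotion[0].preferred == 2, so
 * next_demotion_node(0) returns 2, while next_demotion_node(2) returns
 * NUMA_NO_NODE because node 2 sits in the lowest tier and has no demotion
 * target.
 */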

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory
		 * tier nodelist to the skip list so that we find the best
		 * node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		if (nodes_and(tier_nodes, node_states[N_CPU], tier_nodes)) {
			/*
			 * abstract distance below the max value of this memtier
			 * is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from lower_tier nodes.
		 * This will remove all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
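
/*
 * Worked example (illustrative only), using the Example 1 topology from the
 * node_demotion[] comment above (nodes 0-1 in the DRAM tier, nodes 2-3 in
 * the PMEM tier):
 *
 *	node_demotion[0].preferred	= 2	(closest lower-tier node)
 *	node_demotion[1].preferred	= 3
 *	DRAM tier lower_tier_mask	= 2-3	(demotion fallback)
 *	PMEM tier lower_tier_mask	= <empty>
 *
 * top_tier_adistance covers the DRAM tier (it contains CPUs), so only
 * nodes 0-1 are toptier and promotion candidates.
 */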

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take a memtype device reference once, so that
	 * changing a node memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * the RCU read section to finish using synchronize_rcu.
	 * This also enables us to free the destroyed memory tier
	 * with kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);
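
/*
 * Usage sketch (illustrative only, not part of this file): a driver that
 * onlines slower memory would typically keep its own memory_types list and
 * pair these helpers as below. The names drv_memory_types and drv_adist are
 * hypothetical.
 *
 *	static LIST_HEAD(drv_memory_types);
 *
 *	// before onlining node nid with abstract distance drv_adist:
 *	mtype = mt_find_alloc_memory_type(drv_adist, &drv_memory_types);
 *	if (!IS_ERR(mtype))
 *		init_node_memory_type(nid, mtype);
 *
 *	// on teardown:
 *	clear_node_memory_type(nid, mtype);
 *	mt_put_memory_types(&drv_memory_types);
 */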

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * memory nodes, both with and without CPUs. After the initialization of
 * firmware and devices, adistance algorithms are expected to be provided.
 */
static int __init memory_tier_late_init(void)
{
	int nid;
	struct memory_tier *memtier;

	get_online_mems();
	guard(mutex)(&memory_tier_lock);

	/* Assign each uninitialized N_MEMORY node to a memory tier. */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized
		 * memory tiers, potentially bringing memory nodes
		 * online and configuring memory tiers.
		 * Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		memtier = set_node_memory_tier(nid);
		if (IS_ERR(memtier))
			continue;
	}

	establish_demotion_targets();
	put_online_mems();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). It will be used
	 * as the base to calculate the abstract distance of other memory
	 * nodes.
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d from %s:\n",
			default_dram_perf_ref_nid, default_dram_perf_ref_source);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d from %s:\n", nid, source);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
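
/*
 * Worked example (illustrative numbers only): if a node's read + write
 * latency is twice that of the reference DRAM node and its read + write
 * bandwidth is half of it, the formula above yields
 * MEMTIER_ADISTANCE_DRAM * 2 * 2, i.e. an abstract distance four times that
 * of default DRAM, placing the node in a slower (higher adistance) tier.
 */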

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has provided a result. The prototype of the notifier
 * function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and return it via
 * @adist. Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct node_notify *nn = _arg;

	switch (action) {
	case NODE_REMOVED_LAST_MEMORY:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(nn->nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case NODE_ADDED_FIRST_MEMORY:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(nn->nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif

	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	mutex_unlock(&memory_tier_lock);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/* Record nodes with memory and CPU to set default DRAM performance. */
	nodes_and(default_dram_nodes, node_states[N_MEMORY],
		  node_states[N_CPU]);

	hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;
	bool before = numa_demotion_enabled;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	/*
	 * Reset kswapd_failures statistics. They may no longer be
	 * valid since the policy for kswapd has changed.
	 */
	if (before == false && numa_demotion_enabled == true) {
		struct pglist_data *pgdat;

		for_each_online_pgdat(pgdat)
			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
	}

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif
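
/*
 * Sysfs usage sketch (paths assume the standard sysfs mount at /sys; the
 * memory_tier4 name assumes the default DRAM adistance, see the sketch
 * after find_create_memory_tier() above):
 *
 *	# enable demotion during reclaim
 *	echo 1 > /sys/kernel/mm/numa/demotion_enabled
 *
 *	# list the nodes that belong to a given memory tier
 *	cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
 *
 * The "numa" kobject and "demotion_enabled" attribute are created by
 * numa_init_sysfs() above; the memory_tierN devices come from the
 * "memory_tiering" virtual bus registered in memory_tier_init(), with N
 * being the tier's adistance_start >> MEMTIER_CHUNK_BITS.
 */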