// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
#include <linux/sched/sysctl.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_NUMA_BALANCING
/**
 * folio_use_access_time - check if a folio reuses cpupid for page access time
 * @folio: folio to check
 *
 * folio's _last_cpupid field is repurposed by memory tiering. In memory
 * tiering mode, cpupid of slow memory folio (not toptier memory) is used to
 * record page access time.
 *
 * Return: true if the folio's _last_cpupid field is used to record page
 * access time.
 */
bool folio_use_access_time(struct folio *folio)
{
	return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
	       !node_is_toptier(folio_nid(folio));
}
#endif

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}
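
/*
 * Example (illustrative): each tier registered above lives on the
 * "memory_tiering" virtual bus with dev.id = adistance_start >>
 * MEMTIER_CHUNK_BITS, so a tier shows up in sysfs as
 * /sys/devices/virtual/memory_tiering/memory_tier<id>/nodelist. With the
 * default MEMTIER_ADISTANCE_DRAM value the DRAM tier is typically
 * memory_tier4; for the Example 1 topology above the output would be:
 *
 *	$ cat /sys/devices/virtual/memory_tiering/memory_tier4/nodelist
 *	0-1
 */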

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * Round-robin selection would be possible too, but it would
	 * require another field in node_demotion[] to record the last
	 * selected target node, which may cause cache ping-pong as that
	 * field changes. Per-CPU data could avoid the caching issue, but
	 * seems more complicated. So random selection is good enough
	 * for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
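
/*
 * Example (illustrative): for the Example 1 topology above,
 * node_demotion[0].preferred is {2}, so next_demotion_node(0) returns 2,
 * while next_demotion_node(2) returns NUMA_NO_NODE. Reclaim-driven
 * demotion (e.g. demote_folio_list() in mm/vmscan.c) uses this to pick
 * the target node when migrating cold pages one tier down.
 */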

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory
		 * tier's nodelist to the skip list, so that we find the best
		 * node within the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher memory tier
	 * only if the memory tier doesn't include compute. We want to skip
	 * promotion from a memory tier if any node that is part of that
	 * memory tier has CPUs. Once we detect such a memory tier, we
	 * consider it the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this
			 * memtier is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes;
		 * this will remove all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
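
/*
 * Example (illustrative): for the Example 1 topology above, the DRAM tier
 * (nodes 0-1) contains CPUs and therefore becomes the top tier; its
 * lower_tier_mask ends up as 2-3, while the PMEM tier's lower_tier_mask
 * stays empty. node_get_allowed_targets() hands that mask to the demotion
 * path as the fallback allocation nodemask.
 */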

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added to the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for the
	 * rcu read section to finish using synchronize_rcu. This
	 * also enables us to free the destroyed memory tier with
	 * kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices from this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);
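
/*
 * Usage sketch (illustrative; "example_*" names and the node id "nid" are
 * hypothetical, and the adistance constant is assumed to come from
 * <linux/memory-tiers.h>). A driver that onlines a slower memory node
 * would typically pair the calls like this:
 *
 *	static LIST_HEAD(example_memory_types);
 *
 *	mtype = mt_find_alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE,
 *					  &example_memory_types);
 *	if (!IS_ERR(mtype))
 *		init_node_memory_type(nid, mtype);
 *	...
 *	// on teardown
 *	clear_node_memory_type(nid, mtype);
 *	mt_put_memory_types(&example_memory_types);
 */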

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * memory nodes, both with and without CPUs. After the initialization of
 * firmware and devices, adistance algorithms are expected to be provided.
 */
static int __init memory_tier_late_init(void)
{
	int nid;
	struct memory_tier *memtier;

	get_online_mems();
	guard(mutex)(&memory_tier_lock);

	/* Assign each uninitialized N_MEMORY node to a memory tier. */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized
		 * memory tiers, potentially bringing memory nodes
		 * online and configuring memory tiers.
		 * Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		memtier = set_node_memory_tier(nid);
		if (IS_ERR(memtier))
			continue;
	}

	establish_demotion_targets();
	put_online_mems();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). It will be used
	 * as the base to calculate the abstract distance of other memory
	 * nodes.
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d from %s:\n",
			default_dram_perf_ref_nid, default_dram_perf_ref_source);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d from %s:\n", nid, source);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
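
/*
 * Worked example (illustrative numbers only, in the units carried by
 * struct access_coordinate): suppose the reference DRAM node has
 * read + write latency totalling 200 and read + write bandwidth
 * totalling 40000, and the node being scored has 400 and 20000. Then:
 *
 *	adist = MEMTIER_ADISTANCE_DRAM * 400 / 200 * 40000 / 20000
 *	      = MEMTIER_ADISTANCE_DRAM * 4
 *
 * i.e. doubling the latency and halving the bandwidth each double the
 * abstract distance, placing the node in a lower (slower) tier. The
 * expression is evaluated left to right with integer arithmetic, so each
 * multiplication happens before the following division to limit rounding
 * error.
 */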

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has already provided a result. The prototype of the
 * notifier function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
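
/*
 * Registration sketch (illustrative; the "example_*" names are
 * hypothetical, and the chosen adistance value is arbitrary). An
 * adistance provider registers a notifier whose callback fills in the
 * "int *adist" passed via "data" and returns NOTIFY_STOP once it has a
 * result:
 *
 *	static int example_adist_cb(struct notifier_block *nb,
 *				    unsigned long nid, void *data)
 *	{
 *		int *adist = data;
 *
 *		*adist = MEMTIER_ADISTANCE_DRAM * 2;
 *		return NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block example_adist_nb = {
 *		.notifier_call = example_adist_cb,
 *	};
 *
 *	register_mt_adistance_algorithm(&example_adist_nb);
 */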

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and return it via
 * @adist. Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif

	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	mutex_unlock(&memory_tier_lock);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/* Record nodes with memory and CPU to set default DRAM performance. */
	nodes_and(default_dram_nodes, node_states[N_MEMORY],
		  node_states[N_CPU]);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif
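
/*
 * Usage note (illustrative): with CONFIG_MIGRATION and CONFIG_SYSFS, the
 * attribute group above exposes the demotion switch under the mm kobject;
 * kstrtobool() accepts the usual boolean spellings, e.g.:
 *
 *	$ echo true > /sys/kernel/mm/numa/demotion_enabled
 *	$ cat /sys/kernel/mm/numa/demotion_enabled
 *	true
 */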