// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

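/*
 * Notifier chain of abstract distance algorithm providers; see
 * register_mt_adistance_algorithm() and mt_calc_adistance() below.
 */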
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu() in clear_node_memory_tier() makes sure
	 * we don't have RCU readers accessing this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

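/*
 * Each memory tier covers one MEMTIER_CHUNK_SIZE-wide abstract distance
 * range. As an illustrative example (assuming MEMTIER_CHUNK_BITS is 10, as
 * defined in <linux/memory-tiers.h> at the time of writing): memory types
 * with adistance values 4096..5119 all round down to adistance_start 4096
 * and therefore share one tier, exposed in sysfs as
 * /sys/devices/virtual/memory_tiering/memory_tier4 (dev.id = 4096 >> 10).
 */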
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
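/*
 * node_is_toptier() - return true when @node belongs to a top memory tier,
 * i.e. one whose abstract distance is at or below top_tier_adistance, so hot
 * page promotion does not apply to it. Nodes that have not been assigned a
 * memory tier yet are treated as top tier.
 */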
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to look up the next node for
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node,
	 * but that requires another variable in node_demotion[] to
	 * record the last selected target node, which may cause
	 * cache ping-pong as that value changes. Introducing per-cpu
	 * data to avoid the caching issue seems more complicated.
	 * So selecting the target node randomly seems better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, so it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory
		 * tier nodelist to the skip list so that we find the best
		 * node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask.
		 * We randomly select between nodes in the preferred mask
		 * when allocating pages during demotion.
		 */
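		/*
		 * Illustrative example: with the Example 1 topology above,
		 * node 0's lower tier holds nodes 2-3. find_next_best_node()
		 * returns node 2 first (distance 30); node 3 (distance 40)
		 * does not match the best distance, so the loop below stops
		 * and node_demotion[0].preferred ends up holding only node 2,
		 * while node 3 remains reachable via the fallback
		 * lower_tier_mask built further down.
		 */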
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * Abstract distances at or below the max value of
			 * this memtier are considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes.
		 * This will remove all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added on the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take a memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

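/*
 * Bind @node to a memory tier derived from its memory type, falling back to
 * the default DRAM type when no driver has registered a type for the node.
 * Called with memory_tier_lock held; on success the tier is published via
 * rcu_assign_pointer(pgdat->memtier).
 */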
static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * the RCU read section to finish using synchronize_rcu().
	 * This also enables us to free the destroyed memory tier
	 * with kfree() instead of kfree_rcu().
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices from this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

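/*
 * Minimal usage sketch for the exported memory type API (illustrative only;
 * my_memory_types and my_nid are hypothetical driver-side names):
 *
 *	static LIST_HEAD(my_memory_types);
 *
 *	mtype = mt_find_alloc_memory_type(adist, &my_memory_types);
 *	if (!IS_ERR(mtype))
 *		init_node_memory_type(my_nid, mtype);
 *	...
 *	clear_node_memory_type(my_nid, mtype);
 *	mt_put_memory_types(&my_memory_types);
 */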
struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * CPU-less memory nodes after driver initialization, which is
 * expected to provide `adistance` algorithms.
 */
static int __init memory_tier_late_init(void)
{
	int nid;

	guard(mutex)(&memory_tier_lock);
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized memory tiers
		 * between `memory_tier_init()` and `memory_tier_late_init()`,
		 * potentially bringing online memory nodes and
		 * configuring memory tiers. Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		set_node_memory_tier(nid);
	}

	establish_demotion_targets();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). And it will be
	 * used as the base to calculate the abstract distance of other
	 * memory nodes.
	 */
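	/*
	 * Illustrative example (integer arithmetic, so intermediate results
	 * are truncated): a node with twice the read + write latency and
	 * half the read + write bandwidth of the reference DRAM nodes gets
	 * an abstract distance of roughly 4 * MEMTIER_ADISTANCE_DRAM, i.e.
	 * a markedly slower tier.
	 */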
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d:\n",
			default_dram_perf_ref_nid);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d:\n", nid);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm().  To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has provided a result.  The prototype of the notifier
 * function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist").  If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);

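/*
 * Illustrative provider sketch (not part of this file; my_adist_notifier,
 * my_driver_owns_node() and my_calc_adistance() are hypothetical):
 *
 *	static int my_adist_notifier(struct notifier_block *nb,
 *				     unsigned long nid, void *data)
 *	{
 *		int *adist = data;
 *
 *		if (!my_driver_owns_node(nid))
 *			return NOTIFY_OK;	// let the next algorithm try
 *
 *		*adist = my_calc_adistance(nid);
 *		return NOTIFY_STOP;		// result provided in *adist
 *	}
 *
 *	static struct notifier_block my_adist_nb = {
 *		.notifier_call = my_adist_notifier,
 *	};
 *
 *	register_mt_adistance_algorithm(&my_adist_nb);
 */
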
/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provided the result, which is returned
 * via @adist.  Otherwise, no algorithm could provide the result and
 * @adist will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret, node;
	struct memory_tier *memtier;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to the
	 * default memory tier or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		if (!node_state(node, N_CPU))
			/*
			 * Defer memory tier initialization on CPU-less NUMA
			 * nodes. These will be initialized after firmware
			 * and devices are initialized.
			 */
			continue;

		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with the memtiers we were able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

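/*
 * Demotion is disabled by default. It can be toggled at run time through
 * the demotion_enabled sysfs knob created by numa_init_sysfs() below
 * (/sys/kernel/mm/numa/demotion_enabled).
 */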
bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif