// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
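	/*
	 * Illustrative sketch (assumes MEMTIER_CHUNK_BITS == 8, i.e. a chunk
	 * size of 256; the real values live in <linux/memory-tiers.h>): a
	 * memtype registered with adistance 576 is rounded down to an
	 * adistance_start of 512, so it shares a tier with every other
	 * memtype whose adistance falls in 512..767, and the resulting tier
	 * device would be named memory_tier2 (512 >> MEMTIER_CHUNK_BITS).
	 */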
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also select the target node in a round-robin
	 * fashion, but that requires an extra variable in
	 * node_demotion[] to record the last selected target node,
	 * which may cause cache ping-pong as that value keeps
	 * changing. Per-CPU data could avoid the caching issue but
	 * seems more complicated. So selecting the target node
	 * randomly looks better for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, so it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory
		 * tier's nodelist to the skip list so that we find the best
		 * node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
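		/*
		 * Worked example (a sketch based on Example 1 at the top of
		 * this file): for node 0, the next lower tier is {2,3} with
		 * node_distance(0, 2) == 30 and node_distance(0, 3) == 40.
		 * The first pass picks node 2 and records best_distance = 30;
		 * the second pass finds node 3 at distance 40 and breaks, so
		 * node_demotion[0].preferred ends up as just {2}.
		 */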
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier, from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * Abstract distances below the max value of this
			 * memtier are considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * preferred node selected above.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes.
		 * This removes all nodes in the current and higher memory
		 * tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for the
	 * RCU read section to finish using synchronize_rcu(). This
	 * also enables us to free the destroyed memory tier with
	 * kfree() instead of kfree_rcu().
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices to this node,
	 * clear the node memory type.
	 */
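	/*
	 * For instance (illustrative only): if three devices on the same
	 * node were registered with the same memtype via
	 * init_node_memory_type(), map_count reached 3 while only a single
	 * kref was taken; the reference below is dropped only once the third
	 * clear_node_memory_type() call brings map_count back to zero.
	 */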
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * memory nodes, both with and without CPUs. After the initialization of
 * firmware and devices, adistance algorithms are expected to be provided.
 */
static int __init memory_tier_late_init(void)
{
	int nid;
	struct memory_tier *memtier;

	get_online_mems();
	guard(mutex)(&memory_tier_lock);

	/* Assign each uninitialized N_MEMORY node to a memory tier. */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized
		 * memory tiers, potentially bringing memory nodes
		 * online and configuring memory tiers.
		 * Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		memtier = set_node_memory_tier(nid);
		if (IS_ERR(memtier))
			continue;
	}

	establish_demotion_targets();
	put_online_mems();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info("%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). It will be used
	 * as the base to calculate the abstract distance of other memory
	 * nodes.
	 */
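	/*
	 * Worked example (illustrative numbers only): if the reference
	 * read_latency is 100, a node reporting 109 passes the check below
	 * (|109 - 100| * 10 == 90 <= 100), while a node reporting 115 trips
	 * it (|115 - 100| * 10 == 150 > 100), i.e. the tolerance is 10% of
	 * the reference value for each field.
	 */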
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info("memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
			"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info(" performance of reference DRAM node %d:\n",
			default_dram_perf_ref_nid);
		dump_hmem_attrs(&default_dram_perf, " ");
		pr_info(" performance of DRAM node %d:\n", nid);
		dump_hmem_attrs(perf, " ");
		pr_info(" disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has already provided a result. The prototype of the
 * notifier function is as follows:
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
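 *
 * A minimal provider might look like the following sketch (the function
 * name and the fixed value are hypothetical, shown only for illustration):
 *
 *   static int example_adist_notify(struct notifier_block *nb,
 *                                   unsigned long nid, void *data)
 *   {
 *           int *adist = data;
 *
 *           *adist = MEMTIER_ADISTANCE_DRAM * 2;
 *           return NOTIFY_STOP;
 *   }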
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provided the result, which is returned
 * via @adist. Otherwise, no algorithm could provide the result and
 * @adist is left unchanged.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif

	guard(mutex)(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/* Record nodes with memory and CPU to set default DRAM performance. */
	nodes_and(default_dram_nodes, node_states[N_MEMORY],
		  node_states[N_CPU]);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
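/*
 * The attribute below ends up under /sys/kernel/mm/numa/ (numa_kobj is
 * created under mm_kobj). As a usage sketch, demotion can be toggled at
 * run time with, for example:
 *
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 */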
"true" : "false"); 926 } 927 928 static ssize_t demotion_enabled_store(struct kobject *kobj, 929 struct kobj_attribute *attr, 930 const char *buf, size_t count) 931 { 932 ssize_t ret; 933 934 ret = kstrtobool(buf, &numa_demotion_enabled); 935 if (ret) 936 return ret; 937 938 return count; 939 } 940 941 static struct kobj_attribute numa_demotion_enabled_attr = 942 __ATTR_RW(demotion_enabled); 943 944 static struct attribute *numa_attrs[] = { 945 &numa_demotion_enabled_attr.attr, 946 NULL, 947 }; 948 949 static const struct attribute_group numa_attr_group = { 950 .attrs = numa_attrs, 951 }; 952 953 static int __init numa_init_sysfs(void) 954 { 955 int err; 956 struct kobject *numa_kobj; 957 958 numa_kobj = kobject_create_and_add("numa", mm_kobj); 959 if (!numa_kobj) { 960 pr_err("failed to create numa kobject\n"); 961 return -ENOMEM; 962 } 963 err = sysfs_create_group(numa_kobj, &numa_attr_group); 964 if (err) { 965 pr_err("failed to register numa group\n"); 966 goto delete_obj; 967 } 968 return 0; 969 970 delete_obj: 971 kobject_put(numa_kobj); 972 return err; 973 } 974 subsys_initcall(numa_init_sysfs); 975 #endif /* CONFIG_SYSFS */ 976 #endif 977