// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
#include <linux/sched/sysctl.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
nodemask_t default_dram_nodes __initdata = NODE_MASK_NONE;

static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_NUMA_BALANCING
/**
 * folio_use_access_time - check if a folio reuses cpupid for page access time
 * @folio: folio to check
 *
 * A folio's _last_cpupid field is repurposed by memory tiering. In memory
 * tiering mode, the cpupid of a slow memory folio (not top-tier memory) is
 * used to record the page access time.
 *
 * Return: true if the folio's _last_cpupid is used to record page access
 * time, false otherwise.
 */
bool folio_use_access_time(struct folio *folio)
{
	return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
	       !node_is_toptier(folio_nid(folio));
}
#endif

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, nodes 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
static const char *default_dram_perf_ref_source;

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
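	 * Tiers are keyed by the chunk-aligned adistance computed above;
	 * e.g., assuming MEMTIER_CHUNK_SIZE is 256, every adistance in
	 * 512..767 rounds down to 512 and therefore maps to the same tier.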
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 * @allowed_mask: The pointer to allowed node mask
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
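 *
 * Minimal calling sketch (hedged: the caller and its folio are
 * hypothetical, only the calling convention is taken from this file):
 *
 *	nodemask_t allowed = node_states[N_MEMORY];
 *	int target = next_demotion_node(folio_nid(folio), &allowed);
 *
 * A NUMA_NO_NODE return means there is no suitable demotion target and
 * the caller should stop demoting.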
 */
int next_demotion_node(int node, const nodemask_t *allowed_mask)
{
	struct demotion_nodes *nd;
	nodemask_t mask;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/* Filter out nodes that are not in allowed_mask. */
	nodes_and(mask, nd->preferred, *allowed_mask);
	rcu_read_unlock();

	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node, but
	 * that would require another field in node_demotion[] to record
	 * the last selected target node, which may cause cache ping-pong
	 * as it changes. Per-CPU data could avoid that caching issue but
	 * seems more complicated. So selecting the target node randomly
	 * seems better for now.
	 */
	if (!nodes_empty(mask))
		return node_random(&mask);

	/*
	 * Preferred nodes are not in allowed_mask. Use the complement of
	 * allowed_mask as the used node mask, then use it to get the
	 * closest demotion target.
	 */
	nodes_complement(mask, *allowed_mask);
	return find_next_best_node(node, &mask);
}

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static void dump_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		struct memory_tier *memtier = __node_get_memory_tier(node);
		nodemask_t preferred = node_demotion[node].preferred;

		if (!memtier)
			continue;

		if (nodes_empty(preferred))
			pr_info("Demotion targets for Node %d: null\n", node);
		else
			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
				node, nodemask_pr_args(&preferred),
				nodemask_pr_args(&memtier->lower_tier_mask));
	}
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
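		 * (In Example 1 near the top of this file, the next lower
		 * tier for DRAM nodes 0-1 is the PMEM tier holding nodes
		 * 2-3, so those become the candidate demotion targets.)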
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory tier
		 * nodelist to the skip list so that we find the best node
		 * from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		if (nodes_and(tier_nodes, node_states[N_CPU], tier_nodes)) {
			/*
			 * abstract distance below the max value of this memtier
			 * is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from lower_tier nodes.
		 * This removes all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take the memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
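	 *
	 * (Illustrative example of the scheme above: if two devices with
	 * the same memtype are added to one node, map_count ends up at 2
	 * while only a single kref reference is held on the memtype.)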
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype = default_dram_type;
	int adist = MEMTIER_ADISTANCE_DRAM;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	mt_calc_adistance(node, &adist);
	if (!node_memory_types[node].memtype) {
		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
		if (IS_ERR(memtype)) {
			memtype = default_dram_type;
			pr_info("Failed to allocate a memory type. Fall back.\n");
		}
	}

	__init_node_memory_type(node, memtype);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * the rcu read section to finish using synchronize_rcu.
	 * This also enables us to free the destroyed memory tier
	 * with kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype || !memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		memtype = node_memory_types[node].memtype;
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);

struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
{
	struct memory_dev_type *mtype;

	list_for_each_entry(mtype, memory_types, list)
		if (mtype->adistance == adist)
			return mtype;

	mtype = alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return mtype;

	list_add(&mtype->list, memory_types);

	return mtype;
}
EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);

void mt_put_memory_types(struct list_head *memory_types)
{
	struct memory_dev_type *mtype, *mtn;

	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
		list_del(&mtype->list);
		put_memory_type(mtype);
	}
}
EXPORT_SYMBOL_GPL(mt_put_memory_types);

/*
 * This is invoked via `late_initcall()` to initialize memory tiers for
 * memory nodes, both with and without CPUs. After the initialization of
 * firmware and devices, adistance algorithms are expected to be provided.
 */
static int __init memory_tier_late_init(void)
{
	int nid;
	struct memory_tier *memtier;

	get_online_mems();
	guard(mutex)(&memory_tier_lock);

	/* Assign each uninitialized N_MEMORY node to a memory tier. */
	for_each_node_state(nid, N_MEMORY) {
		/*
		 * Some device drivers may have initialized
		 * memory tiers, potentially bringing memory nodes
		 * online and configuring memory tiers.
		 * Exclude them here.
		 */
		if (node_memory_types[nid].memtype)
			continue;

		memtier = set_node_memory_tier(nid);
		if (IS_ERR(memtier))
			continue;
	}

	establish_demotion_targets();
	put_online_mems();

	return 0;
}
late_initcall(memory_tier_late_init);

static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}

int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be the
	 * same (that is, the variation is less than 10%). It will be used
	 * as the base to calculate the abstract distance of other memory
	 * nodes.
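	 *
	 * (Worked example of the 10% check below, with made-up numbers:
	 * if the reference read latency is 100 units, a node whose read
	 * latency differs from it by more than 10 units satisfies
	 * "abs(delta) * 10 > reference" and is treated as a mismatch;
	 * the same rule applies to each of the four coordinates.)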
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d from %s:\n",
			default_dram_perf_ref_nid, default_dram_perf_ref_source);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d from %s:\n", nid, source);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}

int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write). The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);

/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm(). To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless some higher priority
 * algorithm has provided the result. The prototype of the notifier
 * function is as follows:
 *
 *	int (*algorithm_notifier)(struct notifier_block *nb,
 *				  unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist"). If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
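 *
 * A minimal notifier sketch (hedged: the driver-side names below are
 * hypothetical and only illustrate the convention described above):
 *
 *	static int my_adist_notifier(struct notifier_block *nb,
 *				     unsigned long nid, void *data)
 *	{
 *		int *adist = data;
 *
 *		if (!my_driver_manages_node(nid))
 *			return NOTIFY_OK;	// let other algorithms try
 *		*adist = MEMTIER_ADISTANCE_DRAM * 2;	// illustrative value
 *		return NOTIFY_STOP;
 *	}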
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);

/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);

/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and returns it via
 * @adist. Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct node_notify *nn = _arg;

	switch (action) {
	case NODE_REMOVED_LAST_MEMORY:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(nn->nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case NODE_ADDED_FIRST_MEMORY:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(nn->nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif

	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
						      &default_memory_types);
	mutex_unlock(&memory_tier_lock);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/* Record nodes with memory and CPU to set default DRAM performance. */
	nodes_and(default_dram_nodes, node_states[N_MEMORY],
		  node_states[N_CPU]);

	hotplug_node_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n", str_true_false(numa_demotion_enabled));
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;
	bool before = numa_demotion_enabled;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	/*
	 * Reset kswapd_failures statistics. They may no longer be
	 * valid since the policy for kswapd has changed.
	 */
	if (before == false && numa_demotion_enabled == true) {
		struct pglist_data *pgdat;

		for_each_online_pgdat(pgdat)
			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
	}

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif