xref: /linux/mm/memory-tiers.c (revision 9e6d33937b42ca4867af3b341e5d09abca4a2746)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
8 #include <linux/notifier.h>
9 
10 #include "internal.h"
11 
/*
 * One memory tier: groups the memory types whose abstract distance,
 * rounded down to a multiple of MEMTIER_CHUNK_SIZE, equals
 * adistance_start.
 */
struct memory_tier {
	/* hierarchy of memory tiers; entry in the global memory_tiers list */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/* device registered on memory_tier_subsys; its release callback frees the tier */
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};
27 
/* Demotion state of one node; node_demotion[] holds one entry per node id. */
struct demotion_nodes {
	/* demotion candidates; next_demotion_node() picks one of them at random */
	nodemask_t preferred;
};
31 
/* Memory type bookkeeping for one node, see node_memory_types[]. */
struct node_memory_type_map {
	/* memory type assigned to the node, NULL when none is assigned */
	struct memory_dev_type *memtype;
	/*
	 * Number of registrations made for memtype through
	 * __init_node_memory_type(); clear_node_memory_type() decrements it.
	 */
	int map_count;
};
36 
/*
 * Held while the memory tier list, the node memory type table and
 * pgdat->memtier are updated.
 */
static DEFINE_MUTEX(memory_tier_lock);
/* All memory tiers, kept in ascending adistance_start order. */
static LIST_HEAD(memory_tiers);
/*
 * The list is used to store all memory types that are not created
 * by a device driver.
 */
static LIST_HEAD(default_memory_types);
/* Memory type bookkeeping for every node, indexed by node id. */
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
/* Memory type with MEMTIER_ADISTANCE_DRAM; set up in memory_tier_init(). */
struct memory_dev_type *default_dram_type;
46 
/*
 * Bus that memory tier devices are attached to. It is registered with
 * subsys_virtual_register() in memory_tier_init().
 */
static const struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};
51 
52 #ifdef CONFIG_MIGRATION
/*
 * Tiers whose adistance_start is not larger than this value are
 * reported as top tier by node_is_toptier(). Recomputed in
 * establish_demotion_targets().
 */
static int top_tier_adistance;
54 /*
55  * node_demotion[] examples:
56  *
57  * Example 1:
58  *
59  * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
60  *
61  * node distances:
62  * node   0    1    2    3
63  *    0  10   20   30   40
64  *    1  20   10   40   30
65  *    2  30   40   10   40
66  *    3  40   30   40   10
67  *
68  * memory_tiers0 = 0-1
69  * memory_tiers1 = 2-3
70  *
71  * node_demotion[0].preferred = 2
72  * node_demotion[1].preferred = 3
73  * node_demotion[2].preferred = <empty>
74  * node_demotion[3].preferred = <empty>
75  *
76  * Example 2:
77  *
78  * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
79  *
80  * node distances:
81  * node   0    1    2
82  *    0  10   20   30
83  *    1  20   10   30
84  *    2  30   30   10
85  *
86  * memory_tiers0 = 0-2
87  *
88  * node_demotion[0].preferred = <empty>
89  * node_demotion[1].preferred = <empty>
90  * node_demotion[2].preferred = <empty>
91  *
92  * Example 3:
93  *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
95  *
96  * node distances:
97  * node   0    1    2
98  *    0  10   20   30
99  *    1  20   10   40
100  *    2  30   40   10
101  *
102  * memory_tiers0 = 1
103  * memory_tiers1 = 0
104  * memory_tiers2 = 2
105  *
106  * node_demotion[0].preferred = 2
107  * node_demotion[1].preferred = 0
108  * node_demotion[2].preferred = <empty>
109  *
110  */
/*
 * Array with nr_node_ids entries, indexed by node id and allocated in
 * memory_tier_init(). It stays NULL when that allocation fails, which
 * is why users check the pointer before indexing it.
 */
static struct demotion_nodes *node_demotion __read_mostly;
112 #endif /* CONFIG_MIGRATION */
113 
/* Notifier chain holding the registered abstract distance algorithms. */
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);

/* The lock is used to protect `default_dram_perf*` info and nid. */
static DEFINE_MUTEX(default_dram_perf_lock);
/*
 * Set once a DRAM node's performance mismatches the reference;
 * afterwards the default DRAM perf helpers return -EIO.
 */
static bool default_dram_perf_error;
/* Performance of the reference DRAM node. */
static struct access_coordinate default_dram_perf;
/* Node the reference performance was recorded for; NUMA_NO_NODE until set. */
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
/* kstrdup() copy of the source string passed along with the reference. */
static const char *default_dram_perf_ref_source;
122 
123 static inline struct memory_tier *to_memory_tier(struct device *device)
124 {
125 	return container_of(device, struct memory_tier, dev);
126 }
127 
128 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
129 {
130 	nodemask_t nodes = NODE_MASK_NONE;
131 	struct memory_dev_type *memtype;
132 
133 	list_for_each_entry(memtype, &memtier->memory_types, tier_sibling)
134 		nodes_or(nodes, nodes, memtype->nodes);
135 
136 	return nodes;
137 }
138 
/*
 * Device release callback: free the memory_tier embedding @dev.
 *
 * A plain kfree() is sufficient here. clear_node_memory_tier() runs
 * synchronize_rcu() after unpublishing the tier, so no RCU reader can
 * still be looking at it.
 */
static void memory_tier_device_release(struct device *dev)
{
	kfree(to_memory_tier(dev));
}
148 
149 static ssize_t nodelist_show(struct device *dev,
150 			     struct device_attribute *attr, char *buf)
151 {
152 	int ret;
153 	nodemask_t nmask;
154 
155 	mutex_lock(&memory_tier_lock);
156 	nmask = get_memtier_nodemask(to_memory_tier(dev));
157 	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
158 	mutex_unlock(&memory_tier_lock);
159 	return ret;
160 }
161 static DEVICE_ATTR_RO(nodelist);
162 
/* Attributes of a memory tier device; NULL terminated. */
static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

/* Unnamed group wrapping memtier_dev_attrs. */
static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

/* Group list assigned to dev.groups of every memory tier device; NULL terminated. */
static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
176 
/*
 * Return the memory tier @memtype belongs to, creating and registering
 * a new tier device when no tier covers its abstract distance yet.
 *
 * The abstract distance of @memtype is rounded down to a multiple of
 * MEMTIER_CHUNK_SIZE and compared with the adistance_start of each
 * tier. A @memtype that is not linked to a tier yet is added to the
 * memory_types list of the tier that is returned.
 *
 * The caller must hold memory_tier_lock.
 *
 * Return: the memory tier on success; ERR_PTR(-EINVAL) if @memtype is
 * already linked but no tier matches; ERR_PTR(-ENOMEM) if the tier
 * cannot be allocated; ERR_PTR() of the device_register() error
 * otherwise.
 */
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	/* Tiers are keyed by the chunk-aligned start of their distance range. */
	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		/* A linked memtype without a matching tier is unexpected. */
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	/*
	 * Look for an existing tier with this start value. The walk stops
	 * at the first tier with a larger start value; a new tier has to
	 * go in front of it to keep the list in ascending order.
	 */
	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	/* Insert before the first larger tier, or append to the list. */
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	/* The device id is the index of the chunk the tier covers. */
	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		/*
		 * Drop the device reference rather than calling kfree()
		 * directly, so that the release callback set above frees
		 * the tier.
		 */
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibling, &memtier->memory_types);
	return memtier;
}
239 
240 static struct memory_tier *__node_get_memory_tier(int node)
241 {
242 	pg_data_t *pgdat;
243 
244 	pgdat = NODE_DATA(node);
245 	if (!pgdat)
246 		return NULL;
247 	/*
248 	 * Since we hold memory_tier_lock, we can avoid
249 	 * RCU read locks when accessing the details. No
250 	 * parallel updates are possible here.
251 	 */
252 	return rcu_dereference_check(pgdat->memtier,
253 				     lockdep_is_held(&memory_tier_lock));
254 }
255 
256 #ifdef CONFIG_MIGRATION
257 bool node_is_toptier(int node)
258 {
259 	bool toptier;
260 	pg_data_t *pgdat;
261 	struct memory_tier *memtier;
262 
263 	pgdat = NODE_DATA(node);
264 	if (!pgdat)
265 		return false;
266 
267 	rcu_read_lock();
268 	memtier = rcu_dereference(pgdat->memtier);
269 	if (!memtier) {
270 		toptier = true;
271 		goto out;
272 	}
273 	if (memtier->adistance_start <= top_tier_adistance)
274 		toptier = true;
275 	else
276 		toptier = false;
277 out:
278 	rcu_read_unlock();
279 	return toptier;
280 }
281 
282 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
283 {
284 	struct memory_tier *memtier;
285 
286 	/*
287 	 * pg_data_t.memtier updates includes a synchronize_rcu()
288 	 * which ensures that we either find NULL or a valid memtier
289 	 * in NODE_DATA. protect the access via rcu_read_lock();
290 	 */
291 	rcu_read_lock();
292 	memtier = rcu_dereference(pgdat->memtier);
293 	if (memtier)
294 		*targets = memtier->lower_tier_mask;
295 	else
296 		*targets = NODE_MASK_NONE;
297 	rcu_read_unlock();
298 }
299 
300 /**
301  * next_demotion_node() - Get the next node in the demotion path
302  * @node: The starting node to lookup the next node
303  *
304  * Return: node id for next memory node in the demotion path hierarchy
305  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
306  * @node online or guarantee that it *continues* to be the next demotion
307  * target.
308  */
309 int next_demotion_node(int node)
310 {
311 	struct demotion_nodes *nd;
312 	int target;
313 
314 	if (!node_demotion)
315 		return NUMA_NO_NODE;
316 
317 	nd = &node_demotion[node];
318 
319 	/*
320 	 * node_demotion[] is updated without excluding this
321 	 * function from running.
322 	 *
323 	 * Make sure to use RCU over entire code blocks if
324 	 * node_demotion[] reads need to be consistent.
325 	 */
326 	rcu_read_lock();
327 	/*
328 	 * If there are multiple target nodes, just select one
329 	 * target node randomly.
330 	 *
331 	 * In addition, we can also use round-robin to select
332 	 * target node, but we should introduce another variable
333 	 * for node_demotion[] to record last selected target node,
334 	 * that may cause cache ping-pong due to the changing of
335 	 * last target node. Or introducing per-cpu data to avoid
336 	 * caching issue, which seems more complicated. So selecting
337 	 * target node randomly seems better until now.
338 	 */
339 	target = node_random(&nd->preferred);
340 	rcu_read_unlock();
341 
342 	return target;
343 }
344 
345 static void disable_all_demotion_targets(void)
346 {
347 	struct memory_tier *memtier;
348 	int node;
349 
350 	for_each_node_state(node, N_MEMORY) {
351 		node_demotion[node].preferred = NODE_MASK_NONE;
352 		/*
353 		 * We are holding memory_tier_lock, it is safe
354 		 * to access pgda->memtier.
355 		 */
356 		memtier = __node_get_memory_tier(node);
357 		if (memtier)
358 			memtier->lower_tier_mask = NODE_MASK_NONE;
359 	}
360 	/*
361 	 * Ensure that the "disable" is visible across the system.
362 	 * Readers will see either a combination of before+disable
363 	 * state or disable+after.  They will never see before and
364 	 * after state together.
365 	 */
366 	synchronize_rcu();
367 }
368 
369 static void dump_demotion_targets(void)
370 {
371 	int node;
372 
373 	for_each_node_state(node, N_MEMORY) {
374 		struct memory_tier *memtier = __node_get_memory_tier(node);
375 		nodemask_t preferred = node_demotion[node].preferred;
376 
377 		if (!memtier)
378 			continue;
379 
380 		if (nodes_empty(preferred))
381 			pr_info("Demotion targets for Node %d: null\n", node);
382 		else
383 			pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
384 				node, nodemask_pr_args(&preferred),
385 				nodemask_pr_args(&memtier->lower_tier_mask));
386 	}
387 }
388 
/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK.  It might just indicate
 * being at the end of a chain.
 *
 * The function works in three passes: it fills the preferred mask of
 * every node from the next lower tier, recomputes top_tier_adistance,
 * and rebuilds lower_tier_mask of every tier. It does nothing when
 * node_demotion[] was never allocated. The caller must hold
 * memory_tier_lock.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	/* Start from a clean state; this also waits for RCU readers. */
	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		/* Nodes without a tier, or in the last tier, have no target. */
		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node, use 'used' nodemask as a skip list.
		 * Add all memory nodes except the selected memory tier
		 * nodelist to skip list so that we find the best node from the
		 * memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list of same best distance.
		 * add them to the preferred mask. We randomly select between nodes
		 * in the preferred mask when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			/* Stop at the first target that is farther than the best one. */
			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier,
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this memtier
			 * is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
						MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node collecting node mask from
	 * all memory tiers below it. This allows us to fall back demotion page
	 * allocation to a set of nodes that is closer to the above selected
	 * preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing current tier from lower_tier nodes,
		 * This will remove all nodes in current and above
		 * memory tier from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}

	dump_demotion_targets();
}
489 
490 #else
/* Stub used when CONFIG_MIGRATION is not set: there is nothing to establish. */
static inline void establish_demotion_targets(void) {}
492 #endif /* CONFIG_MIGRATION */
493 
494 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
495 {
496 	if (!node_memory_types[node].memtype)
497 		node_memory_types[node].memtype = memtype;
498 	/*
499 	 * for each device getting added in the same NUMA node
500 	 * with this specific memtype, bump the map count. We
501 	 * Only take memtype device reference once, so that
502 	 * changing a node memtype can be done by droping the
503 	 * only reference count taken here.
504 	 */
505 
506 	if (node_memory_types[node].memtype == memtype) {
507 		if (!node_memory_types[node].map_count++)
508 			kref_get(&memtype->kref);
509 	}
510 }
511 
512 static struct memory_tier *set_node_memory_tier(int node)
513 {
514 	struct memory_tier *memtier;
515 	struct memory_dev_type *memtype = default_dram_type;
516 	int adist = MEMTIER_ADISTANCE_DRAM;
517 	pg_data_t *pgdat = NODE_DATA(node);
518 
519 
520 	lockdep_assert_held_once(&memory_tier_lock);
521 
522 	if (!node_state(node, N_MEMORY))
523 		return ERR_PTR(-EINVAL);
524 
525 	mt_calc_adistance(node, &adist);
526 	if (!node_memory_types[node].memtype) {
527 		memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
528 		if (IS_ERR(memtype)) {
529 			memtype = default_dram_type;
530 			pr_info("Failed to allocate a memory type. Fall back.\n");
531 		}
532 	}
533 
534 	__init_node_memory_type(node, memtype);
535 
536 	memtype = node_memory_types[node].memtype;
537 	node_set(node, memtype->nodes);
538 	memtier = find_create_memory_tier(memtype);
539 	if (!IS_ERR(memtier))
540 		rcu_assign_pointer(pgdat->memtier, memtier);
541 	return memtier;
542 }
543 
/*
 * Unlink @memtier from the memory_tiers list and unregister its
 * device. The structure itself is freed by the device release
 * callback, memory_tier_device_release().
 */
static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}
549 
550 static bool clear_node_memory_tier(int node)
551 {
552 	bool cleared = false;
553 	pg_data_t *pgdat;
554 	struct memory_tier *memtier;
555 
556 	pgdat = NODE_DATA(node);
557 	if (!pgdat)
558 		return false;
559 
560 	/*
561 	 * Make sure that anybody looking at NODE_DATA who finds
562 	 * a valid memtier finds memory_dev_types with nodes still
563 	 * linked to the memtier. We achieve this by waiting for
564 	 * rcu read section to finish using synchronize_rcu.
565 	 * This also enables us to free the destroyed memory tier
566 	 * with kfree instead of kfree_rcu
567 	 */
568 	memtier = __node_get_memory_tier(node);
569 	if (memtier) {
570 		struct memory_dev_type *memtype;
571 
572 		rcu_assign_pointer(pgdat->memtier, NULL);
573 		synchronize_rcu();
574 		memtype = node_memory_types[node].memtype;
575 		node_clear(node, memtype->nodes);
576 		if (nodes_empty(memtype->nodes)) {
577 			list_del_init(&memtype->tier_sibling);
578 			if (list_empty(&memtier->memory_types))
579 				destroy_memory_tier(memtier);
580 		}
581 		cleared = true;
582 	}
583 	return cleared;
584 }
585 
586 static void release_memtype(struct kref *kref)
587 {
588 	struct memory_dev_type *memtype;
589 
590 	memtype = container_of(kref, struct memory_dev_type, kref);
591 	kfree(memtype);
592 }
593 
594 struct memory_dev_type *alloc_memory_type(int adistance)
595 {
596 	struct memory_dev_type *memtype;
597 
598 	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
599 	if (!memtype)
600 		return ERR_PTR(-ENOMEM);
601 
602 	memtype->adistance = adistance;
603 	INIT_LIST_HEAD(&memtype->tier_sibling);
604 	memtype->nodes  = NODE_MASK_NONE;
605 	kref_init(&memtype->kref);
606 	return memtype;
607 }
608 EXPORT_SYMBOL_GPL(alloc_memory_type);
609 
/**
 * put_memory_type() - Drop a reference to a memory type
 * @memtype: the memory type; dereferenced unconditionally, so it must
 *           not be NULL
 *
 * The memory type is freed by release_memtype() when the last
 * reference is dropped.
 */
void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);
615 
616 void init_node_memory_type(int node, struct memory_dev_type *memtype)
617 {
618 
619 	mutex_lock(&memory_tier_lock);
620 	__init_node_memory_type(node, memtype);
621 	mutex_unlock(&memory_tier_lock);
622 }
623 EXPORT_SYMBOL_GPL(init_node_memory_type);
624 
625 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
626 {
627 	mutex_lock(&memory_tier_lock);
628 	if (node_memory_types[node].memtype == memtype || !memtype)
629 		node_memory_types[node].map_count--;
630 	/*
631 	 * If we umapped all the attached devices to this node,
632 	 * clear the node memory type.
633 	 */
634 	if (!node_memory_types[node].map_count) {
635 		memtype = node_memory_types[node].memtype;
636 		node_memory_types[node].memtype = NULL;
637 		put_memory_type(memtype);
638 	}
639 	mutex_unlock(&memory_tier_lock);
640 }
641 EXPORT_SYMBOL_GPL(clear_node_memory_type);
642 
643 struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
644 {
645 	struct memory_dev_type *mtype;
646 
647 	list_for_each_entry(mtype, memory_types, list)
648 		if (mtype->adistance == adist)
649 			return mtype;
650 
651 	mtype = alloc_memory_type(adist);
652 	if (IS_ERR(mtype))
653 		return mtype;
654 
655 	list_add(&mtype->list, memory_types);
656 
657 	return mtype;
658 }
659 EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
660 
661 void mt_put_memory_types(struct list_head *memory_types)
662 {
663 	struct memory_dev_type *mtype, *mtn;
664 
665 	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
666 		list_del(&mtype->list);
667 		put_memory_type(mtype);
668 	}
669 }
670 EXPORT_SYMBOL_GPL(mt_put_memory_types);
671 
672 /*
673  * This is invoked via `late_initcall()` to initialize memory tiers for
674  * CPU-less memory nodes after driver initialization, which is
675  * expected to provide `adistance` algorithms.
676  */
677 static int __init memory_tier_late_init(void)
678 {
679 	int nid;
680 
681 	guard(mutex)(&memory_tier_lock);
682 	for_each_node_state(nid, N_MEMORY) {
683 		/*
684 		 * Some device drivers may have initialized memory tiers
685 		 * between `memory_tier_init()` and `memory_tier_late_init()`,
686 		 * potentially bringing online memory nodes and
687 		 * configuring memory tiers. Exclude them here.
688 		 */
689 		if (node_memory_types[nid].memtype)
690 			continue;
691 
692 		set_node_memory_tier(nid);
693 	}
694 
695 	establish_demotion_targets();
696 
697 	return 0;
698 }
699 late_initcall(memory_tier_late_init);
700 
/*
 * Log the read/write latency and read/write bandwidth of @coord on a
 * single pr_info() line. @prefix is printed in front of the values;
 * the caller in this file passes spaces for indentation.
 */
static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
	pr_info(
"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n",
		prefix, coord->read_latency, coord->write_latency,
		coord->read_bandwidth, coord->write_bandwidth);
}
708 
/**
 * mt_set_default_dram_perf() - Record or check the performance of a default DRAM node
 * @nid: the node @perf belongs to
 * @perf: performance (latency and bandwidth) of the node
 * @source: string stored, as a kstrdup() copy, together with the
 *          reference performance
 *
 * The first call with valid @perf records it as the reference. Every
 * later call compares @perf with the reference: if any of the four
 * values deviates by more than 10% of the reference value, the
 * mismatch is logged and default_dram_perf_error is set, which
 * disables the default DRAM performance based algorithm for good.
 *
 * Return: 0 on success; -EIO if a mismatch was detected by an earlier
 * call; -EINVAL if the latency sum or the bandwidth sum of @perf is
 * zero, or if @perf mismatches the reference.
 */
int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
			     const char *source)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
		default_dram_perf = *perf;
		default_dram_perf_ref_nid = nid;
		/*
		 * NOTE(review): the kstrdup() result is not checked. The
		 * pointer is only stored, never read, in this file.
		 */
		default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
		return 0;
	}

	/*
	 * The performance of all default DRAM nodes is expected to be
	 * same (that is, the variation is less than 10%).  And it
	 * will be used as base to calculate the abstract distance of
	 * other memory nodes.
	 */
	if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 >
	    default_dram_perf.read_latency ||
	    abs(perf->write_latency - default_dram_perf.write_latency) * 10 >
	    default_dram_perf.write_latency ||
	    abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 >
	    default_dram_perf.read_bandwidth ||
	    abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 >
	    default_dram_perf.write_bandwidth) {
		pr_info(
"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n"
"DRAM node %d.\n", nid, default_dram_perf_ref_nid);
		pr_info("  performance of reference DRAM node %d:\n",
			default_dram_perf_ref_nid);
		dump_hmem_attrs(&default_dram_perf, "    ");
		pr_info("  performance of DRAM node %d:\n", nid);
		dump_hmem_attrs(perf, "    ");
		pr_info(
"  disable default DRAM node performance based abstract distance algorithm.\n");
		default_dram_perf_error = true;
		return -EINVAL;
	}

	return 0;
}
757 
/**
 * mt_perf_to_adistance() - Convert node performance to an abstract distance
 * @perf: performance (latency and bandwidth) of the memory node
 * @adist: where the abstract distance is stored on success
 *
 * Return: 0 on success; -EIO if the default DRAM performance was found
 * inconsistent; -EINVAL if the latency sum or the bandwidth sum of
 * @perf is zero; -ENOENT if no reference DRAM performance has been
 * recorded yet. @adist is written only on success.
 */
int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
	guard(mutex)(&default_dram_perf_lock);
	if (default_dram_perf_error)
		return -EIO;

	if (perf->read_latency + perf->write_latency == 0 ||
	    perf->read_bandwidth + perf->write_bandwidth == 0)
		return -EINVAL;

	if (default_dram_perf_ref_nid == NUMA_NO_NODE)
		return -ENOENT;

	/*
	 * The abstract distance of a memory node is in direct proportion to
	 * its memory latency (read + write) and inversely proportional to its
	 * memory bandwidth (read + write).  The abstract distance, memory
	 * latency, and memory bandwidth of the default DRAM nodes are used as
	 * the base.
	 *
	 * The expression is evaluated left to right with integer
	 * arithmetic, so every division truncates before the next
	 * multiplication.
	 */
	*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

	return 0;
}
EXPORT_SYMBOL_GPL(mt_perf_to_adistance);
787 
/**
 * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm
 * @nb: The notifier block which describes the algorithm
 *
 * Return: 0 on success, errno on error.
 *
 * Every memory tiering abstract distance algorithm provider needs to
 * register the algorithm with register_mt_adistance_algorithm().  To
 * calculate the abstract distance for a specified memory node, the
 * notifier function will be called unless a higher priority
 * algorithm has already provided the result.  The prototype of the
 * notifier function is as follows,
 *
 *   int (*algorithm_notifier)(struct notifier_block *nb,
 *                             unsigned long nid, void *data);
 *
 * Where "nid" specifies the memory node, "data" is the pointer to the
 * returned abstract distance (that is, "int *adist").  If the
 * algorithm provides the result, NOTIFY_STOP should be returned.
 * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next
 * algorithm in the chain to provide the result.
 */
int register_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm);
815 
/**
 * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm
 * @nb: the notifier block which describes the algorithm
 *
 * Removes @nb from the mt_adistance_algorithms notifier chain.
 *
 * Return: 0 on success, errno on error.
 */
int unregister_mt_adistance_algorithm(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb);
}
EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm);
827 
/**
 * mt_calc_adistance() - Calculate abstract distance with registered algorithms
 * @node: the node to calculate abstract distance for
 * @adist: the returned abstract distance
 *
 * Runs the mt_adistance_algorithms notifier chain with @node as the
 * notifier value and @adist as the notifier data.
 *
 * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some
 * abstract distance algorithm provides the result, and return it via
 * @adist.  Otherwise, no algorithm can provide the result and @adist
 * will be kept as it is.
 */
int mt_calc_adistance(int node, int *adist)
{
	return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist);
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);
843 
844 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
845 					      unsigned long action, void *_arg)
846 {
847 	struct memory_tier *memtier;
848 	struct memory_notify *arg = _arg;
849 
850 	/*
851 	 * Only update the node migration order when a node is
852 	 * changing status, like online->offline.
853 	 */
854 	if (arg->status_change_nid < 0)
855 		return notifier_from_errno(0);
856 
857 	switch (action) {
858 	case MEM_OFFLINE:
859 		mutex_lock(&memory_tier_lock);
860 		if (clear_node_memory_tier(arg->status_change_nid))
861 			establish_demotion_targets();
862 		mutex_unlock(&memory_tier_lock);
863 		break;
864 	case MEM_ONLINE:
865 		mutex_lock(&memory_tier_lock);
866 		memtier = set_node_memory_tier(arg->status_change_nid);
867 		if (!IS_ERR(memtier))
868 			establish_demotion_targets();
869 		mutex_unlock(&memory_tier_lock);
870 		break;
871 	}
872 
873 	return notifier_from_errno(0);
874 }
875 
876 static int __init memory_tier_init(void)
877 {
878 	int ret, node;
879 	struct memory_tier *memtier;
880 
881 	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
882 	if (ret)
883 		panic("%s() failed to register memory tier subsystem\n", __func__);
884 
885 #ifdef CONFIG_MIGRATION
886 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
887 				GFP_KERNEL);
888 	WARN_ON(!node_demotion);
889 #endif
890 	mutex_lock(&memory_tier_lock);
891 	/*
892 	 * For now we can have 4 faster memory tiers with smaller adistance
893 	 * than default DRAM tier.
894 	 */
895 	default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
896 						      &default_memory_types);
897 	if (IS_ERR(default_dram_type))
898 		panic("%s() failed to allocate default DRAM tier\n", __func__);
899 
900 	/*
901 	 * Look at all the existing N_MEMORY nodes and add them to
902 	 * default memory tier or to a tier if we already have memory
903 	 * types assigned.
904 	 */
905 	for_each_node_state(node, N_MEMORY) {
906 		if (!node_state(node, N_CPU))
907 			/*
908 			 * Defer memory tier initialization on
909 			 * CPUless numa nodes. These will be initialized
910 			 * after firmware and devices are initialized.
911 			 */
912 			continue;
913 
914 		memtier = set_node_memory_tier(node);
915 		if (IS_ERR(memtier))
916 			/*
917 			 * Continue with memtiers we are able to setup
918 			 */
919 			break;
920 	}
921 	establish_demotion_targets();
922 	mutex_unlock(&memory_tier_lock);
923 
924 	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
925 	return 0;
926 }
927 subsys_initcall(memory_tier_init);
928 
/*
 * Off by default. With CONFIG_MIGRATION and CONFIG_SYSFS the value is
 * shown and changed through the "demotion_enabled" attribute below.
 */
bool numa_demotion_enabled = false;
930 
931 #ifdef CONFIG_MIGRATION
932 #ifdef CONFIG_SYSFS
933 static ssize_t demotion_enabled_show(struct kobject *kobj,
934 				     struct kobj_attribute *attr, char *buf)
935 {
936 	return sysfs_emit(buf, "%s\n",
937 			  numa_demotion_enabled ? "true" : "false");
938 }
939 
940 static ssize_t demotion_enabled_store(struct kobject *kobj,
941 				      struct kobj_attribute *attr,
942 				      const char *buf, size_t count)
943 {
944 	ssize_t ret;
945 
946 	ret = kstrtobool(buf, &numa_demotion_enabled);
947 	if (ret)
948 		return ret;
949 
950 	return count;
951 }
952 
/* Read-write "demotion_enabled" attribute backed by the two handlers above. */
static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

/* Attributes of the "numa" kobject; NULL terminated. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

/* Unnamed group installed on the "numa" kobject by numa_init_sysfs(). */
static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
964 
965 static int __init numa_init_sysfs(void)
966 {
967 	int err;
968 	struct kobject *numa_kobj;
969 
970 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
971 	if (!numa_kobj) {
972 		pr_err("failed to create numa kobject\n");
973 		return -ENOMEM;
974 	}
975 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
976 	if (err) {
977 		pr_err("failed to register numa group\n");
978 		goto delete_obj;
979 	}
980 	return 0;
981 
982 delete_obj:
983 	kobject_put(numa_kobj);
984 	return err;
985 }
986 subsys_initcall(numa_init_sysfs);
987 #endif /* CONFIG_SYSFS */
988 #endif
989