xref: /linux/drivers/resctrl/mpam_resctrl.c (revision e2683c8868d03382da7e1ce8453b543a043066d1)
1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (C) 2025 Arm Ltd.
3 
4 #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
5 
6 #include <linux/arm_mpam.h>
7 #include <linux/cacheinfo.h>
8 #include <linux/cpu.h>
9 #include <linux/cpumask.h>
10 #include <linux/errno.h>
11 #include <linux/limits.h>
12 #include <linux/list.h>
13 #include <linux/math.h>
14 #include <linux/printk.h>
15 #include <linux/rculist.h>
16 #include <linux/resctrl.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/wait.h>
20 
21 #include <asm/mpam.h>
22 
23 #include "mpam_internal.h"
24 
25 static DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters);
26 
27 /*
28  * The classes we've picked to map to resctrl resources, wrapped
29  * up with their resctrl structure.
30  * Class pointer may be NULL.
31  */
32 static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
33 
34 #define for_each_mpam_resctrl_control(res, rid)					\
35 	for (rid = 0, res = &mpam_resctrl_controls[rid];			\
36 	     rid < RDT_NUM_RESOURCES;						\
37 	     rid++, res = &mpam_resctrl_controls[rid])
38 
39 /*
40  * The classes we've picked to map to resctrl events.
41  * Resctrl believes all the world's a Xeon, and that these are all on the L3.
42  * This array lets us find the actual class backing the event counters, e.g.
43  * the only memory bandwidth counters may be on the memory controller, but to
44  * make use of them, we pretend they are on the L3. Restrict the events considered
45  * to those supported by MPAM.
46  * Class pointer may be NULL.
47  */
48 #define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
49 static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1];
50 
51 #define for_each_mpam_resctrl_mon(mon, eventid)					\
52 	for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid];	\
53 	     eventid <= MPAM_MAX_EVENT;						\
54 	     eventid++, mon = &mpam_resctrl_counters[eventid])
55 
56 /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
57 static DEFINE_MUTEX(domain_list_lock);
58 
59 /*
60  * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1.
61  * This applies globally to all traffic the CPU generates.
62  */
63 static bool cdp_enabled;
64 
65 /*
66  * We use cacheinfo to discover the size of the caches and their id. cacheinfo
67  * populates this from a device_initcall(). mpam_resctrl_setup() must wait.
68  */
69 static bool cacheinfo_ready;
70 static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready);
71 
72 /*
73  * If resctrl_init() succeeded, resctrl_exit() can be used to remove support
74  * for the filesystem in the event of an error.
75  */
76 static bool resctrl_enabled;
77 
78 bool resctrl_arch_alloc_capable(void)
79 {
80 	struct mpam_resctrl_res *res;
81 	enum resctrl_res_level rid;
82 
83 	for_each_mpam_resctrl_control(res, rid) {
84 		if (res->resctrl_res.alloc_capable)
85 			return true;
86 	}
87 
88 	return false;
89 }
90 
91 bool resctrl_arch_mon_capable(void)
92 {
93 	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
94 	struct rdt_resource *l3 = &res->resctrl_res;
95 
96 	/* All monitors are presented as being on the L3 cache */
97 	return l3->mon_capable;
98 }
99 
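/*
 * resctrl's event configuration, 'ABMC' style counter assignment and io_alloc
 * interfaces are not supported by this driver. The stubs below report the
 * features as absent, return an error, or have nothing to do.
 */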
100 bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
101 {
102 	return false;
103 }
104 
105 void resctrl_arch_mon_event_config_read(void *info)
106 {
107 }
108 
109 void resctrl_arch_mon_event_config_write(void *info)
110 {
111 }
112 
113 void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
114 {
115 }
116 
117 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
118 			     u32 closid, u32 rmid, enum resctrl_event_id eventid)
119 {
120 }
121 
122 void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
123 			     u32 closid, u32 rmid, int cntr_id,
124 			     enum resctrl_event_id eventid)
125 {
126 }
127 
128 void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
129 			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
130 			      u32 cntr_id, bool assign)
131 {
132 }
133 
134 int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
135 			   u32 unused, u32 rmid, int cntr_id,
136 			   enum resctrl_event_id eventid, u64 *val)
137 {
138 	return -EOPNOTSUPP;
139 }
140 
141 bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
142 {
143 	return false;
144 }
145 
146 int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
147 {
148 	return -EINVAL;
149 }
150 
151 int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
152 {
153 	return -EOPNOTSUPP;
154 }
155 
156 bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
157 {
158 	return false;
159 }
160 
161 void resctrl_arch_pre_mount(void)
162 {
163 }
164 
165 bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
166 {
167 	return mpam_resctrl_controls[rid].cdp_enabled;
168 }
169 
170 /**
171  * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks.
172  *
173  * At boot, all existing tasks use partid zero for D and I.
174  * To enable/disable CDP emulation, all these tasks need relabelling.
175  */
176 static void resctrl_reset_task_closids(void)
177 {
178 	struct task_struct *p, *t;
179 
180 	read_lock(&tasklist_lock);
181 	for_each_process_thread(p, t) {
182 		resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID,
183 					     RESCTRL_RESERVED_RMID);
184 	}
185 	read_unlock(&tasklist_lock);
186 }
187 
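/*
 * Switching CDP emulation on or off changes how resctrl's closid space maps
 * to PARTIDs: the number of RMIDs advertised on L3 is halved when enabling,
 * MBA is hidden unless resctrl also enabled CDP on it, and all tasks and CPU
 * defaults are reset to the reserved closid/rmid before an IPI resyncs the
 * CPUs.
 */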
188 int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
189 {
190 	u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID;
191 	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
192 	struct rdt_resource *l3 = &res->resctrl_res;
193 	int cpu;
194 
195 	if (!IS_ENABLED(CONFIG_EXPERT) && enable) {
196 		/*
197 		 * If the resctrl fs is mounted more than once, sequentially,
198 		 * then CDP can lead to the use of out of range PARTIDs.
199 		 */
200 		pr_warn("CDP not supported\n");
201 		return -EOPNOTSUPP;
202 	}
203 
204 	if (enable)
205 		pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n");
206 
207 	/*
208 	 * resctrl_arch_set_cdp_enabled() is only called with enable set to
209 	 * false on error and unmount.
210 	 */
211 	cdp_enabled = enable;
212 	mpam_resctrl_controls[rid].cdp_enabled = enable;
213 
214 	if (enable)
215 		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2;
216 	else
217 		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
218 
219 	/* The mbw_max feature can't hide cdp as it's a per-partid maximum. */
220 	if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled)
221 		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false;
222 
223 	/*
224 	 * If resctrl has attempted to enable CDP on MBA, re-enable MBA as two
225 	 * configurations will be provided so there is no aliasing problem.
226 	 */
227 	if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled &&
228 	    mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
229 		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;
230 
231 	/* On unmount when CDP is disabled, re-enable MBA */
232 	if (!cdp_enabled && mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
233 		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;
234 
235 	if (enable) {
236 		if (mpam_partid_max < 1)
237 			return -EINVAL;
238 
239 		partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA);
240 		partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE);
241 	}
242 
243 	mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0);
244 	WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current));
245 
246 	resctrl_reset_task_closids();
247 
248 	for_each_possible_cpu(cpu)
249 		mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0);
250 	on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1);
251 
252 	return 0;
253 }
254 
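/*
 * True when CDP emulation is enabled globally but resctrl did not enable CDP
 * on this resource. The resource's single configuration is then cloned
 * across, and read back from, both the code and data PARTIDs.
 */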
255 static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid)
256 {
257 	return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid);
258 }
259 
260 /*
261  * An MSC may raise an error interrupt if it sees an out of range partid/pmg,
262  * and go on to truncate the value. Regardless of what the hardware supports,
263  * only the system-wide safe value can be used.
264  */
265 u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored)
266 {
267 	return mpam_partid_max + 1;
268 }
269 
270 u32 resctrl_arch_system_num_rmid_idx(void)
271 {
272 	return (mpam_pmg_max + 1) * (mpam_partid_max + 1);
273 }
274 
275 u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
276 {
277 	return closid * (mpam_pmg_max + 1) + rmid;
278 }
279 
280 void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
281 {
282 	*closid = idx / (mpam_pmg_max + 1);
283 	*rmid = idx % (mpam_pmg_max + 1);
284 }
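
/*
 * Example of the encode/decode arithmetic above, assuming a hypothetical
 * system where mpam_pmg_max is 3 (four PMG values per PARTID): closid 2,
 * rmid 1 encodes as 2 * 4 + 1 = 9, and decoding 9 gives back
 * closid = 9 / 4 = 2, rmid = 9 % 4 = 1.
 */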
285 
286 void resctrl_arch_sched_in(struct task_struct *tsk)
287 {
288 	lockdep_assert_preemption_disabled();
289 
290 	mpam_thread_switch(tsk);
291 }
292 
293 void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
294 {
295 	WARN_ON_ONCE(closid > U16_MAX);
296 	WARN_ON_ONCE(rmid > U8_MAX);
297 
298 	if (!cdp_enabled) {
299 		mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid);
300 	} else {
301 		/*
302 		 * When CDP is enabled, resctrl halves the closid range and we
303 		 * use odd/even partid for one closid.
304 		 */
305 		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
306 		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
307 
308 		mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid);
309 	}
310 }
311 
312 void resctrl_arch_sync_cpu_closid_rmid(void *info)
313 {
314 	struct resctrl_cpu_defaults *r = info;
315 
316 	lockdep_assert_preemption_disabled();
317 
318 	if (r) {
319 		resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
320 							 r->closid, r->rmid);
321 	}
322 
323 	resctrl_arch_sched_in(current);
324 }
325 
326 void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
327 {
328 	WARN_ON_ONCE(closid > U16_MAX);
329 	WARN_ON_ONCE(rmid > U8_MAX);
330 
331 	if (!cdp_enabled) {
332 		mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid);
333 	} else {
334 		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
335 		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
336 
337 		mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid);
338 	}
339 }
340 
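/*
 * With CDP emulation the code and data PARTIDs for a closid differ only in
 * the bottom bit (see resctrl_get_config_index()), so shifting the task's
 * PARTID right by one recovers the closid resctrl set it from.
 */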
341 bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
342 {
343 	u64 regval = mpam_get_regval(tsk);
344 	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
345 
346 	if (cdp_enabled)
347 		tsk_closid >>= 1;
348 
349 	return tsk_closid == closid;
350 }
351 
352 /* The task's pmg is not unique; the partid must be considered too */
353 bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
354 {
355 	u64 regval = mpam_get_regval(tsk);
356 	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
357 	u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval);
358 
359 	if (cdp_enabled)
360 		tsk_closid >>= 1;
361 
362 	return (tsk_closid == closid) && (tsk_rmid == rmid);
363 }
364 
365 struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
366 {
367 	if (l >= RDT_NUM_RESOURCES)
368 		return NULL;
369 
370 	return &mpam_resctrl_controls[l].resctrl_res;
371 }
372 
373 static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid)
374 {
375 	struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
376 
377 	if (!mpam_is_enabled())
378 		return -EINVAL;
379 
380 	if (!mon->class)
381 		return -EINVAL;
382 
383 	switch (evtid) {
384 	case QOS_L3_OCCUP_EVENT_ID:
385 		/* With CDP, one monitor gets used for both code/data reads */
386 		return mpam_alloc_csu_mon(mon->class);
387 	case QOS_L3_MBM_LOCAL_EVENT_ID:
388 	case QOS_L3_MBM_TOTAL_EVENT_ID:
389 		return USE_PRE_ALLOCATED;
390 	default:
391 		return -EOPNOTSUPP;
392 	}
393 }
394 
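/*
 * Allocating the monitor may fail with -ENOSPC if all the hardware monitors
 * are in use. Sleep on resctrl_mon_ctx_waiters until one is freed by
 * resctrl_arch_mon_ctx_free(), or a signal is pending. The heap allocated
 * return value holds the monitor index, or an error code.
 */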
395 void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
396 				 enum resctrl_event_id evtid)
397 {
398 	DEFINE_WAIT(wait);
399 	int *ret;
400 
401 	ret = kmalloc_obj(*ret);
402 	if (!ret)
403 		return ERR_PTR(-ENOMEM);
404 
405 	do {
406 		prepare_to_wait(&resctrl_mon_ctx_waiters, &wait,
407 				TASK_INTERRUPTIBLE);
408 		*ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid);
409 		if (*ret == -ENOSPC)
410 			schedule();
411 	} while (*ret == -ENOSPC && !signal_pending(current));
412 	finish_wait(&resctrl_mon_ctx_waiters, &wait);
413 
414 	return ret;
415 }
416 
417 static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid,
418 					      u32 mon_idx)
419 {
420 	struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
421 
422 	if (!mpam_is_enabled())
423 		return;
424 
425 	if (!mon->class)
426 		return;
427 
428 	if (evtid == QOS_L3_OCCUP_EVENT_ID)
429 		mpam_free_csu_mon(mon->class, mon_idx);
430 
431 	wake_up(&resctrl_mon_ctx_waiters);
432 }
433 
434 void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
435 			       enum resctrl_event_id evtid, void *arch_mon_ctx)
436 {
437 	u32 mon_idx = *(u32 *)arch_mon_ctx;
438 
439 	kfree(arch_mon_ctx);
440 
441 	resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx);
442 }
443 
444 static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
445 		      enum mpam_device_features mon_type,
446 		      int mon_idx,
447 		      enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val)
448 {
449 	struct mon_cfg cfg;
450 
451 	if (!mpam_is_enabled())
452 		return -EINVAL;
453 
454 	/* Shift closid to account for CDP */
455 	closid = resctrl_get_config_index(closid, cdp_type);
456 
457 	if (irqs_disabled()) {
458 		/* Check if we can access this domain without an IPI */
459 		return -EIO;
460 	}
461 
462 	cfg = (struct mon_cfg) {
463 		.mon = mon_idx,
464 		.match_pmg = true,
465 		.partid = closid,
466 		.pmg = rmid,
467 	};
468 
469 	return mpam_msmon_read(mon_comp, &cfg, mon_type, val);
470 }
471 
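/*
 * With CDP emulated, the code and data accesses for one closid are tagged
 * with different PARTIDs, so a single monitor configuration only matches
 * half the traffic. Read both PARTIDs and sum the result.
 */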
472 static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
473 			     enum mpam_device_features mon_type,
474 			     int mon_idx, u32 closid, u32 rmid, u64 *val)
475 {
476 	if (cdp_enabled) {
477 		u64 code_val = 0, data_val = 0;
478 		int err;
479 
480 		err = __read_mon(mon, mon_comp, mon_type, mon_idx,
481 				 CDP_CODE, closid, rmid, &code_val);
482 		if (err)
483 			return err;
484 
485 		err = __read_mon(mon, mon_comp, mon_type, mon_idx,
486 				 CDP_DATA, closid, rmid, &data_val);
487 		if (err)
488 			return err;
489 
490 		*val += code_val + data_val;
491 		return 0;
492 	}
493 
494 	return __read_mon(mon, mon_comp, mon_type, mon_idx,
495 			  CDP_NONE, closid, rmid, val);
496 }
497 
498 /* MBWU when not in ABMC mode (not supported), and CSU counters. */
499 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
500 			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
501 			   void *arch_priv, u64 *val, void *arch_mon_ctx)
502 {
503 	struct mpam_resctrl_dom *l3_dom;
504 	struct mpam_component *mon_comp;
505 	u32 mon_idx = *(u32 *)arch_mon_ctx;
506 	enum mpam_device_features mon_type;
507 	struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid];
508 
509 	resctrl_arch_rmid_read_context_check();
510 
511 	if (!mpam_is_enabled())
512 		return -EINVAL;
513 
514 	if (eventid >= QOS_NUM_EVENTS || !mon->class)
515 		return -EINVAL;
516 
517 	l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr);
518 	mon_comp = l3_dom->mon_comp[eventid];
519 
520 	if (eventid != QOS_L3_OCCUP_EVENT_ID)
521 		return -EINVAL;
522 
523 	mon_type = mpam_feat_msmon_csu;
524 
525 	return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx,
526 				 closid, rmid, val);
527 }
528 
529 /*
530  * The rmid realloc threshold should be for the smallest cache exposed to
531  * resctrl.
532  */
533 static int update_rmid_limits(struct mpam_class *class)
534 {
535 	u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx();
536 	struct mpam_props *cprops = &class->props;
537 	struct cacheinfo *ci;
538 
539 	lockdep_assert_cpus_held();
540 
541 	if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
542 		return 0;
543 
544 	/*
545 	 * Assume cache levels are the same size for all CPUs...
546 	 * The check just requires any online CPU and it can't go offline as we
547 	 * hold the cpu lock.
548 	 */
549 	ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level);
550 	if (!ci || ci->size == 0) {
551 		pr_debug("Could not read cache size for class %u\n",
552 			 class->level);
553 		return -EINVAL;
554 	}
555 
556 	if (!resctrl_rmid_realloc_limit ||
557 	    ci->size < resctrl_rmid_realloc_limit) {
558 		resctrl_rmid_realloc_limit = ci->size;
559 		resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg;
560 	}
561 
562 	return 0;
563 }
564 
565 static bool cache_has_usable_cpor(struct mpam_class *class)
566 {
567 	struct mpam_props *cprops = &class->props;
568 
569 	if (!mpam_has_feature(mpam_feat_cpor_part, cprops))
570 		return false;
571 
572 	/* resctrl uses u32 for all bitmap configurations */
573 	return class->props.cpbm_wd <= 32;
574 }
575 
576 static bool mba_class_use_mbw_max(struct mpam_props *cprops)
577 {
578 	return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
579 		cprops->bwa_wd);
580 }
581 
582 static bool class_has_usable_mba(struct mpam_props *cprops)
583 {
584 	return mba_class_use_mbw_max(cprops);
585 }
586 
587 static bool cache_has_usable_csu(struct mpam_class *class)
588 {
589 	struct mpam_props *cprops;
590 
591 	if (!class)
592 		return false;
593 
594 	cprops = &class->props;
595 
596 	if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
597 		return false;
598 
599 	/*
600 	 * CSU counters settle on the value, so we can get away with
601 	 * having only one.
602 	 */
603 	if (!cprops->num_csu_mon)
604 		return false;
605 
606 	return true;
607 }
608 
609 /*
610  * Calculate the worst-case percentage change from each implemented step
611  * in the control.
612  */
613 static u32 get_mba_granularity(struct mpam_props *cprops)
614 {
615 	if (!mba_class_use_mbw_max(cprops))
616 		return 0;
617 
618 	/*
619 	 * bwa_wd is the number of bits implemented in the 0.xxx
620 	 * fixed point fraction. 1 bit is 50%, 2 is 25% etc.
621 	 */
622 	return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
623 }
624 
625 /*
626  * Each fixed-point hardware value architecturally represents a range
627  * of values: the full range 0% - 100% is split contiguously into
628  * (1 << cprops->bwa_wd) equal bands.
629  *
630  * Although the bwa_wd field is 6 bits wide, the maximum valid value is 16,
631  * as it reports the width of fields that are at most 16 bits. When fewer
632  * than 16 bits are valid, the least significant bits are ignored. The
633  * implied binary point is kept between bits 15 and 16, so the valid bits
634  * are the leftmost ones.
635  *
636  * See ARM IHI0099B.a "MPAM system component specification", Section 9.3,
637  * "The fixed-point fractional format" for more information.
638  *
639  * Find the nearest percentage value to the upper bound of the selected band:
640  */
641 static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops)
642 {
643 	u32 val = mbw_max;
644 
645 	val >>= 16 - cprops->bwa_wd;
646 	val += 1;
647 	val *= MAX_MBA_BW;
648 	val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd);
649 
650 	return val;
651 }
652 
653 /*
654  * Find the band whose upper bound is closest to the specified percentage.
655  *
656  * A round-to-nearest policy is followed here as a balanced compromise
657  * between unexpected under-commit of the resource (where the total of
658  * a set of resource allocations after conversion is less than the
659  * expected total, due to rounding of the individual converted
660  * percentages) and over-commit (where the total of the converted
661  * allocations is greater than expected).
662  */
663 static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
664 {
665 	u32 val = pc;
666 
667 	val <<= cprops->bwa_wd;
668 	val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW);
669 	val = max(val, 1) - 1;
670 	val <<= 16 - cprops->bwa_wd;
671 
672 	return val;
673 }
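
/*
 * Worked example of the conversions above for a hypothetical control with
 * bwa_wd = 2 (four bands) and MAX_MBA_BW of 100:
 * percent_to_mbw_max(50) computes DIV_ROUND_CLOSEST(50 << 2, 100) - 1 = 1,
 * shifted left by 16 - 2 = 14 to give the hardware value 0x4000.
 * mbw_max_to_percent(0x4000) computes ((0x4000 >> 14) + 1) * 100 / 4 = 50.
 */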
674 
675 static u32 get_mba_min(struct mpam_props *cprops)
676 {
677 	if (!mba_class_use_mbw_max(cprops)) {
678 		WARN_ON_ONCE(1);
679 		return 0;
680 	}
681 
682 	return mbw_max_to_percent(0, cprops);
683 }
684 
685 /* Find the L3 cache that has affinity with this CPU */
686 static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
687 {
688 	u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
689 
690 	lockdep_assert_cpus_held();
691 
692 	return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
693 }
694 
695 /*
696  * topology_matches_l3() - Is the provided class the same shape as L3
697  * @victim:		The class we'd like to pretend is L3.
698  *
699  * resctrl expects all the world's a Xeon, where all counters are on the
700  * L3. We allow mapping some counters onto other classes. This requires
701  * that the CPU->domain mapping has the same kind of shape.
702  *
703  * Using cacheinfo directly would make this work even if resctrl can't
704  * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
705  * Using the L3 resctrl domain list also depends on CPUs being online.
706  * Using the mpam_class we picked for L3 so we can use its domain list
707  * assumes that there are MPAM controls on the L3.
708  * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
709  * helper which can tell us about offline CPUs ... but getting the cache_id
710  * to start with relies on at least one CPU per L3 cache being online at
711  * boot.
712  *
713  * Walk the victim component list and compare the affinity mask with the
714  * corresponding L3. The topology matches if each victim:component's affinity
715  * mask is the same as the CPU's corresponding L3's. These lists/masks are
716  * computed from firmware tables so don't change at runtime.
717  */
718 static bool topology_matches_l3(struct mpam_class *victim)
719 {
720 	int cpu, err;
721 	struct mpam_component *victim_iter;
722 
723 	lockdep_assert_cpus_held();
724 
725 	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
726 	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
727 		return false;
728 
729 	guard(srcu)(&mpam_srcu);
730 	list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
731 				 srcu_read_lock_held(&mpam_srcu)) {
732 		if (cpumask_empty(&victim_iter->affinity)) {
733 			pr_debug("class %u has CPU-less component %u - can't match L3!\n",
734 				 victim->level, victim_iter->comp_id);
735 			return false;
736 		}
737 
738 		cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
739 		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
740 			return false;
741 
742 		cpumask_clear(tmp_cpumask);
743 		err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
744 		if (err) {
745 			pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
746 				 victim->level, victim_iter->comp_id);
747 			return false;
748 		}
749 
750 		/* Any differing bits in the affinity mask? */
751 		if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
752 			pr_debug("class %u component %u has mismatched CPU mask with L3 equivalent\n"
753 				 "L3:%*pbl != victim:%*pbl\n",
754 				 victim->level, victim_iter->comp_id,
755 				 cpumask_pr_args(tmp_cpumask),
756 				 cpumask_pr_args(&victim_iter->affinity));
757 
758 			return false;
759 		}
760 	}
761 
762 	return true;
763 }
764 
765 /*
766  * Test if the traffic for a class matches that at egress from the L3. For
767  * MSCs at memory controllers this is only possible if there is a single L3,
768  * as otherwise the counters at the memory controller can include bandwidth from the
769  * non-local L3.
770  */
771 static bool traffic_matches_l3(struct mpam_class *class)
772 {
773 	int err, cpu;
774 
775 	lockdep_assert_cpus_held();
776 
777 	if (class->type == MPAM_CLASS_CACHE && class->level == 3)
778 		return true;
779 
780 	if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
781 		pr_debug("class %u is a different cache from L3\n", class->level);
782 		return false;
783 	}
784 
785 	if (class->type != MPAM_CLASS_MEMORY) {
786 		pr_debug("class %u is neither a cache nor a memory class\n", class->level);
787 		return false;
788 	}
789 
790 	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
791 	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
792 		pr_debug("cpumask allocation failed\n");
793 		return false;
794 	}
795 
796 	cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
797 	err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
798 	if (err) {
799 		pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
800 		return false;
801 	}
802 
803 	if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
804 		pr_debug("There is more than one L3\n");
805 		return false;
806 	}
807 
808 	/* Be strict; the traffic might stop in an intermediate cache. */
809 	if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
810 		pr_debug("L3 isn't the last level of cache\n");
811 		return false;
812 	}
813 
814 	if (num_possible_nodes() > 1) {
815 		pr_debug("There is more than one numa node\n");
816 		return false;
817 	}
818 
819 #ifdef CONFIG_HMEM_REPORTING
820 	if (node_devices[cpu_to_node(cpu)]->cache_dev) {
821 		pr_debug("There is a memory side cache\n");
822 		return false;
823 	}
824 #endif
825 
826 	return true;
827 }
828 
829 /* Test whether we can export MPAM_CLASS_CACHE:{2,3} */
830 static void mpam_resctrl_pick_caches(void)
831 {
832 	struct mpam_class *class;
833 	struct mpam_resctrl_res *res;
834 
835 	lockdep_assert_cpus_held();
836 
837 	guard(srcu)(&mpam_srcu);
838 	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
839 				 srcu_read_lock_held(&mpam_srcu)) {
840 		if (class->type != MPAM_CLASS_CACHE) {
841 			pr_debug("class %u is not a cache\n", class->level);
842 			continue;
843 		}
844 
845 		if (class->level != 2 && class->level != 3) {
846 			pr_debug("class %u is not L2 or L3\n", class->level);
847 			continue;
848 		}
849 
850 		if (!cache_has_usable_cpor(class)) {
851 			pr_debug("class %u cache misses CPOR\n", class->level);
852 			continue;
853 		}
854 
855 		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
856 			pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level,
857 				 cpumask_pr_args(&class->affinity),
858 				 cpumask_pr_args(cpu_possible_mask));
859 			continue;
860 		}
861 
862 		if (class->level == 2)
863 			res = &mpam_resctrl_controls[RDT_RESOURCE_L2];
864 		else
865 			res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
866 		res->class = class;
867 	}
868 }
869 
870 static void mpam_resctrl_pick_mba(void)
871 {
872 	struct mpam_class *class, *candidate_class = NULL;
873 	struct mpam_resctrl_res *res;
874 
875 	lockdep_assert_cpus_held();
876 
877 	guard(srcu)(&mpam_srcu);
878 	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
879 				 srcu_read_lock_held(&mpam_srcu)) {
880 		struct mpam_props *cprops = &class->props;
881 
882 		if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
883 			pr_debug("class %u is a cache but not the L3\n", class->level);
884 			continue;
885 		}
886 
887 		if (!class_has_usable_mba(cprops)) {
888 			pr_debug("class %u has no bandwidth control\n",
889 				 class->level);
890 			continue;
891 		}
892 
893 		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
894 			pr_debug("class %u has missing CPUs\n", class->level);
895 			continue;
896 		}
897 
898 		if (!topology_matches_l3(class)) {
899 			pr_debug("class %u topology doesn't match L3\n",
900 				 class->level);
901 			continue;
902 		}
903 
904 		if (!traffic_matches_l3(class)) {
905 			pr_debug("class %u traffic doesn't match L3 egress\n",
906 				 class->level);
907 			continue;
908 		}
909 
910 		/*
911 		 * Pick a class to back MBA that is as close as possible to
912 		 * the L3. mbm_total counts the bandwidth leaving the L3
913 		 * cache, and MBA should correspond to it as closely as
914 		 * possible for proper operation of mba_sc.
915 		 */
916 		if (!candidate_class || class->level < candidate_class->level)
917 			candidate_class = class;
918 	}
919 
920 	if (candidate_class) {
921 		pr_debug("selected class %u to back MBA\n",
922 			 candidate_class->level);
923 		res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
924 		res->class = candidate_class;
925 	}
926 }
927 
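/*
 * Record the class that backs an event. An existing L3 class is never
 * replaced; otherwise prefer whichever class is closer to the L3 (the lower
 * level number).
 */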
928 static void counter_update_class(enum resctrl_event_id evt_id,
929 				 struct mpam_class *class)
930 {
931 	struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
932 
933 	if (existing_class) {
934 		if (existing_class->level == 3) {
935 			pr_debug("Existing class is L3 - L3 wins\n");
936 			return;
937 		}
938 
939 		if (existing_class->level < class->level) {
940 			pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
941 				 existing_class->level, class->level);
942 			return;
943 		}
944 	}
945 
946 	mpam_resctrl_counters[evt_id].class = class;
947 }
948 
949 static void mpam_resctrl_pick_counters(void)
950 {
951 	struct mpam_class *class;
952 
953 	lockdep_assert_cpus_held();
954 
955 	guard(srcu)(&mpam_srcu);
956 	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
957 				 srcu_read_lock_held(&mpam_srcu)) {
958 		/* The name of the resource is L3... */
959 		if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
960 			pr_debug("class %u is a cache but not the L3\n", class->level);
961 			continue;
962 		}
963 
964 		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
965 			pr_debug("class %u does not cover all CPUs\n",
966 				 class->level);
967 			continue;
968 		}
969 
970 		if (cache_has_usable_csu(class)) {
971 			pr_debug("class %u has usable CSU\n",
972 				 class->level);
973 
974 			/* CSU counters only make sense on a cache. */
975 			switch (class->type) {
976 			case MPAM_CLASS_CACHE:
977 				if (update_rmid_limits(class))
978 					break;
979 
980 				counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
981 				break;
982 			default:
983 				break;
984 			}
985 		}
986 	}
987 }
988 
989 static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
990 {
991 	struct mpam_class *class = res->class;
992 	struct mpam_props *cprops = &class->props;
993 	struct rdt_resource *r = &res->resctrl_res;
994 
995 	switch (r->rid) {
996 	case RDT_RESOURCE_L2:
997 	case RDT_RESOURCE_L3:
998 		r->schema_fmt = RESCTRL_SCHEMA_BITMAP;
999 		r->cache.arch_has_sparse_bitmasks = true;
1000 
1001 		r->cache.cbm_len = class->props.cpbm_wd;
1002 		/* mpam_devices will reject empty bitmaps */
1003 		r->cache.min_cbm_bits = 1;
1004 
1005 		if (r->rid == RDT_RESOURCE_L2) {
1006 			r->name = "L2";
1007 			r->ctrl_scope = RESCTRL_L2_CACHE;
1008 			r->cdp_capable = true;
1009 		} else {
1010 			r->name = "L3";
1011 			r->ctrl_scope = RESCTRL_L3_CACHE;
1012 			r->cdp_capable = true;
1013 		}
1014 
1015 		/*
1016 		 * Which bits are shared with other agents? Unknown devices use
1017 		 * partid-0, which uses all the bitmap fields. Until the SMMU and
1018 		 * GIC have been configured not to do this, 'all the bits' is the
1019 		 * correct answer here.
1020 		 */
1021 		r->cache.shareable_bits = resctrl_get_default_ctrl(r);
1022 		r->alloc_capable = true;
1023 		break;
1024 	case RDT_RESOURCE_MBA:
1025 		r->schema_fmt = RESCTRL_SCHEMA_RANGE;
1026 		r->ctrl_scope = RESCTRL_L3_CACHE;
1027 
1028 		r->membw.delay_linear = true;
1029 		r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
1030 		r->membw.min_bw = get_mba_min(cprops);
1031 		r->membw.max_bw = MAX_MBA_BW;
1032 		r->membw.bw_gran = get_mba_granularity(cprops);
1033 
1034 		r->name = "MB";
1035 		r->alloc_capable = true;
1036 		break;
1037 	default:
1038 		return -EINVAL;
1039 	}
1040 
1041 	return 0;
1042 }
1043 
1044 static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
1045 {
1046 	struct mpam_class *class = comp->class;
1047 
1048 	if (class->type == MPAM_CLASS_CACHE)
1049 		return comp->comp_id;
1050 
1051 	if (topology_matches_l3(class)) {
1052 		/* Use the corresponding L3 component ID as the domain ID */
1053 		int id = get_cpu_cacheinfo_id(cpu, 3);
1054 
1055 		/* Implies topology_matches_l3() made a mistake */
1056 		if (WARN_ON_ONCE(id == -1))
1057 			return comp->comp_id;
1058 
1059 		return id;
1060 	}
1061 
1062 	/* Otherwise, expose the ID used by the firmware table code. */
1063 	return comp->comp_id;
1064 }
1065 
1066 static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
1067 				     enum resctrl_event_id type)
1068 {
1069 	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
1070 	struct rdt_resource *l3 = &res->resctrl_res;
1071 
1072 	lockdep_assert_cpus_held();
1073 
1074 	/*
1075 	 * There also needs to be an L3 cache present.
1076 	 * The check just requires any online CPU and it can't go offline as we
1077 	 * hold the cpu lock.
1078 	 */
1079 	if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1)
1080 		return 0;
1081 
1082 	/*
1083 	 * If there are no MPAM resources on L3, force it into existence.
1084 	 * topology_matches_l3() already ensures this looks like the L3.
1085 	 * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
1086 	 */
1087 	if (!res->class) {
1088 		pr_warn_once("Faking L3 MSC to enable counters.\n");
1089 		res->class = mpam_resctrl_counters[type].class;
1090 	}
1091 
1092 	/*
1093 	 * Called multiple times, once per event type that has a
1094 	 * monitoring class.
1095 	 * Setting the name is necessary on monitor-only platforms.
1096 	 */
1097 	l3->name = "L3";
1098 	l3->mon_scope = RESCTRL_L3_CACHE;
1099 
1100 	/*
1101 	 * num-rmid is the upper bound for the number of monitoring groups that
1102 	 * can exist simultaneously, including the default monitoring group for
1103 	 * each control group. Hence, advertise the whole rmid_idx space even
1104 	 * though each control group has its own pmg/rmid space. Unfortunately,
1105 	 * this does mean userspace needs to know the architecture to correctly
1106 	 * interpret this value.
1107 	 */
1108 	l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
1109 
1110 	if (resctrl_enable_mon_event(type, false, 0, NULL))
1111 		l3->mon_capable = true;
1112 
1113 	return 0;
1114 }
1115 
1116 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1117 			    u32 closid, enum resctrl_conf_type type)
1118 {
1119 	u32 partid;
1120 	struct mpam_config *cfg;
1121 	struct mpam_props *cprops;
1122 	struct mpam_resctrl_res *res;
1123 	struct mpam_resctrl_dom *dom;
1124 	enum mpam_device_features configured_by;
1125 
1126 	lockdep_assert_cpus_held();
1127 
1128 	if (!mpam_is_enabled())
1129 		return resctrl_get_default_ctrl(r);
1130 
1131 	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
1132 	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
1133 	cprops = &res->class->props;
1134 
1135 	/*
1136 	 * When CDP is enabled, but the resource doesn't support it,
1137 	 * the control is cloned across both partids.
1138 	 * Both copies hold the same value, so read either; pick the data copy:
1139 	 */
1140 	if (mpam_resctrl_hide_cdp(r->rid))
1141 		type = CDP_DATA;
1142 
1143 	partid = resctrl_get_config_index(closid, type);
1144 	cfg = &dom->ctrl_comp->cfg[partid];
1145 
1146 	switch (r->rid) {
1147 	case RDT_RESOURCE_L2:
1148 	case RDT_RESOURCE_L3:
1149 		configured_by = mpam_feat_cpor_part;
1150 		break;
1151 	case RDT_RESOURCE_MBA:
1152 		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
1153 			configured_by = mpam_feat_mbw_max;
1154 			break;
1155 		}
1156 		fallthrough;
1157 	default:
1158 		return resctrl_get_default_ctrl(r);
1159 	}
1160 
1161 	if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) ||
1162 	    !mpam_has_feature(configured_by, cfg))
1163 		return resctrl_get_default_ctrl(r);
1164 
1165 	switch (configured_by) {
1166 	case mpam_feat_cpor_part:
1167 		return cfg->cpbm;
1168 	case mpam_feat_mbw_max:
1169 		return mbw_max_to_percent(cfg->mbw_max, cprops);
1170 	default:
1171 		return resctrl_get_default_ctrl(r);
1172 	}
1173 }
1174 
1175 int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1176 			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
1177 {
1178 	int err;
1179 	u32 partid;
1180 	struct mpam_config cfg;
1181 	struct mpam_props *cprops;
1182 	struct mpam_resctrl_res *res;
1183 	struct mpam_resctrl_dom *dom;
1184 
1185 	lockdep_assert_cpus_held();
1186 	lockdep_assert_irqs_enabled();
1187 
1188 	if (!mpam_is_enabled())
1189 		return -EINVAL;
1190 
1191 	/*
1192 	 * No need to check the CPU as mpam_apply_config() doesn't care, and
1193 	 * resctrl_arch_update_domains() relies on this.
1194 	 */
1195 	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
1196 	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
1197 	cprops = &res->class->props;
1198 
1199 	if (mpam_resctrl_hide_cdp(r->rid))
1200 		t = CDP_DATA;
1201 
1202 	partid = resctrl_get_config_index(closid, t);
1203 	if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) {
1204 		pr_debug("Not alloc capable or computed PARTID out of range\n");
1205 		return -EINVAL;
1206 	}
1207 
1208 	/*
1209 	 * Copy the current config to avoid clearing other resources when the
1210 	 * same component is exposed multiple times through resctrl.
1211 	 */
1212 	cfg = dom->ctrl_comp->cfg[partid];
1213 
1214 	switch (r->rid) {
1215 	case RDT_RESOURCE_L2:
1216 	case RDT_RESOURCE_L3:
1217 		cfg.cpbm = cfg_val;
1218 		mpam_set_feature(mpam_feat_cpor_part, &cfg);
1219 		break;
1220 	case RDT_RESOURCE_MBA:
1221 		if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
1222 			cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
1223 			mpam_set_feature(mpam_feat_mbw_max, &cfg);
1224 			break;
1225 		}
1226 		fallthrough;
1227 	default:
1228 		return -EINVAL;
1229 	}
1230 
1231 	/*
1232 	 * When CDP is enabled, but the resource doesn't support it, we need to
1233 	 * apply the same configuration to the other partid.
1234 	 */
1235 	if (mpam_resctrl_hide_cdp(r->rid)) {
1236 		partid = resctrl_get_config_index(closid, CDP_CODE);
1237 		err = mpam_apply_config(dom->ctrl_comp, partid, &cfg);
1238 		if (err)
1239 			return err;
1240 
1241 		partid = resctrl_get_config_index(closid, CDP_DATA);
1242 		return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
1243 	}
1244 
1245 	return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
1246 }
1247 
1248 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
1249 {
1250 	int err;
1251 	struct rdt_ctrl_domain *d;
1252 
1253 	lockdep_assert_cpus_held();
1254 	lockdep_assert_irqs_enabled();
1255 
1256 	if (!mpam_is_enabled())
1257 		return -EINVAL;
1258 
1259 	list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) {
1260 		for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) {
1261 			struct resctrl_staged_config *cfg = &d->staged_config[t];
1262 
1263 			if (!cfg->have_new_ctrl)
1264 				continue;
1265 
1266 			err = resctrl_arch_update_one(r, d, closid, t,
1267 						      cfg->new_ctrl);
1268 			if (err)
1269 				return err;
1270 		}
1271 	}
1272 
1273 	return 0;
1274 }
1275 
1276 void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
1277 {
1278 	struct mpam_resctrl_res *res;
1279 
1280 	lockdep_assert_cpus_held();
1281 
1282 	if (!mpam_is_enabled())
1283 		return;
1284 
1285 	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
1286 	mpam_reset_class_locked(res->class);
1287 }
1288 
1289 static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp,
1290 					 enum resctrl_res_level rid,
1291 					 struct rdt_domain_hdr *hdr)
1292 {
1293 	lockdep_assert_cpus_held();
1294 
1295 	INIT_LIST_HEAD(&hdr->list);
1296 	hdr->id = mpam_resctrl_pick_domain_id(cpu, comp);
1297 	hdr->rid = rid;
1298 	cpumask_set_cpu(cpu, &hdr->cpu_mask);
1299 }
1300 
1301 static void mpam_resctrl_online_domain_hdr(unsigned int cpu,
1302 					   struct rdt_domain_hdr *hdr)
1303 {
1304 	lockdep_assert_cpus_held();
1305 
1306 	cpumask_set_cpu(cpu, &hdr->cpu_mask);
1307 }
1308 
1309 /**
1310  * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU.
1311  * @cpu:	The CPU to remove from the domain.
1312  * @hdr:	The domain's header.
1313  *
1314  * Removes @cpu from the header mask. If this was the last CPU in the domain,
1315  * the domain header is removed from its parent list and true is returned,
1316  * indicating the parent structure can be freed.
1317  * If there are other CPUs in the domain, returns false.
1318  */
1319 static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu,
1320 					    struct rdt_domain_hdr *hdr)
1321 {
1322 	lockdep_assert_held(&domain_list_lock);
1323 
1324 	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
1325 	if (cpumask_empty(&hdr->cpu_mask)) {
1326 		list_del_rcu(&hdr->list);
1327 		synchronize_rcu();
1328 		return true;
1329 	}
1330 
1331 	return false;
1332 }
1333 
1334 static void mpam_resctrl_domain_insert(struct list_head *list,
1335 				       struct rdt_domain_hdr *new)
1336 {
1337 	struct rdt_domain_hdr *err;
1338 	struct list_head *pos = NULL;
1339 
1340 	lockdep_assert_held(&domain_list_lock);
1341 
1342 	err = resctrl_find_domain(list, new->id, &pos);
1343 	if (WARN_ON_ONCE(err))
1344 		return;
1345 
1346 	list_add_tail_rcu(&new->list, pos);
1347 }
1348 
1349 static struct mpam_component *find_component(struct mpam_class *class, int cpu)
1350 {
1351 	struct mpam_component *comp;
1352 
1353 	guard(srcu)(&mpam_srcu);
1354 	list_for_each_entry_srcu(comp, &class->components, class_list,
1355 				 srcu_read_lock_held(&mpam_srcu)) {
1356 		if (cpumask_test_cpu(cpu, &comp->affinity))
1357 			return comp;
1358 	}
1359 
1360 	return NULL;
1361 }
1362 
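/*
 * Create the mpam_resctrl_dom covering this CPU: find the class component
 * the CPU belongs to, then register control and/or monitor domains with
 * resctrl as the resource's capabilities require.
 */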
1363 static struct mpam_resctrl_dom *
1364 mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res)
1365 {
1366 	int err;
1367 	struct mpam_resctrl_dom *dom;
1368 	struct rdt_l3_mon_domain *mon_d;
1369 	struct rdt_ctrl_domain *ctrl_d;
1370 	struct mpam_class *class = res->class;
1371 	struct mpam_component *comp_iter, *ctrl_comp;
1372 	struct rdt_resource *r = &res->resctrl_res;
1373 
1374 	lockdep_assert_held(&domain_list_lock);
1375 
1376 	ctrl_comp = NULL;
1377 	guard(srcu)(&mpam_srcu);
1378 	list_for_each_entry_srcu(comp_iter, &class->components, class_list,
1379 				 srcu_read_lock_held(&mpam_srcu)) {
1380 		if (cpumask_test_cpu(cpu, &comp_iter->affinity)) {
1381 			ctrl_comp = comp_iter;
1382 			break;
1383 		}
1384 	}
1385 
1386 	/* class has no component for this CPU */
1387 	if (WARN_ON_ONCE(!ctrl_comp))
1388 		return ERR_PTR(-EINVAL);
1389 
1390 	dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu));
1391 	if (!dom)
1392 		return ERR_PTR(-ENOMEM);
1393 
1394 	if (r->alloc_capable) {
1395 		dom->ctrl_comp = ctrl_comp;
1396 
1397 		ctrl_d = &dom->resctrl_ctrl_dom;
1398 		mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr);
1399 		ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN;
1400 		err = resctrl_online_ctrl_domain(r, ctrl_d);
1401 		if (err)
1402 			goto free_domain;
1403 
1404 		mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr);
1405 	} else {
1406 		pr_debug("Skipped control domain online - no controls\n");
1407 	}
1408 
1409 	if (r->mon_capable) {
1410 		struct mpam_component *any_mon_comp = NULL;
1411 		struct mpam_resctrl_mon *mon;
1412 		enum resctrl_event_id eventid;
1413 
1414 		/*
1415 		 * Even if the monitor domain is backed by a different
1416 		 * component, the L3 component IDs need to be used - but
1417 		 * there may be no ctrl_comp for the L3.
1418 		 * Search each event's class list for a component with
1419 		 * overlapping CPUs and set up the dom->mon_comp array.
1420 		 */
1421 
1422 		for_each_mpam_resctrl_mon(mon, eventid) {
1423 			struct mpam_component *mon_comp;
1424 
1425 			if (!mon->class)
1426 				continue;       // dummy resource
1427 
1428 			mon_comp = find_component(mon->class, cpu);
1429 			dom->mon_comp[eventid] = mon_comp;
1430 			if (mon_comp)
1431 				any_mon_comp = mon_comp;
1432 		}
1433 		if (!any_mon_comp) {
1434 			WARN_ON_ONCE(1);
1435 			err = -EFAULT;
1436 			goto offline_ctrl_domain;
1437 		}
1438 
1439 		mon_d = &dom->resctrl_mon_dom;
1440 		mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr);
1441 		mon_d->hdr.type = RESCTRL_MON_DOMAIN;
1442 		err = resctrl_online_mon_domain(r, &mon_d->hdr);
1443 		if (err)
1444 			goto offline_ctrl_domain;
1445 
1446 		mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr);
1447 	} else {
1448 		pr_debug("Skipped monitor domain online - no monitors\n");
1449 	}
1450 
1451 	return dom;
1452 
1453 offline_ctrl_domain:
1454 	if (r->alloc_capable) {
1455 		mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
1456 		resctrl_offline_ctrl_domain(r, ctrl_d);
1457 	}
1458 free_domain:
1459 	kfree(dom);
1460 	dom = ERR_PTR(err);
1461 
1462 	return dom;
1463 }
1464 
1465 /*
1466  * We know all the monitors are associated with the L3, even if there are no
1467  * controls and therefore no control component. Find the cache-id for the CPU
1468  * and use that to search for existing resctrl domains.
1469  * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id
1470  * for anything that is not a cache.
1471  */
1472 static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu)
1473 {
1474 	int cache_id;
1475 	struct mpam_resctrl_dom *dom;
1476 	struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3];
1477 
1478 	lockdep_assert_cpus_held();
1479 
1480 	if (!l3->class)
1481 		return NULL;
1482 	cache_id = get_cpu_cacheinfo_id(cpu, 3);
1483 	if (cache_id < 0)
1484 		return NULL;
1485 
1486 	list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) {
1487 		if (dom->resctrl_mon_dom.hdr.id == cache_id)
1488 			return dom;
1489 	}
1490 
1491 	return NULL;
1492 }
1493 
1494 static struct mpam_resctrl_dom *
1495 mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res)
1496 {
1497 	struct mpam_resctrl_dom *dom;
1498 	struct rdt_resource *r = &res->resctrl_res;
1499 
1500 	lockdep_assert_cpus_held();
1501 
1502 	list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) {
1503 		if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity))
1504 			return dom;
1505 	}
1506 
1507 	if (r->rid != RDT_RESOURCE_L3)
1508 		return NULL;
1509 
1510 	/* Search the mon domain list too - needed on monitor-only platforms. */
1511 	return mpam_resctrl_get_mon_domain_from_cpu(cpu);
1512 }
1513 
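/*
 * Called when a CPU comes online. For each resource backed by a class, add
 * the CPU to the domain that covers it, creating the domain if this is the
 * first online CPU it contains.
 */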
1514 int mpam_resctrl_online_cpu(unsigned int cpu)
1515 {
1516 	struct mpam_resctrl_res *res;
1517 	enum resctrl_res_level rid;
1518 
1519 	guard(mutex)(&domain_list_lock);
1520 	for_each_mpam_resctrl_control(res, rid) {
1521 		struct mpam_resctrl_dom *dom;
1522 		struct rdt_resource *r = &res->resctrl_res;
1523 
1524 		if (!res->class)
1525 			continue;	// dummy_resource;
1526 
1527 		dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
1528 		if (!dom) {
1529 			dom = mpam_resctrl_alloc_domain(cpu, res);
1530 			if (IS_ERR(dom))
1531 				return PTR_ERR(dom);
1532 		} else {
1533 			if (r->alloc_capable) {
1534 				struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom;
1535 
1536 				mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr);
1537 			}
1538 			if (r->mon_capable) {
1539 				struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom;
1540 
1541 				mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr);
1542 			}
1543 		}
1544 	}
1545 
1546 	resctrl_online_cpu(cpu);
1547 
1548 	return 0;
1549 }
1550 
1551 void mpam_resctrl_offline_cpu(unsigned int cpu)
1552 {
1553 	struct mpam_resctrl_res *res;
1554 	enum resctrl_res_level rid;
1555 
1556 	resctrl_offline_cpu(cpu);
1557 
1558 	guard(mutex)(&domain_list_lock);
1559 	for_each_mpam_resctrl_control(res, rid) {
1560 		struct mpam_resctrl_dom *dom;
1561 		struct rdt_l3_mon_domain *mon_d;
1562 		struct rdt_ctrl_domain *ctrl_d;
1563 		bool ctrl_dom_empty, mon_dom_empty;
1564 		struct rdt_resource *r = &res->resctrl_res;
1565 
1566 		if (!res->class)
1567 			continue;	// dummy resource
1568 
1569 		dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
1570 		if (WARN_ON_ONCE(!dom))
1571 			continue;
1572 
1573 		if (r->alloc_capable) {
1574 			ctrl_d = &dom->resctrl_ctrl_dom;
1575 			ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
1576 			if (ctrl_dom_empty)
1577 				resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d);
1578 		} else {
1579 			ctrl_dom_empty = true;
1580 		}
1581 
1582 		if (r->mon_capable) {
1583 			mon_d = &dom->resctrl_mon_dom;
1584 			mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr);
1585 			if (mon_dom_empty)
1586 				resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr);
1587 		} else {
1588 			mon_dom_empty = true;
1589 		}
1590 
1591 		if (ctrl_dom_empty && mon_dom_empty)
1592 			kfree(dom);
1593 	}
1594 }
1595 
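/*
 * Pick the classes that will back resctrl's resources and events, initialise
 * the resctrl structures from them, and register with the resctrl
 * filesystem. Waits for cacheinfo, as the cache sizes and ids are needed.
 */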
1596 int mpam_resctrl_setup(void)
1597 {
1598 	int err = 0;
1599 	struct mpam_resctrl_res *res;
1600 	enum resctrl_res_level rid;
1601 	struct mpam_resctrl_mon *mon;
1602 	enum resctrl_event_id eventid;
1603 
1604 	wait_event(wait_cacheinfo_ready, cacheinfo_ready);
1605 
1606 	cpus_read_lock();
1607 	for_each_mpam_resctrl_control(res, rid) {
1608 		INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains);
1609 		INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains);
1610 		res->resctrl_res.rid = rid;
1611 	}
1612 
1613 	/* Find some classes to use for controls */
1614 	mpam_resctrl_pick_caches();
1615 	mpam_resctrl_pick_mba();
1616 
1617 	/* Initialise the resctrl structures from the classes */
1618 	for_each_mpam_resctrl_control(res, rid) {
1619 		if (!res->class)
1620 			continue;	// dummy resource
1621 
1622 		err = mpam_resctrl_control_init(res);
1623 		if (err) {
1624 			pr_debug("Failed to initialise rid %u\n", rid);
1625 			goto internal_error;
1626 		}
1627 	}
1628 
1629 	/* Find some classes to use for monitors */
1630 	mpam_resctrl_pick_counters();
1631 
1632 	for_each_mpam_resctrl_mon(mon, eventid) {
1633 		if (!mon->class)
1634 			continue;	// dummy resource
1635 
1636 		err = mpam_resctrl_monitor_init(mon, eventid);
1637 		if (err) {
1638 			pr_debug("Failed to initialise event %u\n", eventid);
1639 			goto internal_error;
1640 		}
1641 	}
1642 
1643 	cpus_read_unlock();
1644 
1645 	if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) {
1646 		pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n",
1647 			 resctrl_arch_alloc_capable(), resctrl_arch_mon_capable());
1648 		return -EOPNOTSUPP;
1649 	}
1650 
1651 	err = resctrl_init();
1652 	if (err)
1653 		return err;
1654 
1655 	WRITE_ONCE(resctrl_enabled, true);
1656 
1657 	return 0;
1658 
1659 internal_error:
1660 	cpus_read_unlock();
1661 	pr_debug("Internal error %d - resctrl not supported\n", err);
1662 	return err;
1663 }
1664 
1665 void mpam_resctrl_exit(void)
1666 {
1667 	if (!READ_ONCE(resctrl_enabled))
1668 		return;
1669 
1670 	WRITE_ONCE(resctrl_enabled, false);
1671 	resctrl_exit();
1672 }
1673 
1674 /*
1675  * The driver is detaching an MSC from this class; if resctrl was using it,
1676  * pull on resctrl_exit().
1677  */
1678 void mpam_resctrl_teardown_class(struct mpam_class *class)
1679 {
1680 	struct mpam_resctrl_res *res;
1681 	enum resctrl_res_level rid;
1682 	struct mpam_resctrl_mon *mon;
1683 	enum resctrl_event_id eventid;
1684 
1685 	might_sleep();
1686 
1687 	for_each_mpam_resctrl_control(res, rid) {
1688 		if (res->class == class) {
1689 			res->class = NULL;
1690 			break;
1691 		}
1692 	}
1693 	for_each_mpam_resctrl_mon(mon, eventid) {
1694 		if (mon->class == class) {
1695 			mon->class = NULL;
1696 			break;
1697 		}
1698 	}
1699 }
1700 
1701 static int __init __cacheinfo_ready(void)
1702 {
1703 	cacheinfo_ready = true;
1704 	wake_up(&wait_cacheinfo_ready);
1705 
1706 	return 0;
1707 }
1708 device_initcall_sync(__cacheinfo_ready);
1709 
1710 #ifdef CONFIG_MPAM_KUNIT_TEST
1711 #include "test_mpam_resctrl.c"
1712 #endif
1713