xref: /linux/arch/x86/kernel/cpu/resctrl/core.c (revision 8f6b6ad69b50bf16bb762ffafbfa44a4884f9a17)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Resource Director Technology(RDT)
4  * - Cache Allocation code.
5  *
6  * Copyright (C) 2016 Intel Corporation
7  *
8  * Authors:
9  *    Fenghua Yu <fenghua.yu@intel.com>
10  *    Tony Luck <tony.luck@intel.com>
11  *    Vikas Shivappa <vikas.shivappa@intel.com>
12  *
13  * More information about RDT be found in the Intel (R) x86 Architecture
14  * Software Developer Manual June 2016, volume 3, section 17.17.
15  */
16 
17 #define pr_fmt(fmt)	"resctrl: " fmt
18 
19 #include <linux/cpu.h>
20 #include <linux/slab.h>
21 #include <linux/err.h>
22 #include <linux/cpuhotplug.h>
23 
24 #include <asm/cpu_device_id.h>
25 #include <asm/msr.h>
26 #include <asm/resctrl.h>
27 #include "internal.h"
28 
/*
 * rdt_domain structures are kfree()d when their last CPU goes offline,
 * and allocated when the first CPU in a new domain comes online.
 * The rdt_resource's domain list is updated when this happens. Readers of
 * the domain list must either take cpus_read_lock(), or rely on an RCU
 * read-side critical section, to avoid observing concurrent modification.
 * All writers take this mutex:
 */
static DEFINE_MUTEX(domain_list_lock);

/*
 * The cached resctrl_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Functions which modify the state
 * are called with interrupts disabled and no preemption, which
 * is sufficient for the protection.
 */
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);

/*
 * Global boolean for rdt_alloc which is true if any
 * resource allocation is enabled.
 */
bool rdt_alloc_capable;

/* Vendor-specific MSR writers, installed into rdt_hw_resource::msr_update. */
static void mba_wrmsr_intel(struct msr_param *m);
static void cat_wrmsr(struct msr_param *m);
static void mba_wrmsr_amd(struct msr_param *m);

/* Static initializers for the (initially empty) per-resource domain lists. */
#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
59 
/*
 * Table of every resource resctrl can manage on x86, indexed by
 * enum value. Vendor-specific fields (msr_base/msr_update for MBA and
 * SMBA) are filled in later by rdt_init_res_defs().
 */
struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
	[RDT_RESOURCE_L3] =
	{
		.r_resctrl = {
			.name			= "L3",
			.ctrl_scope		= RESCTRL_L3_CACHE,
			.mon_scope		= RESCTRL_L3_CACHE,
			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L3),
			.mon_domains		= mon_domain_init(RDT_RESOURCE_L3),
			.schema_fmt		= RESCTRL_SCHEMA_BITMAP,
		},
		.msr_base		= MSR_IA32_L3_CBM_BASE,
		.msr_update		= cat_wrmsr,
	},
	[RDT_RESOURCE_L2] =
	{
		.r_resctrl = {
			.name			= "L2",
			.ctrl_scope		= RESCTRL_L2_CACHE,
			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L2),
			.schema_fmt		= RESCTRL_SCHEMA_BITMAP,
		},
		.msr_base		= MSR_IA32_L2_CBM_BASE,
		.msr_update		= cat_wrmsr,
	},
	[RDT_RESOURCE_MBA] =
	{
		.r_resctrl = {
			.name			= "MB",
			.ctrl_scope		= RESCTRL_L3_CACHE,
			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_MBA),
			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
		},
	},
	[RDT_RESOURCE_SMBA] =
	{
		.r_resctrl = {
			.name			= "SMBA",
			.ctrl_scope		= RESCTRL_L3_CACHE,
			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_SMBA),
			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
		},
	},
	[RDT_RESOURCE_PERF_PKG] =
	{
		.r_resctrl = {
			.name			= "PERF_PKG",
			.mon_scope		= RESCTRL_PACKAGE,
			.mon_domains		= mon_domain_init(RDT_RESOURCE_PERF_PKG),
		},
	},
};
112 
113 u32 resctrl_arch_system_num_rmid_idx(void)
114 {
115 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
116 
117 	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
118 	return r->mon.num_rmid;
119 }
120 
121 struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
122 {
123 	if (l >= RDT_NUM_RESOURCES)
124 		return NULL;
125 
126 	return &rdt_resources_all[l].r_resctrl;
127 }
128 
/*
 * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
 * as they do not have CPUID enumeration support for Cache allocation.
 * The check for Vendor/Family/Model is not enough to guarantee that
 * the MSRs won't #GP fault because only the following SKUs support
 * CAT:
 *	Intel(R) Xeon(R)  CPU E5-2658  v3  @  2.20GHz
 *	Intel(R) Xeon(R)  CPU E5-2648L v3  @  1.80GHz
 *	Intel(R) Xeon(R)  CPU E5-2628L v3  @  2.00GHz
 *	Intel(R) Xeon(R)  CPU E5-2618L v3  @  2.30GHz
 *	Intel(R) Xeon(R)  CPU E5-2608L v3  @  2.00GHz
 *	Intel(R) Xeon(R)  CPU E5-2658A v3  @  2.20GHz
 *
 * Probe by trying to write the first of the L3 cache mask registers
 * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
 * is always 20 on hsw server parts. The minimum cache bitmask length
 * allowed for HSW server is always 2 bits. Hardcode all of them.
 */
static inline void cache_alloc_hsw_probe(void)
{
	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
	struct rdt_resource *r  = &hw_res->r_resctrl;
	u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;

	/* A failing (safe) write means the MSR doesn't exist: not CAT capable. */
	if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
		return;

	rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);

	/* If all the bits were set in MSR, return success */
	if (l3_cbm_0 != max_cbm)
		return;

	/* Hardcoded HSW-server CAT properties (no CPUID enumeration). */
	hw_res->num_closid = 4;
	r->cache.cbm_len = 20;
	r->cache.shareable_bits = 0xc0000;
	r->cache.min_cbm_bits = 2;
	r->cache.arch_has_sparse_bitmasks = false;
	r->alloc_capable = true;

	rdt_alloc_capable = true;
}
171 
172 /*
173  * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
174  * exposed to user interface and the h/w understandable delay values.
175  *
176  * The non-linear delay values have the granularity of power of two
177  * and also the h/w does not guarantee a curve for configured delay
178  * values vs. actual b/w enforced.
179  * Hence we need a mapping that is pre calibrated so the user can
180  * express the memory b/w as a percentage value.
181  */
182 static inline bool rdt_get_mb_table(struct rdt_resource *r)
183 {
184 	/*
185 	 * There are no Intel SKUs as of now to support non-linear delay.
186 	 */
187 	pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
188 		boot_cpu_data.x86, boot_cpu_data.x86_model);
189 
190 	return false;
191 }
192 
/*
 * __get_mem_config_intel() - Enumerate Intel MBA properties from CPUID
 * leaf 0x10, subleaf 3, and fill in @r's membw parameters.
 *
 * Return: true and mark @r allocation capable on success, false if the
 * hardware reports an (unsupported) non-linear delay scale.
 */
static __init bool __get_mem_config_intel(struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	union cpuid_0x10_3_eax eax;
	union cpuid_0x10_x_edx edx;
	u32 ebx, ecx, max_delay;

	cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
	hw_res->num_closid = edx.split.cos_max + 1;
	max_delay = eax.split.max_delay + 1;
	r->membw.max_bw = MAX_MBA_BW;
	r->membw.arch_needs_linear = true;
	if (ecx & MBA_IS_LINEAR) {
		/* Linear scale: minimum and granularity derive from max delay. */
		r->membw.delay_linear = true;
		r->membw.min_bw = MAX_MBA_BW - max_delay;
		r->membw.bw_gran = MAX_MBA_BW - max_delay;
	} else {
		/* Non-linear delay: needs a calibration table (none exist yet). */
		if (!rdt_get_mb_table(r))
			return false;
		r->membw.arch_needs_linear = false;
	}

	if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
		r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
	else
		r->membw.throttle_mode = THREAD_THROTTLE_MAX;

	r->alloc_capable = true;

	return true;
}
224 
/*
 * __rdt_get_mem_config_amd() - Enumerate AMD MBA/SMBA properties from
 * CPUID leaf 0x80000020 and fill in @r's membw parameters.
 *
 * Return: true; @r is always marked allocation capable.
 */
static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	u32 eax, ebx, ecx, edx, subleaf;

	/*
	 * Query CPUID_Fn80000020_EDX_x01 for MBA and
	 * CPUID_Fn80000020_EDX_x02 for SMBA
	 */
	subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 :  1;

	cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
	hw_res->num_closid = edx + 1;
	r->membw.max_bw = 1 << eax;	/* EAX holds log2 of the maximum bandwidth */

	/* AMD does not use delay */
	r->membw.delay_linear = false;
	r->membw.arch_needs_linear = false;

	/*
	 * AMD does not use memory delay throttle model to control
	 * the allocation like Intel does.
	 */
	r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
	r->membw.min_bw = 0;
	r->membw.bw_gran = 1;

	r->alloc_capable = true;

	return true;
}
256 
/*
 * rdt_get_cache_alloc_cfg() - Enumerate cache allocation properties for
 * @r from CPUID leaf 0x10, subleaf @idx (1 = L3, 2 = L2; same format).
 */
static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	union cpuid_0x10_1_eax eax;
	union cpuid_0x10_x_ecx ecx;
	union cpuid_0x10_x_edx edx;
	u32 ebx, default_ctrl;

	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
	hw_res->num_closid = edx.split.cos_max + 1;
	r->cache.cbm_len = eax.split.cbm_len + 1;
	/* All-ones mask of cbm_len bits, used to trim the shareable bits. */
	default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
	r->cache.shareable_bits = ebx & default_ctrl;
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
	r->alloc_capable = true;
}
274 
275 static void rdt_get_cdp_config(int level)
276 {
277 	/*
278 	 * By default, CDP is disabled. CDP can be enabled by mount parameter
279 	 * "cdp" during resctrl file system mount time.
280 	 */
281 	rdt_resources_all[level].cdp_enabled = false;
282 	rdt_resources_all[level].r_resctrl.cdp_capable = true;
283 }
284 
/* Mark @r as capable of I/O cache allocation (e.g. via SDCIAE). */
static void rdt_set_io_alloc_capable(struct rdt_resource *r)
{
	r->cache.io_alloc_capable = true;
}
289 
/* Convenience wrappers to mark L3/L2 CDP capable. */
static void rdt_get_cdp_l3_config(void)
{
	rdt_get_cdp_config(RDT_RESOURCE_L3);
}

static void rdt_get_cdp_l2_config(void)
{
	rdt_get_cdp_config(RDT_RESOURCE_L2);
}
299 
300 static void mba_wrmsr_amd(struct msr_param *m)
301 {
302 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
303 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
304 	unsigned int i;
305 
306 	for (i = m->low; i < m->high; i++)
307 		wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
308 }
309 
310 /*
311  * Map the memory b/w percentage value to delay values
312  * that can be written to QOS_MSRs.
313  * There are currently no SKUs which support non linear delay values.
314  */
315 static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
316 {
317 	if (r->membw.delay_linear)
318 		return MAX_MBA_BW - bw;
319 
320 	pr_warn_once("Non Linear delay-bw map not supported but queried\n");
321 	return MAX_MBA_BW;
322 }
323 
324 static void mba_wrmsr_intel(struct msr_param *m)
325 {
326 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
327 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
328 	unsigned int i;
329 
330 	/*  Write the delay values for mba. */
331 	for (i = m->low; i < m->high; i++)
332 		wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
333 }
334 
335 static void cat_wrmsr(struct msr_param *m)
336 {
337 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
338 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
339 	unsigned int i;
340 
341 	for (i = m->low; i < m->high; i++)
342 		wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
343 }
344 
345 u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
346 {
347 	return resctrl_to_arch_res(r)->num_closid;
348 }
349 
350 void rdt_ctrl_update(void *arg)
351 {
352 	struct rdt_hw_resource *hw_res;
353 	struct msr_param *m = arg;
354 
355 	hw_res = resctrl_to_arch_res(m->res);
356 	hw_res->msr_update(m);
357 }
358 
359 static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
360 {
361 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
362 	int i;
363 
364 	/*
365 	 * Initialize the Control MSRs to having no control.
366 	 * For Cache Allocation: Set all bits in cbm
367 	 * For Memory Allocation: Set b/w requested to 100%
368 	 */
369 	for (i = 0; i < hw_res->num_closid; i++, dc++)
370 		*dc = resctrl_get_default_ctrl(r);
371 }
372 
/* Free a control domain together with its CLOSID-indexed value array. */
static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
{
	kfree(hw_dom->ctrl_val);
	kfree(hw_dom);
}
378 
/*
 * Free an L3 monitoring domain and its per-event MBM state arrays.
 * Slots that were never allocated are NULL, which kfree() tolerates.
 */
static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom)
{
	int idx;

	for_each_mbm_idx(idx)
		kfree(hw_dom->arch_mbm_states[idx]);
	kfree(hw_dom);
}
387 
/*
 * domain_setup_ctrlval() - Allocate @d's per-CLOSID control array, fill
 * it with the resource default and push every value to the hardware MSRs.
 *
 * Return: 0 on success, -ENOMEM if the array cannot be allocated.
 */
static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct msr_param m;
	u32 *dc;

	dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
			   GFP_KERNEL);
	if (!dc)
		return -ENOMEM;

	hw_dom->ctrl_val = dc;
	setup_default_ctrlval(r, dc);

	/* Write the entire CLOSID range [0, num_closid) to hardware. */
	m.res = r;
	m.dom = d;
	m.low = 0;
	m.high = hw_res->num_closid;
	hw_res->msr_update(&m);
	return 0;
}
410 
/**
 * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
 * @num_rmid:	The size of the MBM counter array
 * @hw_dom:	The domain that owns the allocated arrays
 *
 * Only events that are currently enabled get an array; disabled events
 * leave a NULL slot. On failure every slot is freed and reset to NULL.
 *
 * Return:	0 for success, or -ENOMEM.
 */
static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom)
{
	size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
	enum resctrl_event_id eventid;
	int idx;

	for_each_mbm_event_id(eventid) {
		if (!resctrl_is_mon_event_enabled(eventid))
			continue;
		idx = MBM_STATE_IDX(eventid);
		hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
		if (!hw_dom->arch_mbm_states[idx])
			goto cleanup;
	}

	return 0;
cleanup:
	/* Unwind: free whatever was allocated and leave the slots NULL. */
	for_each_mbm_idx(idx) {
		kfree(hw_dom->arch_mbm_states[idx]);
		hw_dom->arch_mbm_states[idx] = NULL;
	}

	return -ENOMEM;
}
442 
443 static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
444 {
445 	switch (scope) {
446 	case RESCTRL_L2_CACHE:
447 	case RESCTRL_L3_CACHE:
448 		return get_cpu_cacheinfo_id(cpu, scope);
449 	case RESCTRL_L3_NODE:
450 		return cpu_to_node(cpu);
451 	case RESCTRL_PACKAGE:
452 		return topology_physical_package_id(cpu);
453 	default:
454 		break;
455 	}
456 
457 	return -EINVAL;
458 }
459 
/*
 * domain_add_cpu_ctrl() - Add @cpu to @r's control domain matching its
 * scope id, creating and onlining the domain if it does not yet exist.
 *
 * Called with domain_list_lock held. Allocation failure is not fatal:
 * the domain is simply left absent.
 */
static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
	struct rdt_hw_ctrl_domain *hw_dom;
	struct list_head *add_pos = NULL;
	struct rdt_domain_hdr *hdr;
	struct rdt_ctrl_domain *d;
	int err;

	lockdep_assert_held(&domain_list_lock);

	if (id < 0) {
		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->ctrl_scope, r->name);
		return;
	}

	hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
	if (hdr) {
		if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
			return;
		d = container_of(hdr, struct rdt_ctrl_domain, hdr);

		/* Domain already exists: just join it. */
		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
		if (r->cache.arch_has_per_cpu_cfg)
			rdt_domain_reconfigure_cdp(r);
		return;
	}

	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
	if (!hw_dom)
		return;

	d = &hw_dom->d_resctrl;
	d->hdr.id = id;
	d->hdr.type = RESCTRL_CTRL_DOMAIN;
	d->hdr.rid = r->rid;
	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);

	rdt_domain_reconfigure_cdp(r);

	if (domain_setup_ctrlval(r, d)) {
		ctrl_domain_free(hw_dom);
		return;
	}

	/* Publish only after setup; readers use RCU or cpus_read_lock(). */
	list_add_tail_rcu(&d->hdr.list, add_pos);

	err = resctrl_online_ctrl_domain(r, d);
	if (err) {
		list_del_rcu(&d->hdr.list);
		synchronize_rcu();
		ctrl_domain_free(hw_dom);
	}
}
515 
/*
 * l3_mon_domain_setup() - Allocate, initialise and publish a new L3
 * monitoring domain with id @id, inserting it at @add_pos.
 *
 * Failures leave the domain absent; a later CPU in the same scope may
 * retry.
 */
static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos)
{
	struct rdt_hw_l3_mon_domain *hw_dom;
	struct rdt_l3_mon_domain *d;
	struct cacheinfo *ci;
	int err;

	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
	if (!hw_dom)
		return;

	d = &hw_dom->d_resctrl;
	d->hdr.id = id;
	d->hdr.type = RESCTRL_MON_DOMAIN;
	d->hdr.rid = RDT_RESOURCE_L3;
	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
	if (!ci) {
		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
		l3_mon_domain_free(hw_dom);
		return;
	}
	d->ci_id = ci->id;
	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);

	arch_mon_domain_online(r, d);

	if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
		l3_mon_domain_free(hw_dom);
		return;
	}

	/* Publish only after setup; readers use RCU or cpus_read_lock(). */
	list_add_tail_rcu(&d->hdr.list, add_pos);

	err = resctrl_online_mon_domain(r, &d->hdr);
	if (err) {
		list_del_rcu(&d->hdr.list);
		synchronize_rcu();
		l3_mon_domain_free(hw_dom);
	}
}
556 
/*
 * domain_add_cpu_mon() - Add @cpu to @r's monitoring domain matching its
 * scope id, creating the domain if it does not yet exist.
 *
 * Called with domain_list_lock held.
 */
static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->mon_scope);
	struct list_head *add_pos = NULL;
	struct rdt_domain_hdr *hdr;

	lockdep_assert_held(&domain_list_lock);

	if (id < 0) {
		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->mon_scope, r->name);
		return;
	}

	/* An existing domain just gains this CPU in its mask. */
	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
	if (hdr)
		cpumask_set_cpu(cpu, &hdr->cpu_mask);

	switch (r->rid) {
	case RDT_RESOURCE_L3:
		/* Update the mbm_assign_mode state for the CPU if supported */
		if (r->mon.mbm_cntr_assignable)
			resctrl_arch_mbm_cntr_assign_set_one(r);
		if (!hdr)
			l3_mon_domain_setup(cpu, id, r, add_pos);
		break;
	default:
		pr_warn_once("Unknown resource rid=%d\n", r->rid);
		break;
	}
}
588 
/* Add @cpu to @r's control and/or monitoring domains, as capable. */
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
	if (r->alloc_capable)
		domain_add_cpu_ctrl(cpu, r);
	if (r->mon_capable)
		domain_add_cpu_mon(cpu, r);
}
596 
/*
 * domain_remove_cpu_ctrl() - Remove @cpu from its control domain in @r,
 * tearing the domain down when its last CPU goes offline.
 *
 * Called with domain_list_lock held.
 */
static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
	struct rdt_hw_ctrl_domain *hw_dom;
	struct rdt_domain_hdr *hdr;
	struct rdt_ctrl_domain *d;

	lockdep_assert_held(&domain_list_lock);

	if (id < 0) {
		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->ctrl_scope, r->name);
		return;
	}

	hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
	if (!hdr) {
		pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
			id, cpu, r->name);
		return;
	}

	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
	if (!cpumask_empty(&hdr->cpu_mask))
		return;

	/* Last CPU in the domain: offline and free it. */
	if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
		return;

	d = container_of(hdr, struct rdt_ctrl_domain, hdr);
	hw_dom = resctrl_to_arch_ctrl_dom(d);

	resctrl_offline_ctrl_domain(r, d);
	list_del_rcu(&hdr->list);
	synchronize_rcu();

	/*
	 * rdt_ctrl_domain "d" is going to be freed below, so clear
	 * its pointer from pseudo_lock_region struct.
	 */
	if (d->plr)
		d->plr->d = NULL;
	ctrl_domain_free(hw_dom);
}
641 
/*
 * domain_remove_cpu_mon() - Remove @cpu from its monitoring domain in
 * @r, tearing the domain down when its last CPU goes offline.
 *
 * Called with domain_list_lock held.
 */
static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->mon_scope);
	struct rdt_domain_hdr *hdr;

	lockdep_assert_held(&domain_list_lock);

	if (id < 0) {
		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->mon_scope, r->name);
		return;
	}

	hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
	if (!hdr) {
		pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
			id, cpu, r->name);
		return;
	}

	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
	if (!cpumask_empty(&hdr->cpu_mask))
		return;

	/* Last CPU in the domain: offline and free per-resource state. */
	switch (r->rid) {
	case RDT_RESOURCE_L3: {
		struct rdt_hw_l3_mon_domain *hw_dom;
		struct rdt_l3_mon_domain *d;

		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
			return;

		d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
		hw_dom = resctrl_to_arch_mon_dom(d);
		resctrl_offline_mon_domain(r, hdr);
		list_del_rcu(&hdr->list);
		synchronize_rcu();
		l3_mon_domain_free(hw_dom);
		break;
	}
	default:
		pr_warn_once("Unknown resource rid=%d\n", r->rid);
		break;
	}
}
687 
/* Remove @cpu from @r's control and/or monitoring domains, as capable. */
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
{
	if (r->alloc_capable)
		domain_remove_cpu_ctrl(cpu, r);
	if (r->mon_capable)
		domain_remove_cpu_mon(cpu, r);
}
695 
/*
 * Reset the cached per-CPU CLOSID/RMID state and the PQR_ASSOC MSR to
 * the reserved defaults. Runs on the CPU being brought on/offline, so
 * this_cpu_ptr() and a plain wrmsr() are sufficient (@cpu is implied).
 */
static void clear_closid_rmid(int cpu)
{
	struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);

	state->default_closid = RESCTRL_RESERVED_CLOSID;
	state->default_rmid = RESCTRL_RESERVED_RMID;
	state->cur_closid = RESCTRL_RESERVED_CLOSID;
	state->cur_rmid = RESCTRL_RESERVED_RMID;
	wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
	      RESCTRL_RESERVED_CLOSID);
}
707 
/*
 * CPU hotplug "online" callback: add @cpu to the domains of every
 * capable resource, reset its PQR state, then notify filesystem code.
 */
static int resctrl_arch_online_cpu(unsigned int cpu)
{
	struct rdt_resource *r;

	mutex_lock(&domain_list_lock);
	for_each_capable_rdt_resource(r)
		domain_add_cpu(cpu, r);
	mutex_unlock(&domain_list_lock);

	clear_closid_rmid(cpu);
	resctrl_online_cpu(cpu);

	return 0;
}
722 
/*
 * CPU hotplug "offline" callback: mirror of resctrl_arch_online_cpu().
 * Filesystem teardown runs first, then the CPU leaves its domains.
 */
static int resctrl_arch_offline_cpu(unsigned int cpu)
{
	struct rdt_resource *r;

	resctrl_offline_cpu(cpu);

	mutex_lock(&domain_list_lock);
	for_each_capable_rdt_resource(r)
		domain_remove_cpu(cpu, r);
	mutex_unlock(&domain_list_lock);

	clear_closid_rmid(cpu);

	return 0;
}
738 
/*
 * resctrl_arch_pre_mount() - Arch hook run before the resctrl filesystem
 * is mounted. Discovers Intel Application Energy Telemetry events.
 *
 * The original "if (!intel_aet_get_events()) return;" was a dead branch:
 * nothing follows the call, so the return value needs no checking here.
 */
void resctrl_arch_pre_mount(void)
{
	intel_aet_get_events();
}
744 
/* Indices into rdt_options[] for the "rdt=" boot-option flags. */
enum {
	RDT_FLAG_CMT,
	RDT_FLAG_MBM_TOTAL,
	RDT_FLAG_MBM_LOCAL,
	RDT_FLAG_L3_CAT,
	RDT_FLAG_L3_CDP,
	RDT_FLAG_L2_CAT,
	RDT_FLAG_L2_CDP,
	RDT_FLAG_MBA,
	RDT_FLAG_SMBA,
	RDT_FLAG_BMEC,
	RDT_FLAG_ABMC,
	RDT_FLAG_SDCIAE,
};

/* Build one rdt_options[] entry tying an option name to a feature flag. */
#define RDT_OPT(idx, n, f)	\
[idx] = {			\
	.name = n,		\
	.flag = f		\
}

struct rdt_options {
	char	*name;
	int	flag;
	/* force_off/force_on are set from the "rdt=" command line option. */
	bool	force_off, force_on;
};

static struct rdt_options rdt_options[]  __ro_after_init = {
	RDT_OPT(RDT_FLAG_CMT,	    "cmt",	X86_FEATURE_CQM_OCCUP_LLC),
	RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
	RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),
	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),
	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2),
	RDT_OPT(RDT_FLAG_L2_CDP,    "l2cdp",	X86_FEATURE_CDP_L2),
	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),
	RDT_OPT(RDT_FLAG_SMBA,	    "smba",	X86_FEATURE_SMBA),
	RDT_OPT(RDT_FLAG_BMEC,	    "bmec",	X86_FEATURE_BMEC),
	RDT_OPT(RDT_FLAG_ABMC,	    "abmc",	X86_FEATURE_ABMC),
	RDT_OPT(RDT_FLAG_SDCIAE,    "sdciae",	X86_FEATURE_SDCIAE),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
787 
788 static int __init set_rdt_options(char *str)
789 {
790 	struct rdt_options *o;
791 	bool force_off;
792 	char *tok;
793 
794 	if (*str == '=')
795 		str++;
796 	while ((tok = strsep(&str, ",")) != NULL) {
797 		force_off = *tok == '!';
798 		if (force_off)
799 			tok++;
800 		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
801 			if (strcmp(tok, o->name) == 0) {
802 				if (force_off)
803 					o->force_off = true;
804 				else
805 					o->force_on = true;
806 				break;
807 			}
808 		}
809 	}
810 	return 1;
811 }
812 __setup("rdt", set_rdt_options);
813 
814 bool rdt_cpu_has(int flag)
815 {
816 	bool ret = boot_cpu_has(flag);
817 	struct rdt_options *o;
818 
819 	if (!ret)
820 		return ret;
821 
822 	for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
823 		if (flag == o->flag) {
824 			if (o->force_off)
825 				ret = false;
826 			if (o->force_on)
827 				ret = true;
828 			break;
829 		}
830 	}
831 	return ret;
832 }
833 
834 bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
835 {
836 	if (!rdt_cpu_has(X86_FEATURE_BMEC))
837 		return false;
838 
839 	switch (evt) {
840 	case QOS_L3_MBM_TOTAL_EVENT_ID:
841 		return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
842 	case QOS_L3_MBM_LOCAL_EVENT_ID:
843 		return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
844 	default:
845 		return false;
846 	}
847 }
848 
849 static __init bool get_mem_config(void)
850 {
851 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
852 
853 	if (!rdt_cpu_has(X86_FEATURE_MBA))
854 		return false;
855 
856 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
857 		return __get_mem_config_intel(&hw_res->r_resctrl);
858 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
859 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
860 
861 	return false;
862 }
863 
864 static __init bool get_slow_mem_config(void)
865 {
866 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
867 
868 	if (!rdt_cpu_has(X86_FEATURE_SMBA))
869 		return false;
870 
871 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
872 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
873 
874 	return false;
875 }
876 
/*
 * get_rdt_alloc_resources() - Detect every allocation-capable resource:
 * L3/L2 CAT (plus optional CDP and I/O allocation), MBA and SMBA.
 *
 * Return: true if at least one allocation resource is available.
 */
static __init bool get_rdt_alloc_resources(void)
{
	struct rdt_resource *r;
	bool ret = false;

	/* Already set by the Haswell-server MSR probe quirk. */
	if (rdt_alloc_capable)
		return true;

	if (!boot_cpu_has(X86_FEATURE_RDT_A))
		return false;

	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
		r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
		rdt_get_cache_alloc_cfg(1, r);
		if (rdt_cpu_has(X86_FEATURE_CDP_L3))
			rdt_get_cdp_l3_config();
		if (rdt_cpu_has(X86_FEATURE_SDCIAE))
			rdt_set_io_alloc_capable(r);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
		/* CPUID 0x10.2 fields are same format at 0x10.1 */
		r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
		rdt_get_cache_alloc_cfg(2, r);
		if (rdt_cpu_has(X86_FEATURE_CDP_L2))
			rdt_get_cdp_l2_config();
		ret = true;
	}

	if (get_mem_config())
		ret = true;

	if (get_slow_mem_config())
		ret = true;

	return ret;
}
914 
/*
 * get_rdt_mon_resources() - Enable the supported monitoring events and
 * read the L3 monitoring configuration.
 *
 * Return: true if monitoring is available and its configuration was
 * read successfully.
 */
static __init bool get_rdt_mon_resources(void)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	bool ret = false;

	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0);
		ret = true;
	}
	/* ABMC implies monitoring support even without the legacy events. */
	if (rdt_cpu_has(X86_FEATURE_ABMC))
		ret = true;

	if (!ret)
		return false;

	return !rdt_get_l3_mon_config(r);
}
940 
/*
 * Apply Intel model-specific quirks:
 *  - Haswell server: no CPUID enumeration for CAT, probe the MSRs.
 *  - Skylake server: force off features known to misbehave (more of
 *    them on steppings <= 4), then fall through for the MBM quirk.
 *  - Broadwell server: MBM quirk only.
 */
static __init void __check_quirks_intel(void)
{
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_HASWELL_X:
		if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
			cache_alloc_hsw_probe();
		break;
	case INTEL_SKYLAKE_X:
		if (boot_cpu_data.x86_stepping <= 4)
			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
		else
			set_rdt_options("!l3cat");
		fallthrough;
	case INTEL_BROADWELL_X:
		intel_rdt_mbm_apply_quirk();
		break;
	}
}
959 
/* Vendor dispatch for boot-time quirk handling (Intel-only today). */
static __init void check_quirks(void)
{
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		__check_quirks_intel();
}
965 
966 static __init bool get_rdt_resources(void)
967 {
968 	rdt_alloc_capable = get_rdt_alloc_resources();
969 	rdt_mon_capable = get_rdt_mon_resources();
970 
971 	return (rdt_mon_capable || rdt_alloc_capable);
972 }
973 
974 static __init void rdt_init_res_defs_intel(void)
975 {
976 	struct rdt_hw_resource *hw_res;
977 	struct rdt_resource *r;
978 
979 	for_each_rdt_resource(r) {
980 		hw_res = resctrl_to_arch_res(r);
981 
982 		if (r->rid == RDT_RESOURCE_L3 ||
983 		    r->rid == RDT_RESOURCE_L2) {
984 			r->cache.arch_has_per_cpu_cfg = false;
985 			r->cache.min_cbm_bits = 1;
986 		} else if (r->rid == RDT_RESOURCE_MBA) {
987 			hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
988 			hw_res->msr_update = mba_wrmsr_intel;
989 		}
990 	}
991 }
992 
993 static __init void rdt_init_res_defs_amd(void)
994 {
995 	struct rdt_hw_resource *hw_res;
996 	struct rdt_resource *r;
997 
998 	for_each_rdt_resource(r) {
999 		hw_res = resctrl_to_arch_res(r);
1000 
1001 		if (r->rid == RDT_RESOURCE_L3 ||
1002 		    r->rid == RDT_RESOURCE_L2) {
1003 			r->cache.arch_has_sparse_bitmasks = true;
1004 			r->cache.arch_has_per_cpu_cfg = true;
1005 			r->cache.min_cbm_bits = 0;
1006 		} else if (r->rid == RDT_RESOURCE_MBA) {
1007 			hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
1008 			hw_res->msr_update = mba_wrmsr_amd;
1009 		} else if (r->rid == RDT_RESOURCE_SMBA) {
1010 			hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
1011 			hw_res->msr_update = mba_wrmsr_amd;
1012 		}
1013 	}
1014 }
1015 
1016 static __init void rdt_init_res_defs(void)
1017 {
1018 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1019 		rdt_init_res_defs_intel();
1020 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1021 		rdt_init_res_defs_amd();
1022 }
1023 
/* Hotplug state handle from cpuhp_setup_state(), kept for teardown. */
static enum cpuhp_state rdt_online;

/* Runs once on the BSP during boot. */
void resctrl_cpu_detect(struct cpuinfo_x86 *c)
{
	/* No monitoring support at all: poison the cached values. */
	if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
		c->x86_cache_max_rmid  = -1;
		c->x86_cache_occ_scale = -1;
		c->x86_cache_mbm_width_offset = -1;
		return;
	}

	/* will be overridden if occupancy monitoring exists */
	c->x86_cache_max_rmid = cpuid_ebx(0xf);

	if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
	    cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
	    cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
	    cpu_has(c, X86_FEATURE_ABMC)) {
		u32 eax, ebx, ecx, edx;

		/* QoS sub-leaf, EAX=0Fh, ECX=1 */
		cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);

		c->x86_cache_max_rmid  = ecx;
		c->x86_cache_occ_scale = ebx;
		c->x86_cache_mbm_width_offset = eax & 0xff;

		/* AMD CPUs reporting 0 here use a fixed counter-width offset. */
		if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
			c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
	}
}
1056 
/*
 * resctrl_arch_late_init() - Boot-time initialisation: detect resources,
 * apply quirks, register CPU hotplug callbacks and the resctrl
 * filesystem.
 *
 * Return: 0 on success, -ENODEV if no RDT resource exists, or a negative
 * errno from cpuhp/filesystem registration.
 */
static int __init resctrl_arch_late_init(void)
{
	struct rdt_resource *r;
	int state, ret, i;

	/* for_each_rdt_resource() requires all rid to be initialised. */
	for (i = 0; i < RDT_NUM_RESOURCES; i++)
		rdt_resources_all[i].r_resctrl.rid = i;

	/*
	 * Initialize functions(or definitions) that are different
	 * between vendors here.
	 */
	rdt_init_res_defs();

	check_quirks();

	if (!get_rdt_resources())
		return -ENODEV;

	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
				  "x86/resctrl/cat:online:",
				  resctrl_arch_online_cpu,
				  resctrl_arch_offline_cpu);
	if (state < 0)
		return state;

	ret = resctrl_init();
	if (ret) {
		/* Roll back the hotplug registration on filesystem failure. */
		cpuhp_remove_state(state);
		return ret;
	}
	rdt_online = state;

	for_each_alloc_capable_rdt_resource(r)
		pr_info("%s allocation detected\n", r->name);

	for_each_mon_capable_rdt_resource(r)
		pr_info("%s monitoring detected\n", r->name);

	return 0;
}

late_initcall(resctrl_arch_late_init);
1101 
/*
 * Teardown path: release Intel telemetry events, remove the CPU hotplug
 * callbacks and unregister the resctrl filesystem.
 */
static void __exit resctrl_arch_exit(void)
{
	intel_aet_exit();

	cpuhp_remove_state(rdt_online);

	resctrl_exit();
}

__exitcall(resctrl_arch_exit);
1112