xref: /linux/arch/x86/kernel/cpu/resctrl/core.c (revision f4e0cd80d3e7c31327459008b01d63804838a89d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Resource Director Technology(RDT)
4  * - Cache Allocation code.
5  *
6  * Copyright (C) 2016 Intel Corporation
7  *
8  * Authors:
9  *    Fenghua Yu <fenghua.yu@intel.com>
10  *    Tony Luck <tony.luck@intel.com>
11  *    Vikas Shivappa <vikas.shivappa@intel.com>
12  *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
15  */
16 
17 #define pr_fmt(fmt)	"resctrl: " fmt
18 
19 #include <linux/cpu.h>
20 #include <linux/slab.h>
21 #include <linux/err.h>
22 #include <linux/cpuhotplug.h>
23 
24 #include <asm/cpu_device_id.h>
25 #include <asm/msr.h>
26 #include <asm/resctrl.h>
27 #include "internal.h"
28 
29 /*
30  * rdt_domain structures are kfree()d when their last CPU goes offline,
31  * and allocated when the first CPU in a new domain comes online.
32  * The rdt_resource's domain list is updated when this happens. Readers of
33  * the domain list must either take cpus_read_lock(), or rely on an RCU
34  * read-side critical section, to avoid observing concurrent modification.
35  * All writers take this mutex:
36  */
37 static DEFINE_MUTEX(domain_list_lock);
38 
39 /*
40  * The cached resctrl_pqr_state is strictly per CPU and can never be
41  * updated from a remote CPU. Functions which modify the state
42  * are called with interrupts disabled and no preemption, which
43  * is sufficient for the protection.
44  */
45 DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
46 
47 /*
48  * Global boolean for rdt_alloc which is true if any
49  * resource allocation is enabled.
50  */
51 bool rdt_alloc_capable;
52 
53 static void mba_wrmsr_intel(struct msr_param *m);
54 static void cat_wrmsr(struct msr_param *m);
55 static void mba_wrmsr_amd(struct msr_param *m);
56 
57 #define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
58 #define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
59 
60 struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
61 	[RDT_RESOURCE_L3] =
62 	{
63 		.r_resctrl = {
64 			.name			= "L3",
65 			.ctrl_scope		= RESCTRL_L3_CACHE,
66 			.mon_scope		= RESCTRL_L3_CACHE,
67 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L3),
68 			.mon_domains		= mon_domain_init(RDT_RESOURCE_L3),
69 			.schema_fmt		= RESCTRL_SCHEMA_BITMAP,
70 		},
71 		.msr_base		= MSR_IA32_L3_CBM_BASE,
72 		.msr_update		= cat_wrmsr,
73 	},
74 	[RDT_RESOURCE_L2] =
75 	{
76 		.r_resctrl = {
77 			.name			= "L2",
78 			.ctrl_scope		= RESCTRL_L2_CACHE,
79 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L2),
80 			.schema_fmt		= RESCTRL_SCHEMA_BITMAP,
81 		},
82 		.msr_base		= MSR_IA32_L2_CBM_BASE,
83 		.msr_update		= cat_wrmsr,
84 	},
85 	[RDT_RESOURCE_MBA] =
86 	{
87 		.r_resctrl = {
88 			.name			= "MB",
89 			.ctrl_scope		= RESCTRL_L3_CACHE,
90 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_MBA),
91 			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
92 		},
93 	},
94 	[RDT_RESOURCE_SMBA] =
95 	{
96 		.r_resctrl = {
97 			.name			= "SMBA",
98 			.ctrl_scope		= RESCTRL_L3_CACHE,
99 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_SMBA),
100 			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
101 		},
102 	},
103 	[RDT_RESOURCE_PERF_PKG] =
104 	{
105 		.r_resctrl = {
106 			.name			= "PERF_PKG",
107 			.mon_scope		= RESCTRL_PACKAGE,
108 			.mon_domains		= mon_domain_init(RDT_RESOURCE_PERF_PKG),
109 		},
110 	},
111 };
112 
113 u32 resctrl_arch_system_num_rmid_idx(void)
114 {
115 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
116 
117 	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
118 	return r->mon.num_rmid;
119 }
120 
121 struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
122 {
123 	if (l >= RDT_NUM_RESOURCES)
124 		return NULL;
125 
126 	return &rdt_resources_all[l].r_resctrl;
127 }
128 
129 /*
130  * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
131  * as they do not have CPUID enumeration support for Cache allocation.
132  * The check for Vendor/Family/Model is not enough to guarantee that
133  * the MSRs won't #GP fault because only the following SKUs support
134  * CAT:
135  *	Intel(R) Xeon(R)  CPU E5-2658  v3  @  2.20GHz
136  *	Intel(R) Xeon(R)  CPU E5-2648L v3  @  1.80GHz
137  *	Intel(R) Xeon(R)  CPU E5-2628L v3  @  2.00GHz
138  *	Intel(R) Xeon(R)  CPU E5-2618L v3  @  2.30GHz
139  *	Intel(R) Xeon(R)  CPU E5-2608L v3  @  2.00GHz
140  *	Intel(R) Xeon(R)  CPU E5-2658A v3  @  2.20GHz
141  *
142  * Probe by trying to write the first of the L3 cache mask registers
143  * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
144  * is always 20 on hsw server parts. The minimum cache bitmask length
145  * allowed for HSW server is always 2 bits. Hardcode all of them.
146  */
147 static inline void cache_alloc_hsw_probe(void)
148 {
149 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
150 	struct rdt_resource *r  = &hw_res->r_resctrl;
151 	u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
152 
153 	if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
154 		return;
155 
156 	rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
157 
158 	/* If all the bits were set in MSR, return success */
159 	if (l3_cbm_0 != max_cbm)
160 		return;
161 
162 	hw_res->num_closid = 4;
163 	r->cache.cbm_len = 20;
164 	r->cache.shareable_bits = 0xc0000;
165 	r->cache.min_cbm_bits = 2;
166 	r->cache.arch_has_sparse_bitmasks = false;
167 	r->alloc_capable = true;
168 
169 	rdt_alloc_capable = true;
170 }
171 
172 /*
173  * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
174  * exposed to user interface and the h/w understandable delay values.
175  *
176  * The non-linear delay values have the granularity of power of two
177  * and also the h/w does not guarantee a curve for configured delay
178  * values vs. actual b/w enforced.
179  * Hence we need a mapping that is pre calibrated so the user can
180  * express the memory b/w as a percentage value.
181  */
182 static inline bool rdt_get_mb_table(struct rdt_resource *r)
183 {
184 	/*
185 	 * There are no Intel SKUs as of now to support non-linear delay.
186 	 */
187 	pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
188 		boot_cpu_data.x86, boot_cpu_data.x86_model);
189 
190 	return false;
191 }
192 
193 static __init bool __get_mem_config_intel(struct rdt_resource *r)
194 {
195 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
196 	union cpuid_0x10_3_eax eax;
197 	union cpuid_0x10_x_edx edx;
198 	u32 ebx, ecx, max_delay;
199 
200 	cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
201 	hw_res->num_closid = edx.split.cos_max + 1;
202 	max_delay = eax.split.max_delay + 1;
203 	r->membw.max_bw = MAX_MBA_BW;
204 	r->membw.arch_needs_linear = true;
205 	if (ecx & MBA_IS_LINEAR) {
206 		r->membw.delay_linear = true;
207 		r->membw.min_bw = MAX_MBA_BW - max_delay;
208 		r->membw.bw_gran = MAX_MBA_BW - max_delay;
209 	} else {
210 		if (!rdt_get_mb_table(r))
211 			return false;
212 		r->membw.arch_needs_linear = false;
213 	}
214 
215 	if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
216 		r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
217 	else
218 		r->membw.throttle_mode = THREAD_THROTTLE_MAX;
219 
220 	r->alloc_capable = true;
221 
222 	return true;
223 }
224 
225 static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
226 {
227 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
228 	u32 eax, ebx, ecx, edx, subleaf;
229 
230 	/*
231 	 * Query CPUID_Fn80000020_EDX_x01 for MBA and
232 	 * CPUID_Fn80000020_EDX_x02 for SMBA
233 	 */
234 	subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 :  1;
235 
236 	cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
237 	hw_res->num_closid = edx + 1;
238 	r->membw.max_bw = 1 << eax;
239 
240 	/* AMD does not use delay */
241 	r->membw.delay_linear = false;
242 	r->membw.arch_needs_linear = false;
243 
244 	/*
245 	 * AMD does not use memory delay throttle model to control
246 	 * the allocation like Intel does.
247 	 */
248 	r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
249 	r->membw.min_bw = 0;
250 	r->membw.bw_gran = 1;
251 
252 	r->alloc_capable = true;
253 
254 	return true;
255 }
256 
257 static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
258 {
259 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
260 	union cpuid_0x10_1_eax eax;
261 	union cpuid_0x10_x_ecx ecx;
262 	union cpuid_0x10_x_edx edx;
263 	u32 ebx, default_ctrl;
264 
265 	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
266 	hw_res->num_closid = edx.split.cos_max + 1;
267 	r->cache.cbm_len = eax.split.cbm_len + 1;
268 	default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
269 	r->cache.shareable_bits = ebx & default_ctrl;
270 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
271 		r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
272 	r->alloc_capable = true;
273 }
274 
275 static void rdt_get_cdp_config(int level)
276 {
277 	/*
278 	 * By default, CDP is disabled. CDP can be enabled by mount parameter
279 	 * "cdp" during resctrl file system mount time.
280 	 */
281 	rdt_resources_all[level].cdp_enabled = false;
282 	rdt_resources_all[level].r_resctrl.cdp_capable = true;
283 }
284 
285 static void rdt_set_io_alloc_capable(struct rdt_resource *r)
286 {
287 	r->cache.io_alloc_capable = true;
288 }
289 
290 static void rdt_get_cdp_l3_config(void)
291 {
292 	rdt_get_cdp_config(RDT_RESOURCE_L3);
293 }
294 
295 static void rdt_get_cdp_l2_config(void)
296 {
297 	rdt_get_cdp_config(RDT_RESOURCE_L2);
298 }
299 
300 static void mba_wrmsr_amd(struct msr_param *m)
301 {
302 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
303 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
304 	unsigned int i;
305 
306 	for (i = m->low; i < m->high; i++)
307 		wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
308 }
309 
310 /*
311  * Map the memory b/w percentage value to delay values
312  * that can be written to QOS_MSRs.
313  * There are currently no SKUs which support non linear delay values.
314  */
315 static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
316 {
317 	if (r->membw.delay_linear)
318 		return MAX_MBA_BW - bw;
319 
320 	pr_warn_once("Non Linear delay-bw map not supported but queried\n");
321 	return MAX_MBA_BW;
322 }
323 
324 static void mba_wrmsr_intel(struct msr_param *m)
325 {
326 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
327 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
328 	unsigned int i;
329 
330 	/*  Write the delay values for mba. */
331 	for (i = m->low; i < m->high; i++)
332 		wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
333 }
334 
335 static void cat_wrmsr(struct msr_param *m)
336 {
337 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
338 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
339 	unsigned int i;
340 
341 	for (i = m->low; i < m->high; i++)
342 		wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
343 }
344 
345 u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
346 {
347 	return resctrl_to_arch_res(r)->num_closid;
348 }
349 
350 void rdt_ctrl_update(void *arg)
351 {
352 	struct rdt_hw_resource *hw_res;
353 	struct msr_param *m = arg;
354 
355 	hw_res = resctrl_to_arch_res(m->res);
356 	hw_res->msr_update(m);
357 }
358 
359 static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
360 {
361 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
362 	int i;
363 
364 	/*
365 	 * Initialize the Control MSRs to having no control.
366 	 * For Cache Allocation: Set all bits in cbm
367 	 * For Memory Allocation: Set b/w requested to 100%
368 	 */
369 	for (i = 0; i < hw_res->num_closid; i++, dc++)
370 		*dc = resctrl_get_default_ctrl(r);
371 }
372 
373 static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
374 {
375 	kfree(hw_dom->ctrl_val);
376 	kfree(hw_dom);
377 }
378 
379 static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom)
380 {
381 	int idx;
382 
383 	for_each_mbm_idx(idx)
384 		kfree(hw_dom->arch_mbm_states[idx]);
385 	kfree(hw_dom);
386 }
387 
388 static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
389 {
390 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
391 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
392 	struct msr_param m;
393 	u32 *dc;
394 
395 	dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
396 			   GFP_KERNEL);
397 	if (!dc)
398 		return -ENOMEM;
399 
400 	hw_dom->ctrl_val = dc;
401 	setup_default_ctrlval(r, dc);
402 
403 	m.res = r;
404 	m.dom = d;
405 	m.low = 0;
406 	m.high = hw_res->num_closid;
407 	hw_res->msr_update(&m);
408 	return 0;
409 }
410 
411 /**
412  * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
413  * @num_rmid:	The size of the MBM counter array
414  * @hw_dom:	The domain that owns the allocated arrays
415  *
416  * Return:	0 for success, or -ENOMEM.
417  */
418 static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom)
419 {
420 	size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
421 	enum resctrl_event_id eventid;
422 	int idx;
423 
424 	for_each_mbm_event_id(eventid) {
425 		if (!resctrl_is_mon_event_enabled(eventid))
426 			continue;
427 		idx = MBM_STATE_IDX(eventid);
428 		hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
429 		if (!hw_dom->arch_mbm_states[idx])
430 			goto cleanup;
431 	}
432 
433 	return 0;
434 cleanup:
435 	for_each_mbm_idx(idx) {
436 		kfree(hw_dom->arch_mbm_states[idx]);
437 		hw_dom->arch_mbm_states[idx] = NULL;
438 	}
439 
440 	return -ENOMEM;
441 }
442 
443 static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
444 {
445 	switch (scope) {
446 	case RESCTRL_L2_CACHE:
447 	case RESCTRL_L3_CACHE:
448 		return get_cpu_cacheinfo_id(cpu, scope);
449 	case RESCTRL_L3_NODE:
450 		return cpu_to_node(cpu);
451 	case RESCTRL_PACKAGE:
452 		return topology_physical_package_id(cpu);
453 	default:
454 		break;
455 	}
456 
457 	return -EINVAL;
458 }
459 
460 static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
461 {
462 	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
463 	struct rdt_hw_ctrl_domain *hw_dom;
464 	struct list_head *add_pos = NULL;
465 	struct rdt_domain_hdr *hdr;
466 	struct rdt_ctrl_domain *d;
467 	int err;
468 
469 	lockdep_assert_held(&domain_list_lock);
470 
471 	if (id < 0) {
472 		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
473 			     cpu, r->ctrl_scope, r->name);
474 		return;
475 	}
476 
477 	hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
478 	if (hdr) {
479 		if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
480 			return;
481 		d = container_of(hdr, struct rdt_ctrl_domain, hdr);
482 
483 		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
484 		if (r->cache.arch_has_per_cpu_cfg)
485 			rdt_domain_reconfigure_cdp(r);
486 		return;
487 	}
488 
489 	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
490 	if (!hw_dom)
491 		return;
492 
493 	d = &hw_dom->d_resctrl;
494 	d->hdr.id = id;
495 	d->hdr.type = RESCTRL_CTRL_DOMAIN;
496 	d->hdr.rid = r->rid;
497 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
498 
499 	rdt_domain_reconfigure_cdp(r);
500 
501 	if (domain_setup_ctrlval(r, d)) {
502 		ctrl_domain_free(hw_dom);
503 		return;
504 	}
505 
506 	list_add_tail_rcu(&d->hdr.list, add_pos);
507 
508 	err = resctrl_online_ctrl_domain(r, d);
509 	if (err) {
510 		list_del_rcu(&d->hdr.list);
511 		synchronize_rcu();
512 		ctrl_domain_free(hw_dom);
513 	}
514 }
515 
516 static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos)
517 {
518 	struct rdt_hw_l3_mon_domain *hw_dom;
519 	struct rdt_l3_mon_domain *d;
520 	struct cacheinfo *ci;
521 	int err;
522 
523 	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
524 	if (!hw_dom)
525 		return;
526 
527 	d = &hw_dom->d_resctrl;
528 	d->hdr.id = id;
529 	d->hdr.type = RESCTRL_MON_DOMAIN;
530 	d->hdr.rid = RDT_RESOURCE_L3;
531 	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
532 	if (!ci) {
533 		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
534 		l3_mon_domain_free(hw_dom);
535 		return;
536 	}
537 	d->ci_id = ci->id;
538 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
539 
540 	arch_mon_domain_online(r, d);
541 
542 	if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
543 		l3_mon_domain_free(hw_dom);
544 		return;
545 	}
546 
547 	list_add_tail_rcu(&d->hdr.list, add_pos);
548 
549 	err = resctrl_online_mon_domain(r, &d->hdr);
550 	if (err) {
551 		list_del_rcu(&d->hdr.list);
552 		synchronize_rcu();
553 		l3_mon_domain_free(hw_dom);
554 	}
555 }
556 
557 static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
558 {
559 	int id = get_domain_id_from_scope(cpu, r->mon_scope);
560 	struct list_head *add_pos = NULL;
561 	struct rdt_domain_hdr *hdr;
562 
563 	lockdep_assert_held(&domain_list_lock);
564 
565 	if (id < 0) {
566 		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
567 			     cpu, r->mon_scope, r->name);
568 		return;
569 	}
570 
571 	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
572 	if (hdr)
573 		cpumask_set_cpu(cpu, &hdr->cpu_mask);
574 
575 	switch (r->rid) {
576 	case RDT_RESOURCE_L3:
577 		/* Update the mbm_assign_mode state for the CPU if supported */
578 		if (r->mon.mbm_cntr_assignable)
579 			resctrl_arch_mbm_cntr_assign_set_one(r);
580 		if (!hdr)
581 			l3_mon_domain_setup(cpu, id, r, add_pos);
582 		break;
583 	case RDT_RESOURCE_PERF_PKG:
584 		if (!hdr)
585 			intel_aet_mon_domain_setup(cpu, id, r, add_pos);
586 		break;
587 	default:
588 		pr_warn_once("Unknown resource rid=%d\n", r->rid);
589 		break;
590 	}
591 }
592 
593 static void domain_add_cpu(int cpu, struct rdt_resource *r)
594 {
595 	if (r->alloc_capable)
596 		domain_add_cpu_ctrl(cpu, r);
597 	if (r->mon_capable)
598 		domain_add_cpu_mon(cpu, r);
599 }
600 
601 static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
602 {
603 	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
604 	struct rdt_hw_ctrl_domain *hw_dom;
605 	struct rdt_domain_hdr *hdr;
606 	struct rdt_ctrl_domain *d;
607 
608 	lockdep_assert_held(&domain_list_lock);
609 
610 	if (id < 0) {
611 		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
612 			     cpu, r->ctrl_scope, r->name);
613 		return;
614 	}
615 
616 	hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
617 	if (!hdr) {
618 		pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
619 			id, cpu, r->name);
620 		return;
621 	}
622 
623 	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
624 	if (!cpumask_empty(&hdr->cpu_mask))
625 		return;
626 
627 	if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
628 		return;
629 
630 	d = container_of(hdr, struct rdt_ctrl_domain, hdr);
631 	hw_dom = resctrl_to_arch_ctrl_dom(d);
632 
633 	resctrl_offline_ctrl_domain(r, d);
634 	list_del_rcu(&hdr->list);
635 	synchronize_rcu();
636 
637 	/*
638 	 * rdt_ctrl_domain "d" is going to be freed below, so clear
639 	 * its pointer from pseudo_lock_region struct.
640 	 */
641 	if (d->plr)
642 		d->plr->d = NULL;
643 	ctrl_domain_free(hw_dom);
644 }
645 
646 static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
647 {
648 	int id = get_domain_id_from_scope(cpu, r->mon_scope);
649 	struct rdt_domain_hdr *hdr;
650 
651 	lockdep_assert_held(&domain_list_lock);
652 
653 	if (id < 0) {
654 		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
655 			     cpu, r->mon_scope, r->name);
656 		return;
657 	}
658 
659 	hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
660 	if (!hdr) {
661 		pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
662 			id, cpu, r->name);
663 		return;
664 	}
665 
666 	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
667 	if (!cpumask_empty(&hdr->cpu_mask))
668 		return;
669 
670 	switch (r->rid) {
671 	case RDT_RESOURCE_L3: {
672 		struct rdt_hw_l3_mon_domain *hw_dom;
673 		struct rdt_l3_mon_domain *d;
674 
675 		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
676 			return;
677 
678 		d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
679 		hw_dom = resctrl_to_arch_mon_dom(d);
680 		resctrl_offline_mon_domain(r, hdr);
681 		list_del_rcu(&hdr->list);
682 		synchronize_rcu();
683 		l3_mon_domain_free(hw_dom);
684 		break;
685 	}
686 	case RDT_RESOURCE_PERF_PKG: {
687 		struct rdt_perf_pkg_mon_domain *pkgd;
688 
689 		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG))
690 			return;
691 
692 		pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr);
693 		resctrl_offline_mon_domain(r, hdr);
694 		list_del_rcu(&hdr->list);
695 		synchronize_rcu();
696 		kfree(pkgd);
697 		break;
698 	}
699 	default:
700 		pr_warn_once("Unknown resource rid=%d\n", r->rid);
701 		break;
702 	}
703 }
704 
705 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
706 {
707 	if (r->alloc_capable)
708 		domain_remove_cpu_ctrl(cpu, r);
709 	if (r->mon_capable)
710 		domain_remove_cpu_mon(cpu, r);
711 }
712 
713 static void clear_closid_rmid(int cpu)
714 {
715 	struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
716 
717 	state->default_closid = RESCTRL_RESERVED_CLOSID;
718 	state->default_rmid = RESCTRL_RESERVED_RMID;
719 	state->cur_closid = RESCTRL_RESERVED_CLOSID;
720 	state->cur_rmid = RESCTRL_RESERVED_RMID;
721 	wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
722 	      RESCTRL_RESERVED_CLOSID);
723 }
724 
725 static int resctrl_arch_online_cpu(unsigned int cpu)
726 {
727 	struct rdt_resource *r;
728 
729 	mutex_lock(&domain_list_lock);
730 	for_each_capable_rdt_resource(r)
731 		domain_add_cpu(cpu, r);
732 	mutex_unlock(&domain_list_lock);
733 
734 	clear_closid_rmid(cpu);
735 	resctrl_online_cpu(cpu);
736 
737 	return 0;
738 }
739 
740 static int resctrl_arch_offline_cpu(unsigned int cpu)
741 {
742 	struct rdt_resource *r;
743 
744 	resctrl_offline_cpu(cpu);
745 
746 	mutex_lock(&domain_list_lock);
747 	for_each_capable_rdt_resource(r)
748 		domain_remove_cpu(cpu, r);
749 	mutex_unlock(&domain_list_lock);
750 
751 	clear_closid_rmid(cpu);
752 
753 	return 0;
754 }
755 
/*
 * Arch hook run before resctrl is mounted. It calls
 * intel_aet_get_events(); nothing further is done here whatever the
 * result is.
 */
void resctrl_arch_pre_mount(void)
{
	if (!intel_aet_get_events())
		return;
}
761 
/* Indices into rdt_options[], one per feature selectable with "rdt=". */
enum {
	/* Monitoring features */
	RDT_FLAG_CMT,
	RDT_FLAG_MBM_TOTAL,
	RDT_FLAG_MBM_LOCAL,
	/* Cache allocation features */
	RDT_FLAG_L3_CAT,
	RDT_FLAG_L3_CDP,
	RDT_FLAG_L2_CAT,
	RDT_FLAG_L2_CDP,
	/* Memory bandwidth allocation features */
	RDT_FLAG_MBA,
	RDT_FLAG_SMBA,
	/* Remaining features */
	RDT_FLAG_BMEC,
	RDT_FLAG_ABMC,
	RDT_FLAG_SDCIAE,
};
776 
/* Initialise rdt_options[@idx] with option name @n and CPU feature flag @f. */
#define RDT_OPT(idx, n, f)	[idx] = { .name = n, .flag = f }
782 
783 struct rdt_options {
784 	char	*name;
785 	int	flag;
786 	bool	force_off, force_on;
787 };
788 
789 static struct rdt_options rdt_options[]  __ro_after_init = {
790 	RDT_OPT(RDT_FLAG_CMT,	    "cmt",	X86_FEATURE_CQM_OCCUP_LLC),
791 	RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
792 	RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
793 	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),
794 	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),
795 	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2),
796 	RDT_OPT(RDT_FLAG_L2_CDP,    "l2cdp",	X86_FEATURE_CDP_L2),
797 	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),
798 	RDT_OPT(RDT_FLAG_SMBA,	    "smba",	X86_FEATURE_SMBA),
799 	RDT_OPT(RDT_FLAG_BMEC,	    "bmec",	X86_FEATURE_BMEC),
800 	RDT_OPT(RDT_FLAG_ABMC,	    "abmc",	X86_FEATURE_ABMC),
801 	RDT_OPT(RDT_FLAG_SDCIAE,    "sdciae",	X86_FEATURE_SDCIAE),
802 };
803 #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
804 
805 static int __init set_rdt_options(char *str)
806 {
807 	struct rdt_options *o;
808 	bool force_off;
809 	char *tok;
810 
811 	if (*str == '=')
812 		str++;
813 	while ((tok = strsep(&str, ",")) != NULL) {
814 		force_off = *tok == '!';
815 		if (force_off)
816 			tok++;
817 		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
818 			if (strcmp(tok, o->name) == 0) {
819 				if (force_off)
820 					o->force_off = true;
821 				else
822 					o->force_on = true;
823 				break;
824 			}
825 		}
826 	}
827 	return 1;
828 }
829 __setup("rdt", set_rdt_options);
830 
831 bool rdt_cpu_has(int flag)
832 {
833 	bool ret = boot_cpu_has(flag);
834 	struct rdt_options *o;
835 
836 	if (!ret)
837 		return ret;
838 
839 	for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
840 		if (flag == o->flag) {
841 			if (o->force_off)
842 				ret = false;
843 			if (o->force_on)
844 				ret = true;
845 			break;
846 		}
847 	}
848 	return ret;
849 }
850 
851 bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
852 {
853 	if (!rdt_cpu_has(X86_FEATURE_BMEC))
854 		return false;
855 
856 	switch (evt) {
857 	case QOS_L3_MBM_TOTAL_EVENT_ID:
858 		return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
859 	case QOS_L3_MBM_LOCAL_EVENT_ID:
860 		return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
861 	default:
862 		return false;
863 	}
864 }
865 
866 static __init bool get_mem_config(void)
867 {
868 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
869 
870 	if (!rdt_cpu_has(X86_FEATURE_MBA))
871 		return false;
872 
873 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
874 		return __get_mem_config_intel(&hw_res->r_resctrl);
875 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
876 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
877 
878 	return false;
879 }
880 
881 static __init bool get_slow_mem_config(void)
882 {
883 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
884 
885 	if (!rdt_cpu_has(X86_FEATURE_SMBA))
886 		return false;
887 
888 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
889 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
890 
891 	return false;
892 }
893 
894 static __init bool get_rdt_alloc_resources(void)
895 {
896 	struct rdt_resource *r;
897 	bool ret = false;
898 
899 	if (rdt_alloc_capable)
900 		return true;
901 
902 	if (!boot_cpu_has(X86_FEATURE_RDT_A))
903 		return false;
904 
905 	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
906 		r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
907 		rdt_get_cache_alloc_cfg(1, r);
908 		if (rdt_cpu_has(X86_FEATURE_CDP_L3))
909 			rdt_get_cdp_l3_config();
910 		if (rdt_cpu_has(X86_FEATURE_SDCIAE))
911 			rdt_set_io_alloc_capable(r);
912 		ret = true;
913 	}
914 	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
915 		/* CPUID 0x10.2 fields are same format at 0x10.1 */
916 		r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
917 		rdt_get_cache_alloc_cfg(2, r);
918 		if (rdt_cpu_has(X86_FEATURE_CDP_L2))
919 			rdt_get_cdp_l2_config();
920 		ret = true;
921 	}
922 
923 	if (get_mem_config())
924 		ret = true;
925 
926 	if (get_slow_mem_config())
927 		ret = true;
928 
929 	return ret;
930 }
931 
932 static __init bool get_rdt_mon_resources(void)
933 {
934 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
935 	bool ret = false;
936 
937 	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
938 		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL);
939 		ret = true;
940 	}
941 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
942 		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL);
943 		ret = true;
944 	}
945 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
946 		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL);
947 		ret = true;
948 	}
949 	if (rdt_cpu_has(X86_FEATURE_ABMC))
950 		ret = true;
951 
952 	if (!ret)
953 		return false;
954 
955 	return !rdt_get_l3_mon_config(r);
956 }
957 
958 static __init void __check_quirks_intel(void)
959 {
960 	switch (boot_cpu_data.x86_vfm) {
961 	case INTEL_HASWELL_X:
962 		if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
963 			cache_alloc_hsw_probe();
964 		break;
965 	case INTEL_SKYLAKE_X:
966 		if (boot_cpu_data.x86_stepping <= 4)
967 			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
968 		else
969 			set_rdt_options("!l3cat");
970 		fallthrough;
971 	case INTEL_BROADWELL_X:
972 		intel_rdt_mbm_apply_quirk();
973 		break;
974 	}
975 }
976 
977 static __init void check_quirks(void)
978 {
979 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
980 		__check_quirks_intel();
981 }
982 
983 static __init bool get_rdt_resources(void)
984 {
985 	rdt_alloc_capable = get_rdt_alloc_resources();
986 	rdt_mon_capable = get_rdt_mon_resources();
987 
988 	return (rdt_mon_capable || rdt_alloc_capable);
989 }
990 
991 static __init void rdt_init_res_defs_intel(void)
992 {
993 	struct rdt_hw_resource *hw_res;
994 	struct rdt_resource *r;
995 
996 	for_each_rdt_resource(r) {
997 		hw_res = resctrl_to_arch_res(r);
998 
999 		if (r->rid == RDT_RESOURCE_L3 ||
1000 		    r->rid == RDT_RESOURCE_L2) {
1001 			r->cache.arch_has_per_cpu_cfg = false;
1002 			r->cache.min_cbm_bits = 1;
1003 		} else if (r->rid == RDT_RESOURCE_MBA) {
1004 			hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
1005 			hw_res->msr_update = mba_wrmsr_intel;
1006 		}
1007 	}
1008 }
1009 
1010 static __init void rdt_init_res_defs_amd(void)
1011 {
1012 	struct rdt_hw_resource *hw_res;
1013 	struct rdt_resource *r;
1014 
1015 	for_each_rdt_resource(r) {
1016 		hw_res = resctrl_to_arch_res(r);
1017 
1018 		if (r->rid == RDT_RESOURCE_L3 ||
1019 		    r->rid == RDT_RESOURCE_L2) {
1020 			r->cache.arch_has_sparse_bitmasks = true;
1021 			r->cache.arch_has_per_cpu_cfg = true;
1022 			r->cache.min_cbm_bits = 0;
1023 		} else if (r->rid == RDT_RESOURCE_MBA) {
1024 			hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
1025 			hw_res->msr_update = mba_wrmsr_amd;
1026 		} else if (r->rid == RDT_RESOURCE_SMBA) {
1027 			hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
1028 			hw_res->msr_update = mba_wrmsr_amd;
1029 		}
1030 	}
1031 }
1032 
1033 static __init void rdt_init_res_defs(void)
1034 {
1035 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1036 		rdt_init_res_defs_intel();
1037 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1038 		rdt_init_res_defs_amd();
1039 }
1040 
1041 static enum cpuhp_state rdt_online;
1042 
1043 /* Runs once on the BSP during boot. */
1044 void resctrl_cpu_detect(struct cpuinfo_x86 *c)
1045 {
1046 	if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
1047 		c->x86_cache_max_rmid  = -1;
1048 		c->x86_cache_occ_scale = -1;
1049 		c->x86_cache_mbm_width_offset = -1;
1050 		return;
1051 	}
1052 
1053 	/* will be overridden if occupancy monitoring exists */
1054 	c->x86_cache_max_rmid = cpuid_ebx(0xf);
1055 
1056 	if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
1057 	    cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
1058 	    cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
1059 	    cpu_has(c, X86_FEATURE_ABMC)) {
1060 		u32 eax, ebx, ecx, edx;
1061 
1062 		/* QoS sub-leaf, EAX=0Fh, ECX=1 */
1063 		cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
1064 
1065 		c->x86_cache_max_rmid  = ecx;
1066 		c->x86_cache_occ_scale = ebx;
1067 		c->x86_cache_mbm_width_offset = eax & 0xff;
1068 
1069 		if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
1070 			c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
1071 	}
1072 }
1073 
1074 static int __init resctrl_arch_late_init(void)
1075 {
1076 	struct rdt_resource *r;
1077 	int state, ret, i;
1078 
1079 	/* for_each_rdt_resource() requires all rid to be initialised. */
1080 	for (i = 0; i < RDT_NUM_RESOURCES; i++)
1081 		rdt_resources_all[i].r_resctrl.rid = i;
1082 
1083 	/*
1084 	 * Initialize functions(or definitions) that are different
1085 	 * between vendors here.
1086 	 */
1087 	rdt_init_res_defs();
1088 
1089 	check_quirks();
1090 
1091 	if (!get_rdt_resources())
1092 		return -ENODEV;
1093 
1094 	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
1095 				  "x86/resctrl/cat:online:",
1096 				  resctrl_arch_online_cpu,
1097 				  resctrl_arch_offline_cpu);
1098 	if (state < 0)
1099 		return state;
1100 
1101 	ret = resctrl_init();
1102 	if (ret) {
1103 		cpuhp_remove_state(state);
1104 		return ret;
1105 	}
1106 	rdt_online = state;
1107 
1108 	for_each_alloc_capable_rdt_resource(r)
1109 		pr_info("%s allocation detected\n", r->name);
1110 
1111 	for_each_mon_capable_rdt_resource(r)
1112 		pr_info("%s monitoring detected\n", r->name);
1113 
1114 	return 0;
1115 }
1116 
1117 late_initcall(resctrl_arch_late_init);
1118 
1119 static void __exit resctrl_arch_exit(void)
1120 {
1121 	intel_aet_exit();
1122 
1123 	cpuhp_remove_state(rdt_online);
1124 
1125 	resctrl_exit();
1126 }
1127 
1128 __exitcall(resctrl_arch_exit);
1129