xref: /linux/arch/x86/kernel/cpu/resctrl/core.c (revision 55d0969c451159cff86949b38c39171cab962069)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Resource Director Technology(RDT)
4  * - Cache Allocation code.
5  *
6  * Copyright (C) 2016 Intel Corporation
7  *
8  * Authors:
9  *    Fenghua Yu <fenghua.yu@intel.com>
10  *    Tony Luck <tony.luck@intel.com>
11  *    Vikas Shivappa <vikas.shivappa@intel.com>
12  *
 * More information about RDT can be found in the Intel (R) x86 Architecture
14  * Software Developer Manual June 2016, volume 3, section 17.17.
15  */
16 
17 #define pr_fmt(fmt)	"resctrl: " fmt
18 
19 #include <linux/cpu.h>
20 #include <linux/slab.h>
21 #include <linux/err.h>
22 #include <linux/cpuhotplug.h>
23 
24 #include <asm/cpu_device_id.h>
25 #include <asm/resctrl.h>
26 #include "internal.h"
27 
28 /*
29  * rdt_domain structures are kfree()d when their last CPU goes offline,
30  * and allocated when the first CPU in a new domain comes online.
31  * The rdt_resource's domain list is updated when this happens. Readers of
32  * the domain list must either take cpus_read_lock(), or rely on an RCU
33  * read-side critical section, to avoid observing concurrent modification.
34  * All writers take this mutex:
35  */
36 static DEFINE_MUTEX(domain_list_lock);
37 
38 /*
39  * The cached resctrl_pqr_state is strictly per CPU and can never be
40  * updated from a remote CPU. Functions which modify the state
41  * are called with interrupts disabled and no preemption, which
42  * is sufficient for the protection.
43  */
44 DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
45 
46 /*
47  * Used to store the max resource name width and max resource data width
48  * to display the schemata in a tabular format
49  */
50 int max_name_width, max_data_width;
51 
52 /*
53  * Global boolean for rdt_alloc which is true if any
54  * resource allocation is enabled.
55  */
56 bool rdt_alloc_capable;
57 
/* MSR writers, defined further down in this file. */
static void cat_wrmsr(struct msr_param *m);
static void mba_wrmsr_intel(struct msr_param *m);
static void mba_wrmsr_amd(struct msr_param *m);

/*
 * Static initialisers for the control and monitor domain list heads that
 * live inside rdt_resources_all[id].r_resctrl.
 */
#define ctrl_domain_init(id)	\
	LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
#define mon_domain_init(id)	\
	LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
64 
65 struct rdt_hw_resource rdt_resources_all[] = {
66 	[RDT_RESOURCE_L3] =
67 	{
68 		.r_resctrl = {
69 			.rid			= RDT_RESOURCE_L3,
70 			.name			= "L3",
71 			.ctrl_scope		= RESCTRL_L3_CACHE,
72 			.mon_scope		= RESCTRL_L3_CACHE,
73 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L3),
74 			.mon_domains		= mon_domain_init(RDT_RESOURCE_L3),
75 			.parse_ctrlval		= parse_cbm,
76 			.format_str		= "%d=%0*x",
77 			.fflags			= RFTYPE_RES_CACHE,
78 		},
79 		.msr_base		= MSR_IA32_L3_CBM_BASE,
80 		.msr_update		= cat_wrmsr,
81 	},
82 	[RDT_RESOURCE_L2] =
83 	{
84 		.r_resctrl = {
85 			.rid			= RDT_RESOURCE_L2,
86 			.name			= "L2",
87 			.ctrl_scope		= RESCTRL_L2_CACHE,
88 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_L2),
89 			.parse_ctrlval		= parse_cbm,
90 			.format_str		= "%d=%0*x",
91 			.fflags			= RFTYPE_RES_CACHE,
92 		},
93 		.msr_base		= MSR_IA32_L2_CBM_BASE,
94 		.msr_update		= cat_wrmsr,
95 	},
96 	[RDT_RESOURCE_MBA] =
97 	{
98 		.r_resctrl = {
99 			.rid			= RDT_RESOURCE_MBA,
100 			.name			= "MB",
101 			.ctrl_scope		= RESCTRL_L3_CACHE,
102 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_MBA),
103 			.parse_ctrlval		= parse_bw,
104 			.format_str		= "%d=%*u",
105 			.fflags			= RFTYPE_RES_MB,
106 		},
107 	},
108 	[RDT_RESOURCE_SMBA] =
109 	{
110 		.r_resctrl = {
111 			.rid			= RDT_RESOURCE_SMBA,
112 			.name			= "SMBA",
113 			.ctrl_scope		= RESCTRL_L3_CACHE,
114 			.ctrl_domains		= ctrl_domain_init(RDT_RESOURCE_SMBA),
115 			.parse_ctrlval		= parse_bw,
116 			.format_str		= "%d=%*u",
117 			.fflags			= RFTYPE_RES_MB,
118 		},
119 	},
120 };
121 
122 u32 resctrl_arch_system_num_rmid_idx(void)
123 {
124 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
125 
126 	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
127 	return r->num_rmid;
128 }
129 
130 /*
131  * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
132  * as they do not have CPUID enumeration support for Cache allocation.
133  * The check for Vendor/Family/Model is not enough to guarantee that
134  * the MSRs won't #GP fault because only the following SKUs support
135  * CAT:
136  *	Intel(R) Xeon(R)  CPU E5-2658  v3  @  2.20GHz
137  *	Intel(R) Xeon(R)  CPU E5-2648L v3  @  1.80GHz
138  *	Intel(R) Xeon(R)  CPU E5-2628L v3  @  2.00GHz
139  *	Intel(R) Xeon(R)  CPU E5-2618L v3  @  2.30GHz
140  *	Intel(R) Xeon(R)  CPU E5-2608L v3  @  2.00GHz
141  *	Intel(R) Xeon(R)  CPU E5-2658A v3  @  2.20GHz
142  *
143  * Probe by trying to write the first of the L3 cache mask registers
144  * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
145  * is always 20 on hsw server parts. The minimum cache bitmask length
146  * allowed for HSW server is always 2 bits. Hardcode all of them.
147  */
148 static inline void cache_alloc_hsw_probe(void)
149 {
150 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
151 	struct rdt_resource *r  = &hw_res->r_resctrl;
152 	u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
153 
154 	if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
155 		return;
156 
157 	rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
158 
159 	/* If all the bits were set in MSR, return success */
160 	if (l3_cbm_0 != max_cbm)
161 		return;
162 
163 	hw_res->num_closid = 4;
164 	r->default_ctrl = max_cbm;
165 	r->cache.cbm_len = 20;
166 	r->cache.shareable_bits = 0xc0000;
167 	r->cache.min_cbm_bits = 2;
168 	r->cache.arch_has_sparse_bitmasks = false;
169 	r->alloc_capable = true;
170 
171 	rdt_alloc_capable = true;
172 }
173 
174 bool is_mba_sc(struct rdt_resource *r)
175 {
176 	if (!r)
177 		return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
178 
179 	/*
180 	 * The software controller support is only applicable to MBA resource.
181 	 * Make sure to check for resource type.
182 	 */
183 	if (r->rid != RDT_RESOURCE_MBA)
184 		return false;
185 
186 	return r->membw.mba_sc;
187 }
188 
189 /*
190  * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
191  * exposed to user interface and the h/w understandable delay values.
192  *
193  * The non-linear delay values have the granularity of power of two
194  * and also the h/w does not guarantee a curve for configured delay
195  * values vs. actual b/w enforced.
196  * Hence we need a mapping that is pre calibrated so the user can
197  * express the memory b/w as a percentage value.
198  */
199 static inline bool rdt_get_mb_table(struct rdt_resource *r)
200 {
201 	/*
202 	 * There are no Intel SKUs as of now to support non-linear delay.
203 	 */
204 	pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
205 		boot_cpu_data.x86, boot_cpu_data.x86_model);
206 
207 	return false;
208 }
209 
210 static __init bool __get_mem_config_intel(struct rdt_resource *r)
211 {
212 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
213 	union cpuid_0x10_3_eax eax;
214 	union cpuid_0x10_x_edx edx;
215 	u32 ebx, ecx, max_delay;
216 
217 	cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
218 	hw_res->num_closid = edx.split.cos_max + 1;
219 	max_delay = eax.split.max_delay + 1;
220 	r->default_ctrl = MAX_MBA_BW;
221 	r->membw.arch_needs_linear = true;
222 	if (ecx & MBA_IS_LINEAR) {
223 		r->membw.delay_linear = true;
224 		r->membw.min_bw = MAX_MBA_BW - max_delay;
225 		r->membw.bw_gran = MAX_MBA_BW - max_delay;
226 	} else {
227 		if (!rdt_get_mb_table(r))
228 			return false;
229 		r->membw.arch_needs_linear = false;
230 	}
231 	r->data_width = 3;
232 
233 	if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
234 		r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
235 	else
236 		r->membw.throttle_mode = THREAD_THROTTLE_MAX;
237 	thread_throttle_mode_init();
238 
239 	r->alloc_capable = true;
240 
241 	return true;
242 }
243 
244 static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
245 {
246 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
247 	u32 eax, ebx, ecx, edx, subleaf;
248 
249 	/*
250 	 * Query CPUID_Fn80000020_EDX_x01 for MBA and
251 	 * CPUID_Fn80000020_EDX_x02 for SMBA
252 	 */
253 	subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 :  1;
254 
255 	cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
256 	hw_res->num_closid = edx + 1;
257 	r->default_ctrl = 1 << eax;
258 
259 	/* AMD does not use delay */
260 	r->membw.delay_linear = false;
261 	r->membw.arch_needs_linear = false;
262 
263 	/*
264 	 * AMD does not use memory delay throttle model to control
265 	 * the allocation like Intel does.
266 	 */
267 	r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
268 	r->membw.min_bw = 0;
269 	r->membw.bw_gran = 1;
270 	/* Max value is 2048, Data width should be 4 in decimal */
271 	r->data_width = 4;
272 
273 	r->alloc_capable = true;
274 
275 	return true;
276 }
277 
278 static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
279 {
280 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
281 	union cpuid_0x10_1_eax eax;
282 	union cpuid_0x10_x_ecx ecx;
283 	union cpuid_0x10_x_edx edx;
284 	u32 ebx;
285 
286 	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
287 	hw_res->num_closid = edx.split.cos_max + 1;
288 	r->cache.cbm_len = eax.split.cbm_len + 1;
289 	r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
290 	r->cache.shareable_bits = ebx & r->default_ctrl;
291 	r->data_width = (r->cache.cbm_len + 3) / 4;
292 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
293 		r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
294 	r->alloc_capable = true;
295 }
296 
297 static void rdt_get_cdp_config(int level)
298 {
299 	/*
300 	 * By default, CDP is disabled. CDP can be enabled by mount parameter
301 	 * "cdp" during resctrl file system mount time.
302 	 */
303 	rdt_resources_all[level].cdp_enabled = false;
304 	rdt_resources_all[level].r_resctrl.cdp_capable = true;
305 }
306 
307 static void rdt_get_cdp_l3_config(void)
308 {
309 	rdt_get_cdp_config(RDT_RESOURCE_L3);
310 }
311 
312 static void rdt_get_cdp_l2_config(void)
313 {
314 	rdt_get_cdp_config(RDT_RESOURCE_L2);
315 }
316 
317 static void mba_wrmsr_amd(struct msr_param *m)
318 {
319 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
320 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
321 	unsigned int i;
322 
323 	for (i = m->low; i < m->high; i++)
324 		wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
325 }
326 
327 /*
328  * Map the memory b/w percentage value to delay values
329  * that can be written to QOS_MSRs.
330  * There are currently no SKUs which support non linear delay values.
331  */
332 static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
333 {
334 	if (r->membw.delay_linear)
335 		return MAX_MBA_BW - bw;
336 
337 	pr_warn_once("Non Linear delay-bw map not supported but queried\n");
338 	return r->default_ctrl;
339 }
340 
341 static void mba_wrmsr_intel(struct msr_param *m)
342 {
343 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
344 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
345 	unsigned int i;
346 
347 	/*  Write the delay values for mba. */
348 	for (i = m->low; i < m->high; i++)
349 		wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
350 }
351 
352 static void cat_wrmsr(struct msr_param *m)
353 {
354 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
355 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
356 	unsigned int i;
357 
358 	for (i = m->low; i < m->high; i++)
359 		wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
360 }
361 
362 struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
363 {
364 	struct rdt_ctrl_domain *d;
365 
366 	lockdep_assert_cpus_held();
367 
368 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
369 		/* Find the domain that contains this CPU */
370 		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
371 			return d;
372 	}
373 
374 	return NULL;
375 }
376 
377 struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
378 {
379 	struct rdt_mon_domain *d;
380 
381 	lockdep_assert_cpus_held();
382 
383 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
384 		/* Find the domain that contains this CPU */
385 		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
386 			return d;
387 	}
388 
389 	return NULL;
390 }
391 
392 u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
393 {
394 	return resctrl_to_arch_res(r)->num_closid;
395 }
396 
397 void rdt_ctrl_update(void *arg)
398 {
399 	struct rdt_hw_resource *hw_res;
400 	struct msr_param *m = arg;
401 
402 	hw_res = resctrl_to_arch_res(m->res);
403 	hw_res->msr_update(m);
404 }
405 
406 /*
407  * rdt_find_domain - Search for a domain id in a resource domain list.
408  *
409  * Search the domain list to find the domain id. If the domain id is
410  * found, return the domain. NULL otherwise.  If the domain id is not
411  * found (and NULL returned) then the first domain with id bigger than
412  * the input id can be returned to the caller via @pos.
413  */
414 struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
415 				       struct list_head **pos)
416 {
417 	struct rdt_domain_hdr *d;
418 	struct list_head *l;
419 
420 	list_for_each(l, h) {
421 		d = list_entry(l, struct rdt_domain_hdr, list);
422 		/* When id is found, return its domain. */
423 		if (id == d->id)
424 			return d;
425 		/* Stop searching when finding id's position in sorted list. */
426 		if (id < d->id)
427 			break;
428 	}
429 
430 	if (pos)
431 		*pos = l;
432 
433 	return NULL;
434 }
435 
436 static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
437 {
438 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
439 	int i;
440 
441 	/*
442 	 * Initialize the Control MSRs to having no control.
443 	 * For Cache Allocation: Set all bits in cbm
444 	 * For Memory Allocation: Set b/w requested to 100%
445 	 */
446 	for (i = 0; i < hw_res->num_closid; i++, dc++)
447 		*dc = r->default_ctrl;
448 }
449 
450 static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
451 {
452 	kfree(hw_dom->ctrl_val);
453 	kfree(hw_dom);
454 }
455 
456 static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
457 {
458 	kfree(hw_dom->arch_mbm_total);
459 	kfree(hw_dom->arch_mbm_local);
460 	kfree(hw_dom);
461 }
462 
463 static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
464 {
465 	struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
466 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
467 	struct msr_param m;
468 	u32 *dc;
469 
470 	dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
471 			   GFP_KERNEL);
472 	if (!dc)
473 		return -ENOMEM;
474 
475 	hw_dom->ctrl_val = dc;
476 	setup_default_ctrlval(r, dc);
477 
478 	m.res = r;
479 	m.dom = d;
480 	m.low = 0;
481 	m.high = hw_res->num_closid;
482 	hw_res->msr_update(&m);
483 	return 0;
484 }
485 
486 /**
487  * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
488  * @num_rmid:	The size of the MBM counter array
489  * @hw_dom:	The domain that owns the allocated arrays
490  */
491 static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
492 {
493 	size_t tsize;
494 
495 	if (is_mbm_total_enabled()) {
496 		tsize = sizeof(*hw_dom->arch_mbm_total);
497 		hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
498 		if (!hw_dom->arch_mbm_total)
499 			return -ENOMEM;
500 	}
501 	if (is_mbm_local_enabled()) {
502 		tsize = sizeof(*hw_dom->arch_mbm_local);
503 		hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
504 		if (!hw_dom->arch_mbm_local) {
505 			kfree(hw_dom->arch_mbm_total);
506 			hw_dom->arch_mbm_total = NULL;
507 			return -ENOMEM;
508 		}
509 	}
510 
511 	return 0;
512 }
513 
514 static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
515 {
516 	switch (scope) {
517 	case RESCTRL_L2_CACHE:
518 	case RESCTRL_L3_CACHE:
519 		return get_cpu_cacheinfo_id(cpu, scope);
520 	case RESCTRL_L3_NODE:
521 		return cpu_to_node(cpu);
522 	default:
523 		break;
524 	}
525 
526 	return -EINVAL;
527 }
528 
529 static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
530 {
531 	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
532 	struct rdt_hw_ctrl_domain *hw_dom;
533 	struct list_head *add_pos = NULL;
534 	struct rdt_domain_hdr *hdr;
535 	struct rdt_ctrl_domain *d;
536 	int err;
537 
538 	lockdep_assert_held(&domain_list_lock);
539 
540 	if (id < 0) {
541 		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
542 			     cpu, r->ctrl_scope, r->name);
543 		return;
544 	}
545 
546 	hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
547 	if (hdr) {
548 		if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
549 			return;
550 		d = container_of(hdr, struct rdt_ctrl_domain, hdr);
551 
552 		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
553 		if (r->cache.arch_has_per_cpu_cfg)
554 			rdt_domain_reconfigure_cdp(r);
555 		return;
556 	}
557 
558 	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
559 	if (!hw_dom)
560 		return;
561 
562 	d = &hw_dom->d_resctrl;
563 	d->hdr.id = id;
564 	d->hdr.type = RESCTRL_CTRL_DOMAIN;
565 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
566 
567 	rdt_domain_reconfigure_cdp(r);
568 
569 	if (domain_setup_ctrlval(r, d)) {
570 		ctrl_domain_free(hw_dom);
571 		return;
572 	}
573 
574 	list_add_tail_rcu(&d->hdr.list, add_pos);
575 
576 	err = resctrl_online_ctrl_domain(r, d);
577 	if (err) {
578 		list_del_rcu(&d->hdr.list);
579 		synchronize_rcu();
580 		ctrl_domain_free(hw_dom);
581 	}
582 }
583 
584 static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
585 {
586 	int id = get_domain_id_from_scope(cpu, r->mon_scope);
587 	struct list_head *add_pos = NULL;
588 	struct rdt_hw_mon_domain *hw_dom;
589 	struct rdt_domain_hdr *hdr;
590 	struct rdt_mon_domain *d;
591 	int err;
592 
593 	lockdep_assert_held(&domain_list_lock);
594 
595 	if (id < 0) {
596 		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
597 			     cpu, r->mon_scope, r->name);
598 		return;
599 	}
600 
601 	hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
602 	if (hdr) {
603 		if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
604 			return;
605 		d = container_of(hdr, struct rdt_mon_domain, hdr);
606 
607 		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
608 		return;
609 	}
610 
611 	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
612 	if (!hw_dom)
613 		return;
614 
615 	d = &hw_dom->d_resctrl;
616 	d->hdr.id = id;
617 	d->hdr.type = RESCTRL_MON_DOMAIN;
618 	d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
619 	if (!d->ci) {
620 		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
621 		mon_domain_free(hw_dom);
622 		return;
623 	}
624 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
625 
626 	arch_mon_domain_online(r, d);
627 
628 	if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
629 		mon_domain_free(hw_dom);
630 		return;
631 	}
632 
633 	list_add_tail_rcu(&d->hdr.list, add_pos);
634 
635 	err = resctrl_online_mon_domain(r, d);
636 	if (err) {
637 		list_del_rcu(&d->hdr.list);
638 		synchronize_rcu();
639 		mon_domain_free(hw_dom);
640 	}
641 }
642 
643 static void domain_add_cpu(int cpu, struct rdt_resource *r)
644 {
645 	if (r->alloc_capable)
646 		domain_add_cpu_ctrl(cpu, r);
647 	if (r->mon_capable)
648 		domain_add_cpu_mon(cpu, r);
649 }
650 
651 static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
652 {
653 	int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
654 	struct rdt_hw_ctrl_domain *hw_dom;
655 	struct rdt_domain_hdr *hdr;
656 	struct rdt_ctrl_domain *d;
657 
658 	lockdep_assert_held(&domain_list_lock);
659 
660 	if (id < 0) {
661 		pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
662 			     cpu, r->ctrl_scope, r->name);
663 		return;
664 	}
665 
666 	hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
667 	if (!hdr) {
668 		pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
669 			id, cpu, r->name);
670 		return;
671 	}
672 
673 	if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
674 		return;
675 
676 	d = container_of(hdr, struct rdt_ctrl_domain, hdr);
677 	hw_dom = resctrl_to_arch_ctrl_dom(d);
678 
679 	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
680 	if (cpumask_empty(&d->hdr.cpu_mask)) {
681 		resctrl_offline_ctrl_domain(r, d);
682 		list_del_rcu(&d->hdr.list);
683 		synchronize_rcu();
684 
685 		/*
686 		 * rdt_ctrl_domain "d" is going to be freed below, so clear
687 		 * its pointer from pseudo_lock_region struct.
688 		 */
689 		if (d->plr)
690 			d->plr->d = NULL;
691 		ctrl_domain_free(hw_dom);
692 
693 		return;
694 	}
695 }
696 
697 static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
698 {
699 	int id = get_domain_id_from_scope(cpu, r->mon_scope);
700 	struct rdt_hw_mon_domain *hw_dom;
701 	struct rdt_domain_hdr *hdr;
702 	struct rdt_mon_domain *d;
703 
704 	lockdep_assert_held(&domain_list_lock);
705 
706 	if (id < 0) {
707 		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
708 			     cpu, r->mon_scope, r->name);
709 		return;
710 	}
711 
712 	hdr = rdt_find_domain(&r->mon_domains, id, NULL);
713 	if (!hdr) {
714 		pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
715 			id, cpu, r->name);
716 		return;
717 	}
718 
719 	if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
720 		return;
721 
722 	d = container_of(hdr, struct rdt_mon_domain, hdr);
723 	hw_dom = resctrl_to_arch_mon_dom(d);
724 
725 	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
726 	if (cpumask_empty(&d->hdr.cpu_mask)) {
727 		resctrl_offline_mon_domain(r, d);
728 		list_del_rcu(&d->hdr.list);
729 		synchronize_rcu();
730 		mon_domain_free(hw_dom);
731 
732 		return;
733 	}
734 }
735 
736 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
737 {
738 	if (r->alloc_capable)
739 		domain_remove_cpu_ctrl(cpu, r);
740 	if (r->mon_capable)
741 		domain_remove_cpu_mon(cpu, r);
742 }
743 
744 static void clear_closid_rmid(int cpu)
745 {
746 	struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
747 
748 	state->default_closid = RESCTRL_RESERVED_CLOSID;
749 	state->default_rmid = RESCTRL_RESERVED_RMID;
750 	state->cur_closid = RESCTRL_RESERVED_CLOSID;
751 	state->cur_rmid = RESCTRL_RESERVED_RMID;
752 	wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
753 	      RESCTRL_RESERVED_CLOSID);
754 }
755 
756 static int resctrl_arch_online_cpu(unsigned int cpu)
757 {
758 	struct rdt_resource *r;
759 
760 	mutex_lock(&domain_list_lock);
761 	for_each_capable_rdt_resource(r)
762 		domain_add_cpu(cpu, r);
763 	mutex_unlock(&domain_list_lock);
764 
765 	clear_closid_rmid(cpu);
766 	resctrl_online_cpu(cpu);
767 
768 	return 0;
769 }
770 
771 static int resctrl_arch_offline_cpu(unsigned int cpu)
772 {
773 	struct rdt_resource *r;
774 
775 	resctrl_offline_cpu(cpu);
776 
777 	mutex_lock(&domain_list_lock);
778 	for_each_capable_rdt_resource(r)
779 		domain_remove_cpu(cpu, r);
780 	mutex_unlock(&domain_list_lock);
781 
782 	clear_closid_rmid(cpu);
783 
784 	return 0;
785 }
786 
787 /*
788  * Choose a width for the resource name and resource data based on the
789  * resource that has widest name and cbm.
790  */
791 static __init void rdt_init_padding(void)
792 {
793 	struct rdt_resource *r;
794 
795 	for_each_alloc_capable_rdt_resource(r) {
796 		if (r->data_width > max_data_width)
797 			max_data_width = r->data_width;
798 	}
799 }
800 
801 enum {
802 	RDT_FLAG_CMT,
803 	RDT_FLAG_MBM_TOTAL,
804 	RDT_FLAG_MBM_LOCAL,
805 	RDT_FLAG_L3_CAT,
806 	RDT_FLAG_L3_CDP,
807 	RDT_FLAG_L2_CAT,
808 	RDT_FLAG_L2_CDP,
809 	RDT_FLAG_MBA,
810 	RDT_FLAG_SMBA,
811 	RDT_FLAG_BMEC,
812 };
813 
814 #define RDT_OPT(idx, n, f)	\
815 [idx] = {			\
816 	.name = n,		\
817 	.flag = f		\
818 }
819 
820 struct rdt_options {
821 	char	*name;
822 	int	flag;
823 	bool	force_off, force_on;
824 };
825 
826 static struct rdt_options rdt_options[]  __initdata = {
827 	RDT_OPT(RDT_FLAG_CMT,	    "cmt",	X86_FEATURE_CQM_OCCUP_LLC),
828 	RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
829 	RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
830 	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),
831 	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),
832 	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2),
833 	RDT_OPT(RDT_FLAG_L2_CDP,    "l2cdp",	X86_FEATURE_CDP_L2),
834 	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),
835 	RDT_OPT(RDT_FLAG_SMBA,	    "smba",	X86_FEATURE_SMBA),
836 	RDT_OPT(RDT_FLAG_BMEC,	    "bmec",	X86_FEATURE_BMEC),
837 };
838 #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
839 
840 static int __init set_rdt_options(char *str)
841 {
842 	struct rdt_options *o;
843 	bool force_off;
844 	char *tok;
845 
846 	if (*str == '=')
847 		str++;
848 	while ((tok = strsep(&str, ",")) != NULL) {
849 		force_off = *tok == '!';
850 		if (force_off)
851 			tok++;
852 		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
853 			if (strcmp(tok, o->name) == 0) {
854 				if (force_off)
855 					o->force_off = true;
856 				else
857 					o->force_on = true;
858 				break;
859 			}
860 		}
861 	}
862 	return 1;
863 }
864 __setup("rdt", set_rdt_options);
865 
866 bool __init rdt_cpu_has(int flag)
867 {
868 	bool ret = boot_cpu_has(flag);
869 	struct rdt_options *o;
870 
871 	if (!ret)
872 		return ret;
873 
874 	for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
875 		if (flag == o->flag) {
876 			if (o->force_off)
877 				ret = false;
878 			if (o->force_on)
879 				ret = true;
880 			break;
881 		}
882 	}
883 	return ret;
884 }
885 
886 static __init bool get_mem_config(void)
887 {
888 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
889 
890 	if (!rdt_cpu_has(X86_FEATURE_MBA))
891 		return false;
892 
893 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
894 		return __get_mem_config_intel(&hw_res->r_resctrl);
895 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
896 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
897 
898 	return false;
899 }
900 
901 static __init bool get_slow_mem_config(void)
902 {
903 	struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
904 
905 	if (!rdt_cpu_has(X86_FEATURE_SMBA))
906 		return false;
907 
908 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
909 		return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
910 
911 	return false;
912 }
913 
914 static __init bool get_rdt_alloc_resources(void)
915 {
916 	struct rdt_resource *r;
917 	bool ret = false;
918 
919 	if (rdt_alloc_capable)
920 		return true;
921 
922 	if (!boot_cpu_has(X86_FEATURE_RDT_A))
923 		return false;
924 
925 	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
926 		r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
927 		rdt_get_cache_alloc_cfg(1, r);
928 		if (rdt_cpu_has(X86_FEATURE_CDP_L3))
929 			rdt_get_cdp_l3_config();
930 		ret = true;
931 	}
932 	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
933 		/* CPUID 0x10.2 fields are same format at 0x10.1 */
934 		r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
935 		rdt_get_cache_alloc_cfg(2, r);
936 		if (rdt_cpu_has(X86_FEATURE_CDP_L2))
937 			rdt_get_cdp_l2_config();
938 		ret = true;
939 	}
940 
941 	if (get_mem_config())
942 		ret = true;
943 
944 	if (get_slow_mem_config())
945 		ret = true;
946 
947 	return ret;
948 }
949 
950 static __init bool get_rdt_mon_resources(void)
951 {
952 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
953 
954 	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
955 		rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
956 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
957 		rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
958 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
959 		rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
960 
961 	if (!rdt_mon_features)
962 		return false;
963 
964 	return !rdt_get_mon_l3_config(r);
965 }
966 
967 static __init void __check_quirks_intel(void)
968 {
969 	switch (boot_cpu_data.x86_vfm) {
970 	case INTEL_HASWELL_X:
971 		if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
972 			cache_alloc_hsw_probe();
973 		break;
974 	case INTEL_SKYLAKE_X:
975 		if (boot_cpu_data.x86_stepping <= 4)
976 			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
977 		else
978 			set_rdt_options("!l3cat");
979 		fallthrough;
980 	case INTEL_BROADWELL_X:
981 		intel_rdt_mbm_apply_quirk();
982 		break;
983 	}
984 }
985 
986 static __init void check_quirks(void)
987 {
988 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
989 		__check_quirks_intel();
990 }
991 
992 static __init bool get_rdt_resources(void)
993 {
994 	rdt_alloc_capable = get_rdt_alloc_resources();
995 	rdt_mon_capable = get_rdt_mon_resources();
996 
997 	return (rdt_mon_capable || rdt_alloc_capable);
998 }
999 
1000 static __init void rdt_init_res_defs_intel(void)
1001 {
1002 	struct rdt_hw_resource *hw_res;
1003 	struct rdt_resource *r;
1004 
1005 	for_each_rdt_resource(r) {
1006 		hw_res = resctrl_to_arch_res(r);
1007 
1008 		if (r->rid == RDT_RESOURCE_L3 ||
1009 		    r->rid == RDT_RESOURCE_L2) {
1010 			r->cache.arch_has_per_cpu_cfg = false;
1011 			r->cache.min_cbm_bits = 1;
1012 		} else if (r->rid == RDT_RESOURCE_MBA) {
1013 			hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
1014 			hw_res->msr_update = mba_wrmsr_intel;
1015 		}
1016 	}
1017 }
1018 
1019 static __init void rdt_init_res_defs_amd(void)
1020 {
1021 	struct rdt_hw_resource *hw_res;
1022 	struct rdt_resource *r;
1023 
1024 	for_each_rdt_resource(r) {
1025 		hw_res = resctrl_to_arch_res(r);
1026 
1027 		if (r->rid == RDT_RESOURCE_L3 ||
1028 		    r->rid == RDT_RESOURCE_L2) {
1029 			r->cache.arch_has_sparse_bitmasks = true;
1030 			r->cache.arch_has_per_cpu_cfg = true;
1031 			r->cache.min_cbm_bits = 0;
1032 		} else if (r->rid == RDT_RESOURCE_MBA) {
1033 			hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
1034 			hw_res->msr_update = mba_wrmsr_amd;
1035 		} else if (r->rid == RDT_RESOURCE_SMBA) {
1036 			hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
1037 			hw_res->msr_update = mba_wrmsr_amd;
1038 		}
1039 	}
1040 }
1041 
1042 static __init void rdt_init_res_defs(void)
1043 {
1044 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1045 		rdt_init_res_defs_intel();
1046 	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1047 		rdt_init_res_defs_amd();
1048 }
1049 
1050 static enum cpuhp_state rdt_online;
1051 
1052 /* Runs once on the BSP during boot. */
1053 void resctrl_cpu_detect(struct cpuinfo_x86 *c)
1054 {
1055 	if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
1056 		c->x86_cache_max_rmid  = -1;
1057 		c->x86_cache_occ_scale = -1;
1058 		c->x86_cache_mbm_width_offset = -1;
1059 		return;
1060 	}
1061 
1062 	/* will be overridden if occupancy monitoring exists */
1063 	c->x86_cache_max_rmid = cpuid_ebx(0xf);
1064 
1065 	if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
1066 	    cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
1067 	    cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
1068 		u32 eax, ebx, ecx, edx;
1069 
1070 		/* QoS sub-leaf, EAX=0Fh, ECX=1 */
1071 		cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
1072 
1073 		c->x86_cache_max_rmid  = ecx;
1074 		c->x86_cache_occ_scale = ebx;
1075 		c->x86_cache_mbm_width_offset = eax & 0xff;
1076 
1077 		if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
1078 			c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
1079 	}
1080 }
1081 
1082 static int __init resctrl_late_init(void)
1083 {
1084 	struct rdt_resource *r;
1085 	int state, ret;
1086 
1087 	/*
1088 	 * Initialize functions(or definitions) that are different
1089 	 * between vendors here.
1090 	 */
1091 	rdt_init_res_defs();
1092 
1093 	check_quirks();
1094 
1095 	if (!get_rdt_resources())
1096 		return -ENODEV;
1097 
1098 	rdt_init_padding();
1099 
1100 	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
1101 				  "x86/resctrl/cat:online:",
1102 				  resctrl_arch_online_cpu,
1103 				  resctrl_arch_offline_cpu);
1104 	if (state < 0)
1105 		return state;
1106 
1107 	ret = rdtgroup_init();
1108 	if (ret) {
1109 		cpuhp_remove_state(state);
1110 		return ret;
1111 	}
1112 	rdt_online = state;
1113 
1114 	for_each_alloc_capable_rdt_resource(r)
1115 		pr_info("%s allocation detected\n", r->name);
1116 
1117 	for_each_mon_capable_rdt_resource(r)
1118 		pr_info("%s monitoring detected\n", r->name);
1119 
1120 	return 0;
1121 }
1122 
1123 late_initcall(resctrl_late_init);
1124 
1125 static void __exit resctrl_exit(void)
1126 {
1127 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
1128 
1129 	cpuhp_remove_state(rdt_online);
1130 
1131 	rdtgroup_exit();
1132 
1133 	if (r->mon_capable)
1134 		rdt_put_mon_l3_config();
1135 }
1136 
1137 __exitcall(resctrl_exit);
1138