// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but reuses a lot of code and
 * data structures originally written by Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction tables are indexed by a core count
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so the corrected
 *    value can be calculated quickly by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
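/*
 * A worked example of the normalization (illustrative numbers, using a
 * factor that appears in the table below): CF(1.066667) == 1118481, so
 * a raw count of 1000000 chunks is corrected to
 * (1000000 * 1118481) >> 20 == 1066666 chunks.
 */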
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
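
/*
 * Worked example with assumed numbers: if snc_nodes_per_l3_cache == 2
 * and r->num_rmid == 128 (each node's share of 256 physical RMIDs),
 * then logical RMID 5 used on a CPU in the second SNC node of its L3
 * cache is read via physical RMID 5 + 1 * 128 == 133.
 */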

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for a supported resource type and
	 * IA32_QM_EVTSEL.RMID (bits 41:32) is configured with a valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}
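
/*
 * For example, reading QOS_L3_MBM_TOTAL_EVENT_ID (event code 0x02) for
 * physical RMID 5 writes 0x0000000500000002 to IA32_QM_EVTSEL (RMID in
 * bits 41:32, event code in bits 7:0) before reading IA32_QM_CTR.
 */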

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	default:
		/* Never expect to get here */
		WARN_ON_ONCE(1);
		return NULL;
	}
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}
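
/*
 * Example of the wraparound handling above: with a (hypothetical)
 * 24-bit counter width, prev_msr == 0xfffffe and cur_msr == 0x000002
 * yield 4 chunks; shifting both values up to bit 63 discards the
 * unimplemented high-order bits so the subtraction wraps at 2^24.
 */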

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}
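
/*
 * Note on the scaling above (illustrative numbers only): hw_res->mon_scale
 * holds the CPUID-reported upscale factor, so if mon_scale were 64 bytes
 * per chunk, a corrected count of 1000 chunks would be returned from
 * resctrl_arch_rmid_read() as 64000 bytes.
 */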

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since that doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}
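
/*
 * Example of the inference above (assumed topology): on a two-node-per-L3
 * SNC system with 56 CPUs per NUMA node, 112 CPUs share each L3 cache,
 * so cpus_per_l3 / cpus_per_node == 112 / 56 == 2.
 */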

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
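	/*
	 * Worked example with the numbers above: a 35MB LLC gives
	 * resctrl_rmid_realloc_limit = 35 * 1024 * 1024 = 36700160 bytes,
	 * so threshold = 36700160 / 56 = 655360 bytes, each RMID's fair
	 * 1/56 (~1.8%) share of the cache.
	 */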
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}
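
	/*
	 * A hypothetical example of the mask above: if CPUID reported
	 * ecx == 0x7f, every bandwidth source bit described in
	 * Documentation/filesystems/resctrl.rst could be set when writing
	 * the mbm_total_bytes_config / mbm_local_bytes_config files.
	 */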

	r->mon_capable = true;

	return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}
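
/*
 * Worked example (hypothetical part): x86_cache_max_rmid == 143 gives
 * cf_index == 144 / 8 - 1 == 17, selecting {127, CF(1.185255)} from
 * mbm_cf_table, so counts for RMIDs above 127 are scaled up by ~18.5%.
 */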