// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but reuses a lot of code and
 * data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt) "resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified to simplify the code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so no correction is
 *    done for that case.
 * 2. The MBM total and local correction table is indexed by core count,
 *    which is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from
 *    0 to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

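/*
 * Worked example of the 2^20-normalized correction (illustrative only,
 * using a hypothetical factor that is not in mbm_cf_table): a factor of
 * 1.5 would be stored as CF(1.5) == 1572864, so get_corrected_mbm_count()
 * computes
 *
 *     (val * 1572864) >> 20 == val + val / 2
 *
 * i.e. val scaled by 1.5 using only an integer multiply and shift. A
 * factor of exactly 1.0 is stored as 1048576 and leaves val unchanged.
 */
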
/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}

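/*
 * Illustrative example (hypothetical numbers, not from any specific
 * platform): with snc_nodes_per_l3_cache == 2 and r->num_rmid == 128,
 * logical RMID 5 read from a CPU in an even-numbered NUMA node maps to
 * physical RMID 5, while the same logical RMID read from a CPU in an
 * odd-numbered NUMA node maps to physical RMID 5 + 1 * 128 == 133.
 */
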
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for a supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	default:
		/* Never expect to get here */
		WARN_ON_ONCE(1);
		return NULL;
	}
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

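/*
 * mbm_overflow_count() returns (cur_msr - prev_msr) modulo 2^width, so a
 * counter that wrapped between two reads is still accounted correctly.
 * Illustrative example with a hypothetical 24-bit counter (width == 24):
 * prev_msr == 0xfffff0 and cur_msr == 0x000010 yields
 * (0x000010 - 0xfffff0) mod 2^24 == 0x20, i.e. 32 chunks elapsed.
 */
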
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* Sanity check: only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}

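/*
 * Illustrative example of the quirk (hypothetical CPUID values): with
 * boot_cpu_data.x86_cache_max_rmid == 175, cf_index == (175 + 1) / 8 - 1 == 21,
 * selecting mbm_cf_table[21] == {159, CF(1.454334)}. After that, any MBM
 * count read for an RMID greater than 159 is scaled by ~1.454334 in
 * get_corrected_mbm_count().
 */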