// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but reuses a lot of code and
 * data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction tables are indexed by a core-count
 *    derived value equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging
 *    from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so the
 *    corrected value can be calculated quickly by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}
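
/*
 * Illustrative example of the fixed-point correction (the raw count below
 * is hypothetical): the table entry CF(1.185115) evaluates to
 * (unsigned long)(1048576 * 1.185115 + 0.5) == 1242683, so for an RMID
 * above the threshold a raw count of 1000000 chunks is corrected to:
 *
 *	(1000000 * 1242683) >> 20 == 1185114
 *
 * i.e. roughly 1.185 times the raw count, matching the correction factor
 * to 2^-20 precision.
 */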

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	struct arch_mbm_state *state;

	if (!resctrl_is_mbm_event(eventid))
		return NULL;

	state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];

	return state ? &state[rmid] : NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}
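
/*
 * Illustrative example of the translation done by
 * logical_rmid_to_physical_rmid() (hypothetical numbers): with
 * snc_nodes_per_l3_cache == 2 and r->mon.num_rmid == 128, a CPU in the
 * second SNC node of its L3 cache (cpu_to_node(cpu) % 2 == 1) reads
 * logical RMID 5 as physical RMID 5 + 1 * 128 == 133 via IA32_QM_EVTSEL.
 */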

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	enum resctrl_event_id eventid;
	int idx;

	for_each_mbm_event_id(eventid) {
		if (!resctrl_is_mon_event_enabled(eventid))
			continue;
		idx = MBM_STATE_IDX(eventid);
		memset(hw_dom->arch_mbm_states[idx], 0,
		       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
	}
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct arch_mbm_state *am;
	u64 chunks;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	return chunks * hw_res->mon_scale;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	u64 msr_val;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	*val = get_corrected_val(r, d, rmid, eventid, msr_val);

	return 0;
}
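
/*
 * Illustrative example of the wraparound handling in mbm_overflow_count()
 * (hypothetical values): with a 24-bit wide counter (width == 24, so
 * shift == 40), prev_msr == 0xfffff0 and cur_msr == 0x000010 yield
 *
 *	chunks = ((0x000010 << 40) - (0xfffff0 << 40)) >> 40 == 0x20
 *
 * i.e. 32 new chunks, correctly accounting for the counter wrapping past
 * its 24-bit limit.
 */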

static int __cntr_id_read(u32 cntr_id, u64 *val)
{
	u64 msr_val;

	/*
	 * QM_EVTSEL Register definition:
	 * =======================================================
	 * Bits    Mnemonic        Description
	 * =======================================================
	 * 63:44   --              Reserved
	 * 43:32   RMID            RMID or counter ID in ABMC mode
	 *                         when reading an MBM event
	 * 31      ExtendedEvtID   Extended Event Identifier
	 * 30:8    --              Reserved
	 * 7:0     EvtID           Event Identifier
	 * =======================================================
	 * The contents of a specific counter can be read by setting the
	 * following fields in QM_EVTSEL: ExtendedEvtID (= 1),
	 * EvtID = L3CacheABMC (= 1), and RMID = the desired counter ID.
	 * Reading QM_CTR then returns the contents of the specified
	 * counter. The RMID_VAL_ERROR bit is set if the counter
	 * configuration is invalid, or if an invalid counter ID is set
	 * in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit is set
	 * if the counter data is unavailable.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
	rdmsrl(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid, int cntr_id,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		/* Record any initial, non-zero count value. */
		__cntr_id_read(cntr_id, &am->prev_msr);
	}
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, int cntr_id,
			   enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;
	int ret;

	ret = __cntr_id_read(cntr_id, &msr_val);
	if (ret)
		return ret;

	*val = get_corrected_val(r, d, rmid, eventid, msr_val);

	return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}
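
/*
 * For illustration only (hypothetical topology): if 96 CPUs share CPU0's
 * L3 cache while CPU0's NUMA node contains 48 of them, the ratio computed
 * by snc_get_config() is 2, so the L3 monitoring scope is switched to
 * RESCTRL_L3_NODE and one monitoring domain is created per SNC node.
 */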

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;
	u32 eax, ebx, ecx, edx;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	if (rdt_cpu_has(X86_FEATURE_ABMC)) {
		r->mon.mbm_cntr_assignable = true;
		cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
		r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
		hw_res->mbm_cntr_assign_enabled = true;
	}

	r->mon_capable = true;

	return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}
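
/*
 * Illustrative example (hypothetical CPUID values): a part reporting
 * x86_cache_max_rmid == 175 gives cf_index == (175 + 1) / 8 - 1 == 21,
 * selecting mbm_cf_table[21], i.e. {159, CF(1.454334)}, so counts for
 * RMIDs greater than 159 are scaled by roughly 1.454334 in
 * get_corrected_mbm_count().
 */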

static void resctrl_abmc_set_one_amd(void *arg)
{
	bool *enable = arg;

	if (*enable)
		msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
	else
		msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
}

/*
 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
 * associated with all monitor domains.
 */
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
	struct rdt_mon_domain *d;

	lockdep_assert_cpus_held();

	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
				 &enable, 1);
		resctrl_arch_reset_rmid_all(r, d);
	}
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

	if (r->mon.mbm_cntr_assignable &&
	    hw_res->mbm_cntr_assign_enabled != enable) {
		_resctrl_abmc_enable(r, enable);
		hw_res->mbm_cntr_assign_enabled = enable;
	}

	return 0;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
	return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
}

static void resctrl_abmc_config_one_amd(void *info)
{
	union l3_qos_abmc_cfg *abmc_cfg = info;

	wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
}

/*
 * Send an IPI to the domain to assign the counter to an RMID, event pair.
 */
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
			      u32 cntr_id, bool assign)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	union l3_qos_abmc_cfg abmc_cfg = { 0 };
	struct arch_mbm_state *am;

	abmc_cfg.split.cfg_en = 1;
	abmc_cfg.split.cntr_en = assign ? 1 : 0;
	abmc_cfg.split.cntr_id = cntr_id;
	abmc_cfg.split.bw_src = rmid;
	if (assign)
		abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

	smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);

	/*
	 * The hardware counter is reset (because cfg_en == 1) so there is no
	 * need to record initial non-zero counts.
	 */
	am = get_arch_mbm_state(hw_dom, rmid, evtid);
	if (am)
		memset(am, 0, sizeof(*am));
}

void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

	resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
}