1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Resource Director Technology(RDT) 4 * - Monitoring code 5 * 6 * Copyright (C) 2017 Intel Corporation 7 * 8 * Author: 9 * Vikas Shivappa <vikas.shivappa@intel.com> 10 * 11 * This replaces the cqm.c based on perf but we reuse a lot of 12 * code and datastructures originally from Peter Zijlstra and Matt Fleming. 13 * 14 * More information about RDT be found in the Intel (R) x86 Architecture 15 * Software Developer Manual June 2016, volume 3, section 17.17. 16 */ 17 18 #define pr_fmt(fmt) "resctrl: " fmt 19 20 #include <linux/cpu.h> 21 #include <linux/resctrl.h> 22 23 #include <asm/cpu_device_id.h> 24 #include <asm/cpuid/api.h> 25 #include <asm/msr.h> 26 27 #include "internal.h" 28 29 /* 30 * Global boolean for rdt_monitor which is true if any 31 * resource monitoring is enabled. 32 */ 33 bool rdt_mon_capable; 34 35 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) 36 37 static int snc_nodes_per_l3_cache = 1; 38 39 /* 40 * The correction factor table is documented in Documentation/filesystems/resctrl.rst. 41 * If rmid > rmid threshold, MBM total and local values should be multiplied 42 * by the correction factor. 43 * 44 * The original table is modified for better code: 45 * 46 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction 47 * for the case. 48 * 2. MBM total and local correction table indexed by core counter which is 49 * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. 50 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster 51 * to calculate corrected value by shifting: 52 * corrected_value = (original_value * correction_factor) >> 20 53 */ 54 static const struct mbm_correction_factor_table { 55 u32 rmidthreshold; 56 u64 cf; 57 } mbm_cf_table[] __initconst = { 58 {7, CF(1.000000)}, 59 {15, CF(1.000000)}, 60 {15, CF(0.969650)}, 61 {31, CF(1.000000)}, 62 {31, CF(1.066667)}, 63 {31, CF(0.969650)}, 64 {47, CF(1.142857)}, 65 {63, CF(1.000000)}, 66 {63, CF(1.185115)}, 67 {63, CF(1.066553)}, 68 {79, CF(1.454545)}, 69 {95, CF(1.000000)}, 70 {95, CF(1.230769)}, 71 {95, CF(1.142857)}, 72 {95, CF(1.066667)}, 73 {127, CF(1.000000)}, 74 {127, CF(1.254863)}, 75 {127, CF(1.185255)}, 76 {151, CF(1.000000)}, 77 {127, CF(1.066667)}, 78 {167, CF(1.000000)}, 79 {159, CF(1.454334)}, 80 {183, CF(1.000000)}, 81 {127, CF(0.969744)}, 82 {191, CF(1.280246)}, 83 {191, CF(1.230921)}, 84 {215, CF(1.000000)}, 85 {191, CF(1.143118)}, 86 }; 87 88 static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; 89 90 static u64 mbm_cf __read_mostly; 91 92 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) 93 { 94 /* Correct MBM value. */ 95 if (rmid > mbm_cf_rmidthreshold) 96 val = (val * mbm_cf) >> 20; 97 98 return val; 99 } 100 101 /* 102 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by 103 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is 104 * needed. The physical RMID is the same as the logical RMID. 105 * 106 * On a platform with SNC mode enabled, Linux enables RMID sharing mode 107 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel 108 * Resource Director Technology Architecture Specification" for a full 109 * description of RMID sharing mode). 110 * 111 * In RMID sharing mode there are fewer "logical RMID" values available 112 * to accumulate data ("physical RMIDs" are divided evenly between SNC 113 * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for 114 * each SNC node. 115 * 116 * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 117 * 118 * Data is collected independently on each SNC node and can be retrieved 119 * using the "physical RMID" value computed by this function and loaded 120 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. 121 * 122 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 123 * cache. So a "physical RMID" may be read from any CPU that shares 124 * the L3 cache with the desired SNC node, not just from a CPU in 125 * the specific SNC node. 126 */ 127 static int logical_rmid_to_physical_rmid(int cpu, int lrmid) 128 { 129 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 130 131 if (snc_nodes_per_l3_cache == 1) 132 return lrmid; 133 134 return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; 135 } 136 137 static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) 138 { 139 u64 msr_val; 140 141 /* 142 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured 143 * with a valid event code for supported resource type and the bits 144 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, 145 * IA32_QM_CTR.data (bits 61:0) reports the monitored data. 146 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) 147 * are error bits. 148 */ 149 wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); 150 rdmsrq(MSR_IA32_QM_CTR, msr_val); 151 152 if (msr_val & RMID_VAL_ERROR) 153 return -EIO; 154 if (msr_val & RMID_VAL_UNAVAIL) 155 return -EINVAL; 156 157 *val = msr_val; 158 return 0; 159 } 160 161 static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, 162 u32 rmid, 163 enum resctrl_event_id eventid) 164 { 165 struct arch_mbm_state *state; 166 167 if (!resctrl_is_mbm_event(eventid)) 168 return NULL; 169 170 state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; 171 172 return state ? &state[rmid] : NULL; 173 } 174 175 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 176 u32 unused, u32 rmid, 177 enum resctrl_event_id eventid) 178 { 179 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 180 int cpu = cpumask_any(&d->hdr.cpu_mask); 181 struct arch_mbm_state *am; 182 u32 prmid; 183 184 am = get_arch_mbm_state(hw_dom, rmid, eventid); 185 if (am) { 186 memset(am, 0, sizeof(*am)); 187 188 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 189 /* Record any initial, non-zero count value. */ 190 __rmid_read_phys(prmid, eventid, &am->prev_msr); 191 } 192 } 193 194 /* 195 * Assumes that hardware counters are also reset and thus that there is 196 * no need to record initial non-zero counts. 197 */ 198 void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) 199 { 200 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 201 enum resctrl_event_id eventid; 202 int idx; 203 204 for_each_mbm_event_id(eventid) { 205 if (!resctrl_is_mon_event_enabled(eventid)) 206 continue; 207 idx = MBM_STATE_IDX(eventid); 208 memset(hw_dom->arch_mbm_states[idx], 0, 209 sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); 210 } 211 } 212 213 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) 214 { 215 u64 shift = 64 - width, chunks; 216 217 chunks = (cur_msr << shift) - (prev_msr << shift); 218 return chunks >> shift; 219 } 220 221 static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 222 u32 rmid, enum resctrl_event_id eventid, u64 msr_val) 223 { 224 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 225 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 226 struct arch_mbm_state *am; 227 u64 chunks; 228 229 am = get_arch_mbm_state(hw_dom, rmid, eventid); 230 if (am) { 231 am->chunks += mbm_overflow_count(am->prev_msr, msr_val, 232 hw_res->mbm_width); 233 chunks = get_corrected_mbm_count(rmid, am->chunks); 234 am->prev_msr = msr_val; 235 } else { 236 chunks = msr_val; 237 } 238 239 return chunks * hw_res->mon_scale; 240 } 241 242 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, 243 u32 unused, u32 rmid, enum resctrl_event_id eventid, 244 void *arch_priv, u64 *val, void *ignored) 245 { 246 struct rdt_hw_l3_mon_domain *hw_dom; 247 struct rdt_l3_mon_domain *d; 248 struct arch_mbm_state *am; 249 u64 msr_val; 250 u32 prmid; 251 int cpu; 252 int ret; 253 254 resctrl_arch_rmid_read_context_check(); 255 256 if (r->rid == RDT_RESOURCE_PERF_PKG) 257 return intel_aet_read_event(hdr->id, rmid, arch_priv, val); 258 259 if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) 260 return -EINVAL; 261 262 d = container_of(hdr, struct rdt_l3_mon_domain, hdr); 263 hw_dom = resctrl_to_arch_mon_dom(d); 264 cpu = cpumask_any(&hdr->cpu_mask); 265 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 266 ret = __rmid_read_phys(prmid, eventid, &msr_val); 267 268 if (!ret) { 269 *val = get_corrected_val(r, d, rmid, eventid, msr_val); 270 } else if (ret == -EINVAL) { 271 am = get_arch_mbm_state(hw_dom, rmid, eventid); 272 if (am) 273 am->prev_msr = 0; 274 } 275 276 return ret; 277 } 278 279 static int __cntr_id_read(u32 cntr_id, u64 *val) 280 { 281 u64 msr_val; 282 283 /* 284 * QM_EVTSEL Register definition: 285 * ======================================================= 286 * Bits Mnemonic Description 287 * ======================================================= 288 * 63:44 -- Reserved 289 * 43:32 RMID RMID or counter ID in ABMC mode 290 * when reading an MBM event 291 * 31 ExtendedEvtID Extended Event Identifier 292 * 30:8 -- Reserved 293 * 7:0 EvtID Event Identifier 294 * ======================================================= 295 * The contents of a specific counter can be read by setting the 296 * following fields in QM_EVTSEL.ExtendedEvtID(=1) and 297 * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID 298 * to the desired counter ID. Reading the QM_CTR then returns the 299 * contents of the specified counter. The RMID_VAL_ERROR bit is set 300 * if the counter configuration is invalid, or if an invalid counter 301 * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit 302 * is set if the counter data is unavailable. 303 */ 304 wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); 305 rdmsrl(MSR_IA32_QM_CTR, msr_val); 306 307 if (msr_val & RMID_VAL_ERROR) 308 return -EIO; 309 if (msr_val & RMID_VAL_UNAVAIL) 310 return -EINVAL; 311 312 *val = msr_val; 313 return 0; 314 } 315 316 void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 317 u32 unused, u32 rmid, int cntr_id, 318 enum resctrl_event_id eventid) 319 { 320 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 321 struct arch_mbm_state *am; 322 323 am = get_arch_mbm_state(hw_dom, rmid, eventid); 324 if (am) { 325 memset(am, 0, sizeof(*am)); 326 327 /* Record any initial, non-zero count value. */ 328 __cntr_id_read(cntr_id, &am->prev_msr); 329 } 330 } 331 332 int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 333 u32 unused, u32 rmid, int cntr_id, 334 enum resctrl_event_id eventid, u64 *val) 335 { 336 u64 msr_val; 337 int ret; 338 339 ret = __cntr_id_read(cntr_id, &msr_val); 340 if (ret) 341 return ret; 342 343 *val = get_corrected_val(r, d, rmid, eventid, msr_val); 344 345 return 0; 346 } 347 348 /* 349 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 350 * which indicates that RMIDs are configured in legacy mode. 351 * This mode is incompatible with Linux resctrl semantics 352 * as RMIDs are partitioned between SNC nodes, which requires 353 * a user to know which RMID is allocated to a task. 354 * Clearing bit 0 reconfigures the RMID counters for use 355 * in RMID sharing mode. This mode is better for Linux. 356 * The RMID space is divided between all SNC nodes with the 357 * RMIDs renumbered to start from zero in each node when 358 * counting operations from tasks. Code to read the counters 359 * must adjust RMID counter numbers based on SNC node. See 360 * logical_rmid_to_physical_rmid() for code that does this. 361 */ 362 void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) 363 { 364 if (snc_nodes_per_l3_cache > 1) 365 msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); 366 } 367 368 /* CPU models that support SNC and MSR_RMID_SNC_CONFIG */ 369 static const struct x86_cpu_id snc_cpu_ids[] __initconst = { 370 X86_MATCH_VFM(INTEL_ICELAKE_X, 0), 371 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), 372 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), 373 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), 374 X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), 375 X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0), 376 {} 377 }; 378 379 static __init int snc_get_config(void) 380 { 381 int ret = topology_num_nodes_per_package(); 382 383 if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) { 384 pr_warn("CoD enabled system? Resctrl not supported\n"); 385 return 1; 386 } 387 388 /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ 389 switch (ret) { 390 case 1: 391 break; 392 case 2 ... 4: 393 case 6: 394 pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); 395 rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; 396 break; 397 default: 398 pr_warn("Ignore improbable SNC node count %d\n", ret); 399 ret = 1; 400 break; 401 } 402 403 return ret; 404 } 405 406 int __init rdt_get_l3_mon_config(struct rdt_resource *r) 407 { 408 unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; 409 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 410 unsigned int threshold; 411 u32 eax, ebx, ecx, edx; 412 413 snc_nodes_per_l3_cache = snc_get_config(); 414 415 resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; 416 hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; 417 r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; 418 hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; 419 420 if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) 421 hw_res->mbm_width += mbm_offset; 422 else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) 423 pr_warn("Ignoring impossible MBM counter offset\n"); 424 425 /* 426 * A reasonable upper limit on the max threshold is the number 427 * of lines tagged per RMID if all RMIDs have the same number of 428 * lines tagged in the LLC. 429 * 430 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. 431 */ 432 threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; 433 434 /* 435 * Because num_rmid may not be a power of two, round the value 436 * to the nearest multiple of hw_res->mon_scale so it matches a 437 * value the hardware will measure. mon_scale may not be a power of 2. 438 */ 439 resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); 440 441 if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { 442 /* Detect list of bandwidth sources that can be tracked */ 443 cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); 444 r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; 445 } 446 447 /* 448 * resctrl assumes a system that supports assignable counters can 449 * switch to "default" mode. Ensure that there is a "default" mode 450 * to switch to. This enforces a dependency between the independent 451 * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL 452 * hardware features. 453 */ 454 if (rdt_cpu_has(X86_FEATURE_ABMC) && 455 (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) || 456 rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) { 457 r->mon.mbm_cntr_assignable = true; 458 cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); 459 r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; 460 hw_res->mbm_cntr_assign_enabled = true; 461 } 462 463 r->mon_capable = true; 464 465 return 0; 466 } 467 468 void __init intel_rdt_mbm_apply_quirk(void) 469 { 470 int cf_index; 471 472 cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; 473 if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { 474 pr_info("No MBM correction factor available\n"); 475 return; 476 } 477 478 mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; 479 mbm_cf = mbm_cf_table[cf_index].cf; 480 } 481 482 static void resctrl_abmc_set_one_amd(void *arg) 483 { 484 bool *enable = arg; 485 486 if (*enable) 487 msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); 488 else 489 msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); 490 } 491 492 /* 493 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs 494 * associated with all monitor domains. 495 */ 496 static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) 497 { 498 struct rdt_l3_mon_domain *d; 499 500 lockdep_assert_cpus_held(); 501 502 list_for_each_entry(d, &r->mon_domains, hdr.list) { 503 on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, 504 &enable, 1); 505 resctrl_arch_reset_rmid_all(r, d); 506 } 507 } 508 509 int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) 510 { 511 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 512 513 if (r->mon.mbm_cntr_assignable && 514 hw_res->mbm_cntr_assign_enabled != enable) { 515 _resctrl_abmc_enable(r, enable); 516 hw_res->mbm_cntr_assign_enabled = enable; 517 } 518 519 return 0; 520 } 521 522 bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) 523 { 524 return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; 525 } 526 527 static void resctrl_abmc_config_one_amd(void *info) 528 { 529 union l3_qos_abmc_cfg *abmc_cfg = info; 530 531 wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); 532 } 533 534 /* 535 * Send an IPI to the domain to assign the counter to RMID, event pair. 536 */ 537 void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 538 enum resctrl_event_id evtid, u32 rmid, u32 closid, 539 u32 cntr_id, bool assign) 540 { 541 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 542 union l3_qos_abmc_cfg abmc_cfg = { 0 }; 543 struct arch_mbm_state *am; 544 545 abmc_cfg.split.cfg_en = 1; 546 abmc_cfg.split.cntr_en = assign ? 1 : 0; 547 abmc_cfg.split.cntr_id = cntr_id; 548 abmc_cfg.split.bw_src = rmid; 549 if (assign) 550 abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); 551 552 smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); 553 554 /* 555 * The hardware counter is reset (because cfg_en == 1) so there is no 556 * need to record initial non-zero counts. 557 */ 558 am = get_arch_mbm_state(hw_dom, rmid, evtid); 559 if (am) 560 memset(am, 0, sizeof(*am)); 561 } 562 563 void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) 564 { 565 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 566 567 resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); 568 } 569