1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Resource Director Technology(RDT) 4 * - Monitoring code 5 * 6 * Copyright (C) 2017 Intel Corporation 7 * 8 * Author: 9 * Vikas Shivappa <vikas.shivappa@intel.com> 10 * 11 * This replaces the cqm.c based on perf but we reuse a lot of 12 * code and datastructures originally from Peter Zijlstra and Matt Fleming. 13 * 14 * More information about RDT be found in the Intel (R) x86 Architecture 15 * Software Developer Manual June 2016, volume 3, section 17.17. 16 */ 17 18 #define pr_fmt(fmt) "resctrl: " fmt 19 20 #include <linux/cpu.h> 21 #include <linux/resctrl.h> 22 23 #include <asm/cpu_device_id.h> 24 #include <asm/cpuid/api.h> 25 #include <asm/msr.h> 26 27 #include "internal.h" 28 29 /* 30 * Global boolean for rdt_monitor which is true if any 31 * resource monitoring is enabled. 32 */ 33 bool rdt_mon_capable; 34 35 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) 36 37 static int snc_nodes_per_l3_cache = 1; 38 39 /* 40 * The correction factor table is documented in Documentation/filesystems/resctrl.rst. 41 * If rmid > rmid threshold, MBM total and local values should be multiplied 42 * by the correction factor. 43 * 44 * The original table is modified for better code: 45 * 46 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction 47 * for the case. 48 * 2. MBM total and local correction table indexed by core counter which is 49 * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. 50 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster 51 * to calculate corrected value by shifting: 52 * corrected_value = (original_value * correction_factor) >> 20 53 */ 54 static const struct mbm_correction_factor_table { 55 u32 rmidthreshold; 56 u64 cf; 57 } mbm_cf_table[] __initconst = { 58 {7, CF(1.000000)}, 59 {15, CF(1.000000)}, 60 {15, CF(0.969650)}, 61 {31, CF(1.000000)}, 62 {31, CF(1.066667)}, 63 {31, CF(0.969650)}, 64 {47, CF(1.142857)}, 65 {63, CF(1.000000)}, 66 {63, CF(1.185115)}, 67 {63, CF(1.066553)}, 68 {79, CF(1.454545)}, 69 {95, CF(1.000000)}, 70 {95, CF(1.230769)}, 71 {95, CF(1.142857)}, 72 {95, CF(1.066667)}, 73 {127, CF(1.000000)}, 74 {127, CF(1.254863)}, 75 {127, CF(1.185255)}, 76 {151, CF(1.000000)}, 77 {127, CF(1.066667)}, 78 {167, CF(1.000000)}, 79 {159, CF(1.454334)}, 80 {183, CF(1.000000)}, 81 {127, CF(0.969744)}, 82 {191, CF(1.280246)}, 83 {191, CF(1.230921)}, 84 {215, CF(1.000000)}, 85 {191, CF(1.143118)}, 86 }; 87 88 static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; 89 90 static u64 mbm_cf __read_mostly; 91 92 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) 93 { 94 /* Correct MBM value. */ 95 if (rmid > mbm_cf_rmidthreshold) 96 val = (val * mbm_cf) >> 20; 97 98 return val; 99 } 100 101 /* 102 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by 103 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is 104 * needed. The physical RMID is the same as the logical RMID. 105 * 106 * On a platform with SNC mode enabled, Linux enables RMID sharing mode 107 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel 108 * Resource Director Technology Architecture Specification" for a full 109 * description of RMID sharing mode). 110 * 111 * In RMID sharing mode there are fewer "logical RMID" values available 112 * to accumulate data ("physical RMIDs" are divided evenly between SNC 113 * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for 114 * each SNC node. 115 * 116 * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 117 * 118 * Data is collected independently on each SNC node and can be retrieved 119 * using the "physical RMID" value computed by this function and loaded 120 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. 121 * 122 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 123 * cache. So a "physical RMID" may be read from any CPU that shares 124 * the L3 cache with the desired SNC node, not just from a CPU in 125 * the specific SNC node. 126 */ 127 static int logical_rmid_to_physical_rmid(int cpu, int lrmid) 128 { 129 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 130 131 if (snc_nodes_per_l3_cache == 1) 132 return lrmid; 133 134 return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; 135 } 136 137 static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) 138 { 139 u64 msr_val; 140 141 /* 142 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured 143 * with a valid event code for supported resource type and the bits 144 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, 145 * IA32_QM_CTR.data (bits 61:0) reports the monitored data. 146 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) 147 * are error bits. 148 */ 149 wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); 150 rdmsrq(MSR_IA32_QM_CTR, msr_val); 151 152 if (msr_val & RMID_VAL_ERROR) 153 return -EIO; 154 if (msr_val & RMID_VAL_UNAVAIL) 155 return -EINVAL; 156 157 *val = msr_val; 158 return 0; 159 } 160 161 static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, 162 u32 rmid, 163 enum resctrl_event_id eventid) 164 { 165 struct arch_mbm_state *state; 166 167 if (!resctrl_is_mbm_event(eventid)) 168 return NULL; 169 170 state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; 171 172 return state ? &state[rmid] : NULL; 173 } 174 175 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 176 u32 unused, u32 rmid, 177 enum resctrl_event_id eventid) 178 { 179 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 180 int cpu = cpumask_any(&d->hdr.cpu_mask); 181 struct arch_mbm_state *am; 182 u32 prmid; 183 184 am = get_arch_mbm_state(hw_dom, rmid, eventid); 185 if (am) { 186 memset(am, 0, sizeof(*am)); 187 188 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 189 /* Record any initial, non-zero count value. */ 190 __rmid_read_phys(prmid, eventid, &am->prev_msr); 191 } 192 } 193 194 /* 195 * Assumes that hardware counters are also reset and thus that there is 196 * no need to record initial non-zero counts. 197 */ 198 void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) 199 { 200 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 201 enum resctrl_event_id eventid; 202 int idx; 203 204 for_each_mbm_event_id(eventid) { 205 if (!resctrl_is_mon_event_enabled(eventid)) 206 continue; 207 idx = MBM_STATE_IDX(eventid); 208 memset(hw_dom->arch_mbm_states[idx], 0, 209 sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); 210 } 211 } 212 213 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) 214 { 215 u64 shift = 64 - width, chunks; 216 217 chunks = (cur_msr << shift) - (prev_msr << shift); 218 return chunks >> shift; 219 } 220 221 static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 222 u32 rmid, enum resctrl_event_id eventid, u64 msr_val) 223 { 224 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 225 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 226 struct arch_mbm_state *am; 227 u64 chunks; 228 229 am = get_arch_mbm_state(hw_dom, rmid, eventid); 230 if (am) { 231 am->chunks += mbm_overflow_count(am->prev_msr, msr_val, 232 hw_res->mbm_width); 233 chunks = get_corrected_mbm_count(rmid, am->chunks); 234 am->prev_msr = msr_val; 235 } else { 236 chunks = msr_val; 237 } 238 239 return chunks * hw_res->mon_scale; 240 } 241 242 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, 243 u32 unused, u32 rmid, enum resctrl_event_id eventid, 244 void *arch_priv, u64 *val, void *ignored) 245 { 246 struct rdt_hw_l3_mon_domain *hw_dom; 247 struct rdt_l3_mon_domain *d; 248 struct arch_mbm_state *am; 249 u64 msr_val; 250 u32 prmid; 251 int cpu; 252 int ret; 253 254 resctrl_arch_rmid_read_context_check(); 255 256 if (r->rid == RDT_RESOURCE_PERF_PKG) 257 return intel_aet_read_event(hdr->id, rmid, arch_priv, val); 258 259 if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) 260 return -EINVAL; 261 262 d = container_of(hdr, struct rdt_l3_mon_domain, hdr); 263 hw_dom = resctrl_to_arch_mon_dom(d); 264 cpu = cpumask_any(&hdr->cpu_mask); 265 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 266 ret = __rmid_read_phys(prmid, eventid, &msr_val); 267 268 if (!ret) { 269 *val = get_corrected_val(r, d, rmid, eventid, msr_val); 270 } else if (ret == -EINVAL) { 271 am = get_arch_mbm_state(hw_dom, rmid, eventid); 272 if (am) 273 am->prev_msr = 0; 274 } 275 276 return ret; 277 } 278 279 static int __cntr_id_read(u32 cntr_id, u64 *val) 280 { 281 u64 msr_val; 282 283 /* 284 * QM_EVTSEL Register definition: 285 * ======================================================= 286 * Bits Mnemonic Description 287 * ======================================================= 288 * 63:44 -- Reserved 289 * 43:32 RMID RMID or counter ID in ABMC mode 290 * when reading an MBM event 291 * 31 ExtendedEvtID Extended Event Identifier 292 * 30:8 -- Reserved 293 * 7:0 EvtID Event Identifier 294 * ======================================================= 295 * The contents of a specific counter can be read by setting the 296 * following fields in QM_EVTSEL.ExtendedEvtID(=1) and 297 * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID 298 * to the desired counter ID. Reading the QM_CTR then returns the 299 * contents of the specified counter. The RMID_VAL_ERROR bit is set 300 * if the counter configuration is invalid, or if an invalid counter 301 * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit 302 * is set if the counter data is unavailable. 303 */ 304 wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); 305 rdmsrq(MSR_IA32_QM_CTR, msr_val); 306 307 if (msr_val & RMID_VAL_ERROR) 308 return -EIO; 309 if (msr_val & RMID_VAL_UNAVAIL) 310 return -EINVAL; 311 312 *val = msr_val; 313 return 0; 314 } 315 316 void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 317 u32 unused, u32 rmid, int cntr_id, 318 enum resctrl_event_id eventid) 319 { 320 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 321 struct arch_mbm_state *am; 322 323 am = get_arch_mbm_state(hw_dom, rmid, eventid); 324 if (am) { 325 memset(am, 0, sizeof(*am)); 326 327 /* Record any initial, non-zero count value. */ 328 __cntr_id_read(cntr_id, &am->prev_msr); 329 } 330 } 331 332 int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 333 u32 unused, u32 rmid, int cntr_id, 334 enum resctrl_event_id eventid, u64 *val) 335 { 336 u64 msr_val; 337 int ret; 338 339 ret = __cntr_id_read(cntr_id, &msr_val); 340 if (ret) 341 return ret; 342 343 *val = get_corrected_val(r, d, rmid, eventid, msr_val); 344 345 return 0; 346 } 347 348 /* 349 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 350 * which indicates that RMIDs are configured in legacy mode. 351 * This mode is incompatible with Linux resctrl semantics 352 * as RMIDs are partitioned between SNC nodes, which requires 353 * a user to know which RMID is allocated to a task. 354 * Clearing bit 0 reconfigures the RMID counters for use 355 * in RMID sharing mode. This mode is better for Linux. 356 * The RMID space is divided between all SNC nodes with the 357 * RMIDs renumbered to start from zero in each node when 358 * counting operations from tasks. Code to read the counters 359 * must adjust RMID counter numbers based on SNC node. See 360 * logical_rmid_to_physical_rmid() for code that does this. 361 */ 362 void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) 363 { 364 if (snc_nodes_per_l3_cache > 1) 365 msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); 366 } 367 368 /* CPU models that support SNC and MSR_RMID_SNC_CONFIG */ 369 static const struct x86_cpu_id snc_cpu_ids[] __initconst = { 370 X86_MATCH_VFM(INTEL_ICELAKE_X, 0), 371 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), 372 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), 373 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), 374 X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), 375 X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0), 376 {} 377 }; 378 379 static __init int snc_get_config(void) 380 { 381 int ret; 382 383 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 384 return 1; 385 386 ret = topology_num_nodes_per_package(); 387 388 if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) { 389 pr_warn("CoD enabled system? Resctrl not supported\n"); 390 return 1; 391 } 392 393 /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ 394 switch (ret) { 395 case 1: 396 break; 397 case 2 ... 4: 398 case 6: 399 pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); 400 rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; 401 break; 402 default: 403 pr_warn("Ignore improbable SNC node count %d\n", ret); 404 ret = 1; 405 break; 406 } 407 408 return ret; 409 } 410 411 int __init rdt_get_l3_mon_config(struct rdt_resource *r) 412 { 413 unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; 414 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 415 unsigned int threshold; 416 u32 eax, ebx, ecx, edx; 417 418 snc_nodes_per_l3_cache = snc_get_config(); 419 420 resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; 421 hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; 422 r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; 423 hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; 424 425 if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) 426 hw_res->mbm_width += mbm_offset; 427 else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) 428 pr_warn("Ignoring impossible MBM counter offset\n"); 429 430 /* 431 * A reasonable upper limit on the max threshold is the number 432 * of lines tagged per RMID if all RMIDs have the same number of 433 * lines tagged in the LLC. 434 * 435 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. 436 */ 437 threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; 438 439 /* 440 * Because num_rmid may not be a power of two, round the value 441 * to the nearest multiple of hw_res->mon_scale so it matches a 442 * value the hardware will measure. mon_scale may not be a power of 2. 443 */ 444 resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); 445 446 if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { 447 /* Detect list of bandwidth sources that can be tracked */ 448 cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); 449 r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; 450 } 451 452 /* 453 * resctrl assumes a system that supports assignable counters can 454 * switch to "default" mode. Ensure that there is a "default" mode 455 * to switch to. This enforces a dependency between the independent 456 * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL 457 * hardware features. 458 */ 459 if (rdt_cpu_has(X86_FEATURE_ABMC) && 460 (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) || 461 rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) { 462 r->mon.mbm_cntr_assignable = true; 463 r->mon.mbm_cntr_configurable = true; 464 cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); 465 r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; 466 hw_res->mbm_cntr_assign_enabled = true; 467 } 468 469 r->mon_capable = true; 470 471 return 0; 472 } 473 474 void __init intel_rdt_mbm_apply_quirk(void) 475 { 476 int cf_index; 477 478 cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; 479 if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { 480 pr_info("No MBM correction factor available\n"); 481 return; 482 } 483 484 mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; 485 mbm_cf = mbm_cf_table[cf_index].cf; 486 } 487 488 static void resctrl_abmc_set_one_amd(void *arg) 489 { 490 bool *enable = arg; 491 492 if (*enable) 493 msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); 494 else 495 msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); 496 } 497 498 /* 499 * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs 500 * associated with all monitor domains. 501 */ 502 static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) 503 { 504 struct rdt_l3_mon_domain *d; 505 506 lockdep_assert_cpus_held(); 507 508 list_for_each_entry(d, &r->mon_domains, hdr.list) { 509 on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, 510 &enable, 1); 511 resctrl_arch_reset_rmid_all(r, d); 512 } 513 } 514 515 int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) 516 { 517 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 518 519 if (r->mon.mbm_cntr_assignable && 520 hw_res->mbm_cntr_assign_enabled != enable) { 521 _resctrl_abmc_enable(r, enable); 522 hw_res->mbm_cntr_assign_enabled = enable; 523 } 524 525 return 0; 526 } 527 528 bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) 529 { 530 return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; 531 } 532 533 static void resctrl_abmc_config_one_amd(void *info) 534 { 535 union l3_qos_abmc_cfg *abmc_cfg = info; 536 537 wrmsrq(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); 538 } 539 540 /* 541 * Send an IPI to the domain to assign the counter to RMID, event pair. 542 */ 543 void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, 544 enum resctrl_event_id evtid, u32 rmid, u32 closid, 545 u32 cntr_id, bool assign) 546 { 547 struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 548 union l3_qos_abmc_cfg abmc_cfg = { 0 }; 549 struct arch_mbm_state *am; 550 551 abmc_cfg.split.cfg_en = 1; 552 abmc_cfg.split.cntr_en = assign ? 1 : 0; 553 abmc_cfg.split.cntr_id = cntr_id; 554 abmc_cfg.split.bw_src = rmid; 555 if (assign) 556 abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); 557 558 smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); 559 560 /* 561 * The hardware counter is reset (because cfg_en == 1) so there is no 562 * need to record initial non-zero counts. 563 */ 564 am = get_arch_mbm_state(hw_dom, rmid, evtid); 565 if (am) 566 memset(am, 0, sizeof(*am)); 567 } 568 569 void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) 570 { 571 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 572 573 resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); 574 } 575