// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology(RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    which is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from
 *    0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

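/*
 * Worked example of the correction above (illustrative numbers only, not
 * taken from any particular platform): for a correction factor of
 * 1.142857, CF() stores 1048576 * 1.142857 + 0.5 ~= 1198372. An
 * uncorrected count of 1048576 (2^20) chunks is then corrected as
 * (1048576 * 1198372) >> 20 = 1198372, i.e. scaled by ~1.142857 using
 * only a multiply and a shift.
 */
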
/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}

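/*
 * Illustrative example (hypothetical values): with two SNC nodes per L3
 * cache and r->mon.num_rmid == 128, logical RMID 5 read from a CPU whose
 * NUMA node is the second SNC node sharing the cache maps to physical
 * RMID 5 + 1 * 128 = 133.
 */
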
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	struct arch_mbm_state *state;

	if (!resctrl_is_mbm_event(eventid))
		return NULL;

	state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];

	return state ? &state[rmid] : NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	enum resctrl_event_id eventid;
	int idx;

	for_each_mbm_event_id(eventid) {
		if (!resctrl_is_mon_event_enabled(eventid))
			continue;
		idx = MBM_STATE_IDX(eventid);
		memset(hw_dom->arch_mbm_states[idx], 0,
		       sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
	}
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

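/*
 * Example of the wrap-around handling above (illustrative values): for a
 * 24-bit counter (width == 24, so shift == 40), prev_msr == 0xfffffe and
 * cur_msr == 0x000001 give ((0x000001 << 40) - (0xfffffe << 40)) >> 40 == 3,
 * i.e. three new chunks even though the hardware counter wrapped.
 */
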
static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct arch_mbm_state *am;
	u64 chunks;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	return chunks * hw_res->mon_scale;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);

	if (!ret) {
		*val = get_corrected_val(r, d, rmid, eventid, msr_val);
	} else if (ret == -EINVAL) {
		am = get_arch_mbm_state(hw_dom, rmid, eventid);
		if (am)
			am->prev_msr = 0;
	}

	return ret;
}

static int __cntr_id_read(u32 cntr_id, u64 *val)
{
	u64 msr_val;

	/*
	 * QM_EVTSEL Register definition:
	 * =======================================================
	 * Bits    Mnemonic        Description
	 * =======================================================
	 * 63:44   --              Reserved
	 * 43:32   RMID            RMID or counter ID in ABMC mode
	 *                         when reading an MBM event
	 * 31      ExtendedEvtID   Extended Event Identifier
	 * 30:8    --              Reserved
	 * 7:0     EvtID           Event Identifier
	 * =======================================================
	 * The contents of a specific counter can be read by setting the
	 * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
	 * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
	 * to the desired counter ID. Reading the QM_CTR then returns the
	 * contents of the specified counter. The RMID_VAL_ERROR bit is set
	 * if the counter configuration is invalid, or if an invalid counter
	 * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
	 * is set if the counter data is unavailable.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid, int cntr_id,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		/* Record any initial, non-zero count value. */
		__cntr_id_read(cntr_id, &am->prev_msr);
	}
}

int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, int cntr_id,
			   enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;
	int ret;

	ret = __cntr_id_read(cntr_id, &msr_val);
	if (ret)
		return ret;

	*val = get_corrected_val(r, d, rmid, eventid, msr_val);

	return 0;
}

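/*
 * Illustrative QM_EVTSEL encoding for the ABMC counter read above,
 * assuming ABMC_EXTENDED_EVT_ID is bit 31 and ABMC_EVT_ID is the
 * L3CacheABMC event code (1): reading counter ID 3 writes 0x80000001
 * to the low half and 0x3 to the high half of QM_EVTSEL, i.e. the
 * 64-bit value 0x0000000380000001.
 */
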
/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}

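/*
 * Example of the ratio check above (hypothetical topology): if 96 CPUs
 * share the L3 cache with CPU0 but only 48 of them are in CPU0's NUMA
 * node, cpus_per_l3 / cpus_per_node == 2 and SNC mode with two nodes
 * per L3 cache is reported.
 */
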
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;
	u32 eax, ebx, ecx, edx;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	if (rdt_cpu_has(X86_FEATURE_ABMC)) {
		r->mon.mbm_cntr_assignable = true;
		cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
		r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
		hw_res->mbm_cntr_assign_enabled = true;
	}

	r->mon_capable = true;

	return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}

static void resctrl_abmc_set_one_amd(void *arg)
{
	bool *enable = arg;

	if (*enable)
		msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
	else
		msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
}

/*
 * ABMC enable/disable requires update of the L3_QOS_EXT_CFG MSR on all
 * CPUs associated with all monitor domains.
 */
static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
{
	struct rdt_mon_domain *d;

	lockdep_assert_cpus_held();

	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
				 &enable, 1);
		resctrl_arch_reset_rmid_all(r, d);
	}
}

int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

	if (r->mon.mbm_cntr_assignable &&
	    hw_res->mbm_cntr_assign_enabled != enable) {
		_resctrl_abmc_enable(r, enable);
		hw_res->mbm_cntr_assign_enabled = enable;
	}

	return 0;
}

bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
{
	return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
}

static void resctrl_abmc_config_one_amd(void *info)
{
	union l3_qos_abmc_cfg *abmc_cfg = info;

	wrmsrq(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
}

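/*
 * Illustrative assignment built by resctrl_arch_config_cntr() below
 * (hypothetical values): to have counter 3 track an MBM event for
 * RMID 10, abmc_cfg is set up with cfg_en = 1, cntr_en = 1, cntr_id = 3,
 * bw_src = 10 and bw_type set to the event's configured bandwidth-source
 * mask, then written to MSR_IA32_L3_QOS_ABMC_CFG on a CPU in the target
 * domain.
 */
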
/*
 * Send an IPI to the domain to assign the counter to the RMID, event pair.
 */
void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
			      u32 cntr_id, bool assign)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	union l3_qos_abmc_cfg abmc_cfg = { 0 };
	struct arch_mbm_state *am;

	abmc_cfg.split.cfg_en = 1;
	abmc_cfg.split.cntr_en = assign ? 1 : 0;
	abmc_cfg.split.cntr_id = cntr_id;
	abmc_cfg.split.bw_src = rmid;
	if (assign)
		abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);

	smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);

	/*
	 * The hardware counter is reset (because cfg_en == 1) so there is no
	 * need to record initial non-zero counts.
	 */
	am = get_arch_mbm_state(hw_dom, rmid, evtid);
	if (am)
		memset(am, 0, sizeof(*am));
}

void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);

	resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
}