// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology(RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>
#include <asm/resctrl.h>

#include "internal.h"
#include "trace.h"

/**
 * struct rmid_entry - dirty tracking for all RMID.
 * @closid:	The CLOSID for this entry.
 * @rmid:	The RMID for this entry.
 * @busy:	The number of domains with cached data using this RMID.
 * @list:	Member of the rmid_free_lru list when busy == 0.
 *
 * Depending on the architecture the correct monitor is accessed using
 * both @closid and @rmid, or @rmid only.
 *
 * Take the rdtgroup_mutex when accessing.
 */
struct rmid_entry {
	u32			closid;
	u32			rmid;
	int			busy;
	struct list_head	list;
};

/*
 * @rmid_free_lru - A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/*
 * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
 *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
 *     Indexed by CLOSID. Protected by rdtgroup_mutex.
 */
static u32 *closid_num_dirty_rmid;

/*
 * @rmid_limbo_count - count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > resctrl_rmid_realloc_threshold. User can
 *     change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * @rmid_entry - The entry in the limbo and free lists.
 */
static struct rmid_entry	*rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy in bytes at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int resctrl_rmid_realloc_threshold;

/*
 * This is the maximum value for the reallocation threshold, in bytes.
 */
unsigned int resctrl_rmid_realloc_limit;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
 *    for the case.
 * 2. MBM total and local correction table indexed by core counter which is
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}
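
/*
 * Illustrative example of the 2^20 normalisation used above (values are made
 * up, not taken from the correction factor table): with a correction factor
 * of 1.25, CF(1.25) == 1310720. A raw MBM count of 4096 chunks is then
 * corrected as (4096 * 1310720) >> 20 == 5120, i.e. 4096 * 1.25, using only
 * an integer multiply and a shift.
 */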

/*
 * x86 and arm64 differ in their handling of monitoring.
 * x86's RMID are independent numbers, there is only one source of traffic
 * with an RMID value of '1'.
 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
 * value is no longer unique.
 * To account for this, resctrl uses an index. On x86 this is just the RMID,
 * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
 *
 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
 * must accept an attempt to read every index.
 */
static inline struct rmid_entry *__rmid_entry(u32 idx)
{
	struct rmid_entry *entry;
	u32 closid, rmid;

	entry = &rmid_ptrs[idx];
	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);

	WARN_ON_ONCE(entry->closid != closid);
	WARN_ON_ONCE(entry->rmid != rmid);

	return entry;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
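
/*
 * For illustration (the numbers are hypothetical, not from any datasheet):
 * with snc_nodes_per_l3_cache == 2 and r->num_rmid == 128, a CPU in the
 * second SNC node of its L3 cache (cpu_to_node(cpu) % 2 == 1) reading
 * logical RMID 5 uses physical RMID 5 + 1 * 128 == 133, while a CPU in the
 * first SNC node uses physical RMID 5 directly.
 */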

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	}

	/* Never expect to get here */
	WARN_ON_ONCE(1);

	return NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}
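
/*
 * Sketch of how the shift above handles a counter wrap (the 24 bit width is
 * only an example, the real width comes from hw_res->mbm_width): with
 * width == 24, shift == 40. If prev_msr == 0xfffffe and cur_msr == 0x000003,
 * the subtraction of the left-shifted values wraps modulo 2^64 and
 * (chunks >> shift) yields 5, the number of chunks counted across the wrap.
 */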

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}
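
/*
 * Example of the final scaling above, using a hypothetical upscaling factor
 * (mon_scale is set from boot_cpu_data.x86_cache_occ_scale in
 * rdt_get_mon_l3_config() below): if hw_res->mon_scale == 65536 bytes per
 * chunk, an accumulated, corrected count of 1600 chunks is reported to the
 * caller as 1600 * 65536 == 104857600 bytes (100 MiB).
 */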

static void limbo_release_entry(struct rmid_entry *entry)
{
	lockdep_assert_held(&rdtgroup_mutex);

	rmid_limbo_count--;
	list_add_tail(&entry->list, &rmid_free_lru);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]--;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold, clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	struct rmid_entry *entry;
	u32 idx, cur_idx = 1;
	void *arch_mon_ctx;
	bool rmid_dirty;
	u64 val = 0;

	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
	if (IS_ERR(arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(arch_mon_ctx));
		return;
	}

	/*
	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
	 * are marked as busy for occupancy < threshold. If the occupancy
	 * is less than the threshold, decrement the busy counter of the
	 * RMID and move it to the free list when the counter reaches 0.
	 */
	for (;;) {
		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
		if (idx >= idx_limit)
			break;

		entry = __rmid_entry(idx);
		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
					   QOS_L3_OCCUP_EVENT_ID, &val,
					   arch_mon_ctx)) {
			rmid_dirty = true;
		} else {
			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);

			/*
			 * x86's CLOSID and RMID are independent numbers, so the entry's
			 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
			 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
			 * used to select the configuration. It is thus necessary to track both
			 * CLOSID and RMID because there may be dependencies between them
			 * on some architectures.
			 */
			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
		}

		if (force_free || !rmid_dirty) {
			clear_bit(idx, d->rmid_busy_llc);
			if (!--entry->busy)
				limbo_release_entry(entry);
		}
		cur_idx = idx + 1;
	}

	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}

bool has_busy_rmid(struct rdt_mon_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();

	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
}

static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
{
	struct rmid_entry *itr;
	u32 itr_idx, cmp_idx;

	if (list_empty(&rmid_free_lru))
		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);

	list_for_each_entry(itr, &rmid_free_lru, list) {
		/*
		 * Get the index of this free RMID, and the index it would need
		 * to be if it were used with this CLOSID.
		 * If the CLOSID is irrelevant on this architecture, the two
		 * index values are always the same on every entry and thus the
		 * very first entry will be returned.
		 */
		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);

		if (itr_idx == cmp_idx)
			return itr;
	}

	return ERR_PTR(-ENOSPC);
}
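
/*
 * A sketch of why the index comparison above works (the encodings are
 * illustrative, the real ones live in the architecture code): on x86 the
 * index is simply the RMID, so itr_idx == cmp_idx for every entry and the
 * head of the LRU list is returned. On an MPAM-like scheme the index might
 * be closid * num_rmid + rmid, so only entries whose CLOSID matches the
 * requested one compare equal and can be handed out.
 */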

/**
 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
 *                                  RMID are clean, or the CLOSID that has
 *                                  the most clean RMID.
 *
 * MPAM's equivalents of RMID are per-CLOSID, meaning a freshly allocated CLOSID
 * may not be able to allocate clean RMID. To avoid this the allocator will
 * choose the CLOSID with the most clean RMID.
 *
 * When the CLOSID and RMID are independent numbers, the first free CLOSID will
 * be returned.
 */
int resctrl_find_cleanest_closid(void)
{
	u32 cleanest_closid = ~0;
	int i = 0;

	lockdep_assert_held(&rdtgroup_mutex);

	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		return -EIO;

	for (i = 0; i < closids_supported(); i++) {
		int num_dirty;

		if (closid_allocated(i))
			continue;

		num_dirty = closid_num_dirty_rmid[i];
		if (num_dirty == 0)
			return i;

		if (cleanest_closid == ~0)
			cleanest_closid = i;

		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
			cleanest_closid = i;
	}

	if (cleanest_closid == ~0)
		return -ENOSPC;

	return cleanest_closid;
}

/*
 * For MPAM the RMID value is not unique, and has to be considered with
 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
 * allows all domains to be managed by a single free list.
 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
 */
int alloc_rmid(u32 closid)
{
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	entry = resctrl_find_free_rmid(closid);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	list_del(&entry->list);
	return entry->rmid;
}

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	struct rdt_mon_domain *d;
	u32 idx;

	lockdep_assert_held(&rdtgroup_mutex);

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);

	entry->busy = 0;
	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		/*
		 * For the first limbo RMID in the domain,
		 * set up the limbo worker.
		 */
		if (!has_busy_rmid(d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
						RESCTRL_PICK_ANY_CPU);
		set_bit(idx, d->rmid_busy_llc);
		entry->busy++;
	}

	rmid_limbo_count++;
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]++;
}
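
/*
 * Summary of the RMID lifecycle implemented by the helpers above and by
 * free_rmid() below: alloc_rmid() hands out an entry from rmid_free_lru;
 * when the group is removed, free_rmid() either returns the entry straight
 * to the free list, or (if llc_occupancy is enabled) parks it in limbo via
 * add_rmid_to_limbo() until __check_limbo() observes its occupancy drop
 * below resctrl_rmid_realloc_threshold in every domain.
 */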

void free_rmid(u32 closid, u32 rmid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	/*
	 * Do not allow the default rmid to be freed. Comparing by index
	 * allows architectures that ignore the closid parameter to avoid an
	 * unnecessary check.
	 */
	if (!resctrl_arch_mon_capable() ||
	    idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
						RESCTRL_RESERVED_RMID))
		return;

	entry = __rmid_entry(idx);

	if (resctrl_arch_is_llc_occupancy_enabled())
		add_rmid_to_limbo(entry);
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}

static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
				       u32 rmid, enum resctrl_event_id evtid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);

	switch (evtid) {
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &d->mbm_total[idx];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &d->mbm_local[idx];
	default:
		return NULL;
	}
}

static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	int cpu = smp_processor_id();
	struct rdt_mon_domain *d;
	struct mbm_state *m;
	int err, ret;
	u64 tval = 0;

	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
		if (m)
			memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}

	if (rr->d) {
		/* Reading a single domain, must be on a CPU in that domain. */
		if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
			return -EINVAL;
		rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
						 rr->evtid, &tval, rr->arch_mon_ctx);
		if (rr->err)
			return rr->err;

		rr->val += tval;

		return 0;
	}

	/* Summing domains that share a cache, must be on a CPU for that cache. */
	if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
		return -EINVAL;

	/*
	 * Legacy files must report the sum of an event across all
	 * domains that share the same L3 cache instance.
	 * Report success if a read from any domain succeeds, -EINVAL
	 * (translated to "Unavailable" for user space) if reading from
	 * all domains fails for any reason.
	 */
	ret = -EINVAL;
	list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
		if (d->ci->id != rr->ci->id)
			continue;
		err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
					     rr->evtid, &tval, rr->arch_mon_ctx);
		if (!err) {
			rr->val += tval;
			ret = 0;
		}
	}

	if (ret)
		rr->err = ret;

	return ret;
}

/*
 * mbm_bw_count() - Update bw count from values previously read by
 *		    __mon_event_count().
 * @closid:	The closid used to identify the cached mbm_state.
 * @rmid:	The rmid used to identify the cached mbm_state.
 * @rr:		The struct rmid_read populated by __mon_event_count().
 *
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The chunks value previously read by
 * __mon_event_count() is compared with the chunks value from the previous
 * invocation. This must be called once per second to maintain values in MBps.
 */
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	u64 cur_bw, bytes, cur_bytes;
	struct mbm_state *m;

	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
	if (WARN_ON_ONCE(!m))
		return;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}
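
/*
 * Worked example for the MBps conversion above (traffic figures invented
 * for illustration): if the cumulative byte count read one second ago was
 * 1073741824 and the current read returns 1283457024, then
 * bytes == 209715200 and cur_bw == 209715200 / SZ_1M == 200 MBps, because
 * mbm_bw_count() is called once per second from the overflow handler.
 */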

/*
 * This is scheduled by mon_event_read() to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
	struct rdtgroup *rdtgrp, *entry;
	struct rmid_read *rr = info;
	struct list_head *head;
	int ret;

	rdtgrp = rr->rgrp;

	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);

	/*
	 * For Ctrl groups read data from child monitor groups and
	 * add them together. Count events which are read successfully.
	 * Discard the rmid_read's reporting errors.
	 */
	head = &rdtgrp->mon.crdtgrp_list;

	if (rdtgrp->type == RDTCTRL_GROUP) {
		list_for_each_entry(entry, head, mon.crdtgrp_list) {
			if (__mon_event_count(entry->closid, entry->mon.rmid,
					      rr) == 0)
				ret = 0;
		}
	}

	/*
	 * __mon_event_count() calls for newly created monitor groups may
	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
	 * Discard error if any of the monitor event reads succeeded.
	 */
	if (ret == 0)
		rr->err = 0;
}

static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
							struct rdt_resource *r)
{
	struct rdt_ctrl_domain *d;

	lockdep_assert_cpus_held();

	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
		/* Find the domain that contains this CPU */
		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
			return d;
	}

	return NULL;
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth whereas MBM measures the
 * L3 external bandwidth the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in initial
 * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
 * after some time rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values. To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_ctrl_domain *dom_mba;
	enum resctrl_event_id evt_id;
	struct rdt_resource *r_mba;
	struct list_head *head;
	struct rdtgroup *entry;
	u32 cur_bw, user_bw;

	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
	evt_id = rgrp->mba_mbps_event;

	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
	if (WARN_ON_ONCE(!pmbm_data))
		return;

	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}

	cur_bw = pmbm_data->prev_bw;
	user_bw = dom_mba->mbps_val[closid];

	/* MBA resource doesn't support CDP */
	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
		if (WARN_ON_ONCE(!cmbm_data))
			return;
		cur_bw += cmbm_data->prev_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group. The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 * Always increase throttling if current bandwidth is above the
	 * target set by user.
	 * But avoid thrashing up and down on every poll by checking
	 * whether a decrease in throttling is likely to push the group
	 * back over target. E.g. if currently throttling to 30% of bandwidth
	 * on a system with 10% granularity steps, check whether moving to
	 * 40% would go past the limit by multiplying current bandwidth by
	 * "(30 + 10) / 30".
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		return;
	}

	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}
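
/*
 * Putting numbers to the "(30 + 10) / 30" example in the comment above
 * (all values invented for illustration): with cur_msr_val == 30,
 * min_bw == 10, bw_gran == 10, cur_bw == 600 MBps and user_bw == 900 MBps,
 * the projected bandwidth after easing throttling is 600 * (30 + 10) / 30
 * == 800 MBps. That is still below user_bw, so new_msr_val becomes 40.
 * Had user_bw been 700 MBps, neither branch would apply and the throttle
 * MSR would be left unchanged.
 */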

static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
				 u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
	struct rmid_read rr = {0};

	rr.r = r;
	rr.d = d;
	rr.evtid = evtid;
	rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
	if (IS_ERR(rr.arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(rr.arch_mon_ctx));
		return;
	}

	__mon_event_count(closid, rmid, &rr);

	/*
	 * If the software controller is enabled, compute the
	 * bandwidth for this event id.
	 */
	if (is_mba_sc(NULL))
		mbm_bw_count(closid, rmid, &rr);

	resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}

static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
		       u32 closid, u32 rmid)
{
	/*
	 * This is protected from concurrent reads from user as both
	 * the user and overflow handler hold the global mutex.
	 */
	if (resctrl_arch_is_mbm_total_enabled())
		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);

	if (resctrl_arch_is_mbm_local_enabled())
		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}

/*
 * Handler to scan the limbo list and move RMIDs whose occupancy has dropped
 * below the threshold occupancy back to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
	struct rdt_mon_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);

	__check_limbo(d, false);

	if (has_busy_rmid(d)) {
		d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
							   RESCTRL_PICK_ANY_CPU);
		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
					 delay);
	}

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
 *                             domain.
 * @dom:           The domain the limbo handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *                 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
			     int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->cqm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

void mbm_handle_overflow(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
	struct rdtgroup *prgrp, *crgrp;
	struct rdt_mon_domain *d;
	struct list_head *head;
	struct rdt_resource *r;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/*
	 * If the filesystem has been unmounted this work no longer needs to
	 * run.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		goto out_unlock;

	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	d = container_of(work, struct rdt_mon_domain, mbm_over.work);

	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);

		head = &prgrp->mon.crdtgrp_list;
		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);

		if (is_mba_sc(NULL))
			update_mba_bw(prgrp, d);
	}

	/*
	 * Re-check for housekeeping CPUs. This allows the overflow handler to
	 * move off a nohz_full CPU quickly.
	 */
	d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
						   RESCTRL_PICK_ANY_CPU);
	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
 *                                domain.
 * @dom:           The domain the overflow handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *                 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
				int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	/*
	 * When a domain comes online there is no guarantee the filesystem is
	 * mounted. If not, there is no need to catch counter overflow.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		return;
	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->mbm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

static int dom_data_init(struct rdt_resource *r)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	u32 num_closid = resctrl_arch_get_num_closid(r);
	struct rmid_entry *entry = NULL;
	int err = 0, i;
	u32 idx;

	mutex_lock(&rdtgroup_mutex);
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		u32 *tmp;

		/*
		 * If the architecture hasn't provided a sanitised value here,
		 * this may result in larger arrays than necessary. Resctrl will
		 * use a smaller system wide value based on the resources in
		 * use.
		 */
		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
		if (!tmp) {
			err = -ENOMEM;
			goto out_unlock;
		}

		closid_num_dirty_rmid = tmp;
	}

	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
	if (!rmid_ptrs) {
		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
			kfree(closid_num_dirty_rmid);
			closid_num_dirty_rmid = NULL;
		}
		err = -ENOMEM;
		goto out_unlock;
	}

	for (i = 0; i < idx_limit; i++) {
		entry = &rmid_ptrs[i];
		INIT_LIST_HEAD(&entry->list);

		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
		list_add_tail(&entry->list, &rmid_free_lru);
	}

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
	 * are always allocated. These are used for the rdtgroup_default
	 * control group, which will be set up later in resctrl_init().
	 */
	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
					   RESCTRL_RESERVED_RMID);
	entry = __rmid_entry(idx);
	list_del(&entry->list);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return err;
}

static void dom_data_exit(struct rdt_resource *r)
{
	mutex_lock(&rdtgroup_mutex);

	if (!r->mon_capable)
		goto out_unlock;

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		kfree(closid_num_dirty_rmid);
		closid_num_dirty_rmid = NULL;
	}

	kfree(rmid_ptrs);
	rmid_ptrs = NULL;

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
}

static struct mon_evt llc_occupancy_event = {
	.name		= "llc_occupancy",
	.evtid		= QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
	.name		= "mbm_total_bytes",
	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
	.name		= "mbm_local_bytes",
	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
 * because as per the SDM the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
	INIT_LIST_HEAD(&r->evt_list);

	if (resctrl_arch_is_llc_occupancy_enabled())
		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
	if (resctrl_arch_is_mbm_total_enabled())
		list_add_tail(&mbm_total_event.list, &r->evt_list);
	if (resctrl_arch_is_mbm_local_enabled())
		list_add_tail(&mbm_local_event.list, &r->evt_list);
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}
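
/*
 * Example of the ratio computed above (CPU counts are hypothetical): on a
 * system where 96 CPUs share CPU0's L3 cache but CPU0's NUMA node contains
 * only 48 CPUs, cpus_per_l3 / cpus_per_node == 2, so two SNC nodes per L3
 * cache are assumed and the L3 monitoring scope is narrowed to
 * RESCTRL_L3_NODE.
 */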

/**
 * resctrl_mon_resource_init() - Initialise global monitoring structures.
 *
 * Allocate and initialise global monitor resources that do not belong to a
 * specific domain, i.e. the rmid_ptrs[] used for the limbo and free lists.
 * Called once during boot after the struct rdt_resource's have been configured
 * but before the filesystem is mounted.
 * Resctrl's cpuhp callbacks may be called before this point to bring a domain
 * online.
 *
 * Returns 0 for success, or -ENOMEM.
 */
int __init resctrl_mon_resource_init(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	int ret;

	if (!r->mon_capable)
		return 0;

	ret = dom_data_init(r);
	if (ret)
		return ret;

	l3_mon_evt_init(r);

	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
		mbm_total_event.configurable = true;
		resctrl_file_fflags_init("mbm_total_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}
	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
		mbm_local_event.configurable = true;
		resctrl_file_fflags_init("mbm_local_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}

	if (resctrl_arch_is_mbm_local_enabled())
		mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
	else if (resctrl_arch_is_mbm_total_enabled())
		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;

	return 0;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}

void resctrl_mon_resource_exit(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);

	dom_data_exit(r);
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}