// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>

#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>

#include "internal.h"
#include "trace.h"

/**
 * struct rmid_entry - dirty tracking for all RMID.
 * @closid:	The CLOSID for this entry.
 * @rmid:	The RMID for this entry.
 * @busy:	The number of domains with cached data using this RMID.
 * @list:	Member of the rmid_free_lru list when busy == 0.
 *
 * Depending on the architecture the correct monitor is accessed using
 * both @closid and @rmid, or @rmid only.
 *
 * Take the rdtgroup_mutex when accessing.
 */
struct rmid_entry {
	u32			closid;
	u32			rmid;
	int			busy;
	struct list_head	list;
};

/*
 * @rmid_free_lru - A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/*
 * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
 *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
 *     Indexed by CLOSID. Protected by rdtgroup_mutex.
 */
static u32 *closid_num_dirty_rmid;

/*
 * @rmid_limbo_count - count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > resctrl_rmid_realloc_threshold. User can
 *     change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * @rmid_ptrs - The entries in the limbo and free lists.
 */
static struct rmid_entry	*rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy in bytes at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int resctrl_rmid_realloc_threshold;

/*
 * This is the maximum value for the reallocation threshold, in bytes.
 */
unsigned int resctrl_rmid_realloc_limit;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified to simplify the code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by core count, which
 *    is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}
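
/*
 * Illustrative example of the fixed-point correction: with a correction
 * factor of 1.066667, CF(1.066667) evaluates to about 1118481, so a raw
 * count of 1000000 chunks is corrected to
 * (1000000 * 1118481) >> 20 ~= 1066666 chunks, i.e. scaled by ~1.0667
 * without any floating point arithmetic at run time.
 */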

/*
 * x86 and arm64 differ in their handling of monitoring.
 * x86's RMID are independent numbers, there is only one source of traffic
 * with an RMID value of '1'.
 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
 * value is no longer unique.
 * To account for this, resctrl uses an index. On x86 this is just the RMID,
 * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
 *
 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
 * must accept an attempt to read every index.
 */
static inline struct rmid_entry *__rmid_entry(u32 idx)
{
	struct rmid_entry *entry;
	u32 closid, rmid;

	entry = &rmid_ptrs[idx];
	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);

	WARN_ON_ONCE(entry->closid != closid);
	WARN_ON_ONCE(entry->rmid != rmid);

	return entry;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache. So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
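
/*
 * Illustrative example: assume an SNC platform with two nodes per L3 cache
 * and r->num_rmid == 128 logical RMIDs per node. A CPU in the second SNC
 * node of that cache (cpu_to_node(cpu) % 2 == 1) reading logical RMID 5
 * uses physical RMID 5 + 1 * 128 = 133.
 */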

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrl(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	}

	/* Never expect to get here */
	WARN_ON_ONCE(1);

	return NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}
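
/*
 * Illustrative example: with a 24-bit counter width the shift is 40, so the
 * subtraction above is effectively performed modulo 2^24. If prev_msr is
 * 0xfffff0 and cur_msr is 0x000010 then the counter wrapped once and
 * mbm_overflow_count() returns 0x20 (32 chunks) rather than a huge bogus
 * delta.
 */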

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

static void limbo_release_entry(struct rmid_entry *entry)
{
	lockdep_assert_held(&rdtgroup_mutex);

	rmid_limbo_count--;
	list_add_tail(&entry->list, &rmid_free_lru);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]--;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	struct rmid_entry *entry;
	u32 idx, cur_idx = 1;
	void *arch_mon_ctx;
	bool rmid_dirty;
	u64 val = 0;

	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
	if (IS_ERR(arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(arch_mon_ctx));
		return;
	}

	/*
	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
	 * are marked as busy for occupancy < threshold. If the occupancy
	 * is less than the threshold decrement the busy counter of the
	 * RMID and move it to the free list when the counter reaches 0.
	 */
	for (;;) {
		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
		if (idx >= idx_limit)
			break;

		entry = __rmid_entry(idx);
		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
					   QOS_L3_OCCUP_EVENT_ID, &val,
					   arch_mon_ctx)) {
			rmid_dirty = true;
		} else {
			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);

			/*
			 * x86's CLOSID and RMID are independent numbers, so the entry's
			 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
			 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
			 * used to select the configuration. It is thus necessary to track both
			 * CLOSID and RMID because there may be dependencies between them
			 * on some architectures.
			 */
			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
		}

		if (force_free || !rmid_dirty) {
			clear_bit(idx, d->rmid_busy_llc);
			if (!--entry->busy)
				limbo_release_entry(entry);
		}
		cur_idx = idx + 1;
	}

	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}

bool has_busy_rmid(struct rdt_mon_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();

	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
}

static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
{
	struct rmid_entry *itr;
	u32 itr_idx, cmp_idx;

	if (list_empty(&rmid_free_lru))
		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);

	list_for_each_entry(itr, &rmid_free_lru, list) {
		/*
		 * Get the index of this free RMID, and the index it would need
		 * to be if it were used with this CLOSID.
		 * If the CLOSID is irrelevant on this architecture, the two
		 * index values are always the same on every entry and thus the
		 * very first entry will be returned.
		 */
		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);

		if (itr_idx == cmp_idx)
			return itr;
	}

	return ERR_PTR(-ENOSPC);
}
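
/*
 * Illustrative example: on x86 resctrl_arch_rmid_idx_encode() ignores the
 * CLOSID, so itr_idx and cmp_idx above are always equal and the head of
 * rmid_free_lru is returned. On an architecture where the index encodes
 * both values (e.g. idx = closid * num_rmid + rmid), only a free entry
 * whose stored CLOSID matches the requested one can compare equal, so the
 * loop effectively searches for a free RMID belonging to that CLOSID.
 */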

/**
 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
 *                                  RMID are clean, or the CLOSID that has
 *                                  the most clean RMID.
 *
 * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
 * may not be able to allocate clean RMID. To avoid this the allocator will
 * choose the CLOSID with the most clean RMID.
 *
 * When the CLOSID and RMID are independent numbers, the first free CLOSID will
 * be returned.
 */
int resctrl_find_cleanest_closid(void)
{
	u32 cleanest_closid = ~0;
	int i = 0;

	lockdep_assert_held(&rdtgroup_mutex);

	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		return -EIO;

	for (i = 0; i < closids_supported(); i++) {
		int num_dirty;

		if (closid_allocated(i))
			continue;

		num_dirty = closid_num_dirty_rmid[i];
		if (num_dirty == 0)
			return i;

		if (cleanest_closid == ~0)
			cleanest_closid = i;

		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
			cleanest_closid = i;
	}

	if (cleanest_closid == ~0)
		return -ENOSPC;

	return cleanest_closid;
}

/*
 * For MPAM the RMID value is not unique, and has to be considered with
 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
 * allows all domains to be managed by a single free list.
 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
 */
int alloc_rmid(u32 closid)
{
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	entry = resctrl_find_free_rmid(closid);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	list_del(&entry->list);
	return entry->rmid;
}

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	struct rdt_mon_domain *d;
	u32 idx;

	lockdep_assert_held(&rdtgroup_mutex);

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);

	entry->busy = 0;
	list_for_each_entry(d, &r->mon_domains, hdr.list) {
		/*
		 * For the first limbo RMID in the domain,
		 * set up the limbo worker.
		 */
		if (!has_busy_rmid(d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
						RESCTRL_PICK_ANY_CPU);
		set_bit(idx, d->rmid_busy_llc);
		entry->busy++;
	}

	rmid_limbo_count++;
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]++;
}

void free_rmid(u32 closid, u32 rmid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	/*
	 * Do not allow the default rmid to be freed. Comparing by index
	 * allows architectures that ignore the closid parameter to avoid an
	 * unnecessary check.
	 */
	if (!resctrl_arch_mon_capable() ||
	    idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
						RESCTRL_RESERVED_RMID))
		return;

	entry = __rmid_entry(idx);

	if (resctrl_arch_is_llc_occupancy_enabled())
		add_rmid_to_limbo(entry);
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}

static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
				       u32 rmid, enum resctrl_event_id evtid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);

	switch (evtid) {
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &d->mbm_total[idx];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &d->mbm_local[idx];
	default:
		return NULL;
	}
}

static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	int cpu = smp_processor_id();
	struct rdt_mon_domain *d;
	struct mbm_state *m;
	int err, ret;
	u64 tval = 0;

	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
		if (m)
			memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}

	if (rr->d) {
		/* Reading a single domain, must be on a CPU in that domain. */
		if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
			return -EINVAL;
		rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
						 rr->evtid, &tval, rr->arch_mon_ctx);
		if (rr->err)
			return rr->err;

		rr->val += tval;

		return 0;
	}

	/* Summing domains that share a cache, must be on a CPU for that cache. */
	if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
		return -EINVAL;

	/*
	 * Legacy files must report the sum of an event across all
	 * domains that share the same L3 cache instance.
	 * Report success if a read from any domain succeeds, -EINVAL
	 * (translated to "Unavailable" for user space) if reads from
	 * all domains fail for any reason.
	 */
	ret = -EINVAL;
	list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
		if (d->ci->id != rr->ci->id)
			continue;
		err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
					     rr->evtid, &tval, rr->arch_mon_ctx);
		if (!err) {
			rr->val += tval;
			ret = 0;
		}
	}

	if (ret)
		rr->err = ret;

	return ret;
}

/*
 * mbm_bw_count() - Update bw count from values previously read by
 *		    __mon_event_count().
 * @closid:	The closid used to identify the cached mbm_state.
 * @rmid:	The rmid used to identify the cached mbm_state.
 * @rr:		The struct rmid_read populated by __mon_event_count().
 *
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The chunks value previously read by
 * __mon_event_count() is compared with the chunks value from the previous
 * invocation. This must be called once per second to maintain values in MBps.
 */
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	u64 cur_bw, bytes, cur_bytes;
	struct mbm_state *m;

	m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
	if (WARN_ON_ONCE(!m))
		return;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}
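
/*
 * Illustrative example: if rr->val has advanced by 524288000 bytes since the
 * previous invocation one second earlier, then bytes / SZ_1M == 500 and
 * prev_bw is recorded as 500 MBps. The plain division by SZ_1M only yields
 * MBps because the overflow handler calls this at a fixed 1s interval.
 */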

/*
 * This is scheduled by mon_event_read() to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
	struct rdtgroup *rdtgrp, *entry;
	struct rmid_read *rr = info;
	struct list_head *head;
	int ret;

	rdtgrp = rr->rgrp;

	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);

	/*
	 * For Ctrl groups read data from child monitor groups and
	 * add them together. Count events which are read successfully.
	 * Discard the rmid_read's reporting errors.
	 */
	head = &rdtgrp->mon.crdtgrp_list;

	if (rdtgrp->type == RDTCTRL_GROUP) {
		list_for_each_entry(entry, head, mon.crdtgrp_list) {
			if (__mon_event_count(entry->closid, entry->mon.rmid,
					      rr) == 0)
				ret = 0;
		}
	}

	/*
	 * __mon_event_count() calls for newly created monitor groups may
	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
	 * Discard error if any of the monitor event reads succeeded.
	 */
	if (ret == 0)
		rr->err = 0;
}

static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
							struct rdt_resource *r)
{
	struct rdt_ctrl_domain *d;

	lockdep_assert_cpus_held();

	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
		/* Find the domain that contains this CPU */
		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
			return d;
	}

	return NULL;
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *   current bandwidth (cur_bw) < user specified bandwidth (user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having a 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth whereas MBM measures the
 * L3 external bandwidth, the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
 * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
 * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values. To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_ctrl_domain *dom_mba;
	enum resctrl_event_id evt_id;
	struct rdt_resource *r_mba;
	struct list_head *head;
	struct rdtgroup *entry;
	u32 cur_bw, user_bw;

	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
	evt_id = rgrp->mba_mbps_event;

	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
	if (WARN_ON_ONCE(!pmbm_data))
		return;

	dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}

	cur_bw = pmbm_data->prev_bw;
	user_bw = dom_mba->mbps_val[closid];

	/* MBA resource doesn't support CDP */
	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
		if (WARN_ON_ONCE(!cmbm_data))
			return;
		cur_bw += cmbm_data->prev_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group. The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 * Always increase throttling if current bandwidth is above the
	 * target set by user.
	 * But avoid thrashing up and down on every poll by checking
	 * whether a decrease in throttling is likely to push the group
	 * back over target. E.g. if currently throttling to 30% of bandwidth
	 * on a system with 10% granularity steps, check whether moving to
	 * 40% would go past the limit by multiplying current bandwidth by
	 * "(30 + 10) / 30".
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		return;
	}

	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}
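
/*
 * Illustrative example of the decision above, assuming min_bw == 10 and
 * bw_gran == 10: with the group currently throttled to cur_msr_val == 30%,
 * a measured cur_bw of 900 MBps and a user_bw target of 800 MBps,
 * throttling is tightened to 20%. Throttling is only relaxed to 40% when
 * user_bw exceeds cur_bw * (30 + 10) / 30, i.e. 1200 MBps here, so the
 * controller does not oscillate around the target on every poll.
 */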

static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
				 u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
	struct rmid_read rr = {0};

	rr.r = r;
	rr.d = d;
	rr.evtid = evtid;
	rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
	if (IS_ERR(rr.arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(rr.arch_mon_ctx));
		return;
	}

	__mon_event_count(closid, rmid, &rr);

	/*
	 * If the software controller is enabled, compute the
	 * bandwidth for this event id.
	 */
	if (is_mba_sc(NULL))
		mbm_bw_count(closid, rmid, &rr);

	resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}

static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
		       u32 closid, u32 rmid)
{
	/*
	 * This is protected from concurrent reads from user as both
	 * the user and overflow handler hold the global mutex.
	 */
	if (resctrl_arch_is_mbm_total_enabled())
		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);

	if (resctrl_arch_is_mbm_local_enabled())
		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}

/*
 * Handler to scan the limbo list and move the RMIDs whose occupancy is
 * below the threshold occupancy to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
	struct rdt_mon_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);

	__check_limbo(d, false);

	if (has_busy_rmid(d)) {
		d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
							   RESCTRL_PICK_ANY_CPU);
		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
					 delay);
	}

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
 *                             domain.
 * @dom:           The domain the limbo handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *                 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
			     int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->cqm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

void mbm_handle_overflow(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
	struct rdtgroup *prgrp, *crgrp;
	struct rdt_mon_domain *d;
	struct list_head *head;
	struct rdt_resource *r;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/*
	 * If the filesystem has been unmounted this work no longer needs to
	 * run.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		goto out_unlock;

	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	d = container_of(work, struct rdt_mon_domain, mbm_over.work);

	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);

		head = &prgrp->mon.crdtgrp_list;
		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);

		if (is_mba_sc(NULL))
			update_mba_bw(prgrp, d);
	}

	/*
	 * Re-check for housekeeping CPUs. This allows the overflow handler to
	 * move off a nohz_full CPU quickly.
	 */
	d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
						   RESCTRL_PICK_ANY_CPU);
	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
 *                                domain.
 * @dom:           The domain the overflow handler should run for.
 * @delay_ms:      How far in the future the handler should run.
 * @exclude_cpu:   Which CPU the handler should not run on,
 *                 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
				int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	/*
	 * When a domain comes online there is no guarantee the filesystem is
	 * mounted. If not, there is no need to catch counter overflow.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		return;
	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
	dom->mbm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

static int dom_data_init(struct rdt_resource *r)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	u32 num_closid = resctrl_arch_get_num_closid(r);
	struct rmid_entry *entry = NULL;
	int err = 0, i;
	u32 idx;

	mutex_lock(&rdtgroup_mutex);
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		u32 *tmp;

		/*
		 * If the architecture hasn't provided a sanitised value here,
		 * this may result in larger arrays than necessary. Resctrl will
		 * use a smaller system wide value based on the resources in
		 * use.
		 */
		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
		if (!tmp) {
			err = -ENOMEM;
			goto out_unlock;
		}

		closid_num_dirty_rmid = tmp;
	}

	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
	if (!rmid_ptrs) {
		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
			kfree(closid_num_dirty_rmid);
			closid_num_dirty_rmid = NULL;
		}
		err = -ENOMEM;
		goto out_unlock;
	}

	for (i = 0; i < idx_limit; i++) {
		entry = &rmid_ptrs[i];
		INIT_LIST_HEAD(&entry->list);

		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
		list_add_tail(&entry->list, &rmid_free_lru);
	}

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
	 * are always allocated. These are used for the rdtgroup_default
	 * control group, which will be set up later in resctrl_init().
	 */
	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
					   RESCTRL_RESERVED_RMID);
	entry = __rmid_entry(idx);
	list_del(&entry->list);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return err;
}

static void dom_data_exit(struct rdt_resource *r)
{
	mutex_lock(&rdtgroup_mutex);

	if (!r->mon_capable)
		goto out_unlock;

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		kfree(closid_num_dirty_rmid);
		closid_num_dirty_rmid = NULL;
	}

	kfree(rmid_ptrs);
	rmid_ptrs = NULL;

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
}

static struct mon_evt llc_occupancy_event = {
	.name	= "llc_occupancy",
	.evtid	= QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
	.name	= "mbm_total_bytes",
	.evtid	= QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
	.name	= "mbm_local_bytes",
	.evtid	= QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
 * because as per the SDM the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
	INIT_LIST_HEAD(&r->evt_list);

	if (resctrl_arch_is_llc_occupancy_enabled())
		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
	if (resctrl_arch_is_mbm_total_enabled())
		list_add_tail(&mbm_total_event.list, &r->evt_list);
	if (resctrl_arch_is_mbm_local_enabled())
		list_add_tail(&mbm_local_event.list, &r->evt_list);
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if the system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}
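
/*
 * Illustrative example: on a part where 96 CPUs share CPU0's L3 cache but
 * only 48 of them are in CPU0's NUMA node, the ratio is 96 / 48 = 2 and
 * snc_get_config() reports two SNC nodes per L3 cache. With SNC disabled
 * the two counts match and the ratio is 1.
 */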

/**
 * resctrl_mon_resource_init() - Initialise global monitoring structures.
 *
 * Allocate and initialise global monitor resources that do not belong to a
 * specific domain, i.e. the rmid_ptrs[] used for the limbo and free lists.
 * Called once during boot after the struct rdt_resource's have been configured
 * but before the filesystem is mounted.
 * Resctrl's cpuhp callbacks may be called before this point to bring a domain
 * online.
 *
 * Returns 0 for success, or -ENOMEM.
 */
int __init resctrl_mon_resource_init(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
	int ret;

	if (!r->mon_capable)
		return 0;

	ret = dom_data_init(r);
	if (ret)
		return ret;

	l3_mon_evt_init(r);

	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
		mbm_total_event.configurable = true;
		resctrl_file_fflags_init("mbm_total_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}
	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
		mbm_local_event.configurable = true;
		resctrl_file_fflags_init("mbm_local_bytes_config",
					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
	}

	if (resctrl_arch_is_mbm_local_enabled())
		mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
	else if (resctrl_arch_is_mbm_total_enabled())
		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;

	return 0;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}

void resctrl_mon_resource_exit(void)
{
	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);

	dom_data_exit(r);
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}
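
/*
 * Illustrative example: on a part reporting x86_cache_max_rmid == 191, the
 * index above is 192 / 8 - 1 = 23, selecting the {127, CF(0.969744)} entry
 * of mbm_cf_table[]. MBM counts for RMIDs above 127 on such a system are
 * then scaled by roughly 0.97 in get_corrected_mbm_count().
 */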