// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>

#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>

#include "internal.h"
#include "trace.h"

/**
 * struct rmid_entry - dirty tracking for all RMID.
 * @closid:	The CLOSID for this entry.
 * @rmid:	The RMID for this entry.
 * @busy:	The number of domains with cached data using this RMID.
 * @list:	Member of the rmid_free_lru list when busy == 0.
 *
 * Depending on the architecture the correct monitor is accessed using
 * both @closid and @rmid, or @rmid only.
 *
 * Take the rdtgroup_mutex when accessing.
 */
struct rmid_entry {
	u32			closid;
	u32			rmid;
	int			busy;
	struct list_head	list;
};

/*
 * @rmid_free_lru - A least recently used list of free RMIDs
 *     These RMIDs are guaranteed to have an occupancy less than the
 *     threshold occupancy
 */
static LIST_HEAD(rmid_free_lru);

/*
 * @closid_num_dirty_rmid - The number of dirty RMID each CLOSID has.
 *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
 *     Indexed by CLOSID. Protected by rdtgroup_mutex.
 */
static u32 *closid_num_dirty_rmid;

/*
 * @rmid_limbo_count - count of currently unused but (potentially)
 *     dirty RMIDs.
 *     This counts RMIDs that no one is currently using but that
 *     may have an occupancy value > resctrl_rmid_realloc_threshold. User can
 *     change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * @rmid_ptrs - The entries in the limbo and free lists.
 */
static struct rmid_entry *rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy in bytes at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int resctrl_rmid_realloc_threshold;

/*
 * This is the maximum value for the reallocation threshold, in bytes.
 */
unsigned int resctrl_rmid_realloc_limit;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

/*
 * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so that no correction
 *    is done for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    which is equal to (x86_cache_max_rmid + 1) / 8 - 1 and ranges from 0
 *    up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
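/*
 * For example, with values taken from the table below: CF(1.142857) is
 * stored as 1198372 (1.142857 * 2^20, rounded), so a raw chunk count of
 * 1000 is corrected to (1000 * 1198372) >> 20 = 1142.
 */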
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

/*
 * x86 and arm64 differ in their handling of monitoring.
 * x86's RMID are independent numbers, there is only one source of traffic
 * with an RMID value of '1'.
 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
 * value is no longer unique.
 * To account for this, resctrl uses an index. On x86 this is just the RMID,
 * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
 *
 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
 * must accept an attempt to read every index.
 */
static inline struct rmid_entry *__rmid_entry(u32 idx)
{
	struct rmid_entry *entry;
	u32 closid, rmid;

	entry = &rmid_ptrs[idx];
	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);

	WARN_ON_ONCE(entry->closid != closid);
	WARN_ON_ONCE(entry->rmid != rmid);

	return entry;
}

static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
	rdmsrl(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	}

	/* Never expect to get here */
	WARN_ON_ONCE(1);

	return NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		/* Record any initial, non-zero count value. */
		__rmid_read(rmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);

	if (is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

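/*
 * Shifting both counter values left by (64 - width) and the difference back
 * right again discards the bits above the hardware counter width, so the
 * result is the number of chunks counted since @prev_msr even if the
 * free-running counter wrapped (at most once) in between.
 */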
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	int ret;

	resctrl_arch_rmid_read_context_check();

	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
		return -EINVAL;

	ret = __rmid_read(rmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

static void limbo_release_entry(struct rmid_entry *entry)
{
	lockdep_assert_held(&rdtgroup_mutex);

	rmid_limbo_count--;
	list_add_tail(&entry->list, &rmid_free_lru);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]--;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_domain *d, bool force_free)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	struct rmid_entry *entry;
	u32 idx, cur_idx = 1;
	void *arch_mon_ctx;
	bool rmid_dirty;
	u64 val = 0;

	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
	if (IS_ERR(arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(arch_mon_ctx));
		return;
	}

	/*
	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
	 * are marked as busy for occupancy < threshold. If the occupancy
	 * is less than the threshold decrement the busy counter of the
	 * RMID and move it to the free list when the counter reaches 0.
	 */
	for (;;) {
		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
		if (idx >= idx_limit)
			break;

		entry = __rmid_entry(idx);
		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
					   QOS_L3_OCCUP_EVENT_ID, &val,
					   arch_mon_ctx)) {
			rmid_dirty = true;
		} else {
			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);

			/*
			 * x86's CLOSID and RMID are independent numbers, so the entry's
			 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
			 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
			 * used to select the configuration. It is thus necessary to track both
			 * CLOSID and RMID because there may be dependencies between them
			 * on some architectures.
			 */
			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val);
		}

		if (force_free || !rmid_dirty) {
			clear_bit(idx, d->rmid_busy_llc);
			if (!--entry->busy)
				limbo_release_entry(entry);
		}
		cur_idx = idx + 1;
	}

	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}

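/*
 * Returns true if any RMID in this domain is still in limbo, i.e. its
 * cached LLC occupancy has not yet been observed to drop below
 * resctrl_rmid_realloc_threshold.
 */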
bool has_busy_rmid(struct rdt_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();

	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
}

static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
{
	struct rmid_entry *itr;
	u32 itr_idx, cmp_idx;

	if (list_empty(&rmid_free_lru))
		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);

	list_for_each_entry(itr, &rmid_free_lru, list) {
		/*
		 * Get the index of this free RMID, and the index it would need
		 * to be if it were used with this CLOSID.
		 * If the CLOSID is irrelevant on this architecture, the two
		 * index values are always the same on every entry and thus the
		 * very first entry will be returned.
		 */
		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);

		if (itr_idx == cmp_idx)
			return itr;
	}

	return ERR_PTR(-ENOSPC);
}

/**
 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
 *                                  RMID are clean, or the CLOSID that has
 *                                  the most clean RMID.
 *
 * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
 * may not be able to allocate clean RMID. To avoid this the allocator will
 * choose the CLOSID with the most clean RMID.
 *
 * When the CLOSID and RMID are independent numbers, the first free CLOSID will
 * be returned.
 */
int resctrl_find_cleanest_closid(void)
{
	u32 cleanest_closid = ~0;
	int i = 0;

	lockdep_assert_held(&rdtgroup_mutex);

	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		return -EIO;

	for (i = 0; i < closids_supported(); i++) {
		int num_dirty;

		if (closid_allocated(i))
			continue;

		num_dirty = closid_num_dirty_rmid[i];
		if (num_dirty == 0)
			return i;

		if (cleanest_closid == ~0)
			cleanest_closid = i;

		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
			cleanest_closid = i;
	}

	if (cleanest_closid == ~0)
		return -ENOSPC;

	return cleanest_closid;
}

/*
 * For MPAM the RMID value is not unique, and has to be considered with
 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
 * allows all domains to be managed by a single free list.
 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
 */
int alloc_rmid(u32 closid)
{
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	entry = resctrl_find_free_rmid(closid);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	list_del(&entry->list);
	return entry->rmid;
}

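/*
 * Mark @entry busy in every domain, as any of them may still have cache
 * lines tagged with it, and make sure the limbo worker is running on each
 * of those domains. __check_limbo() returns the entry to the free list once
 * its occupancy has dropped below the threshold everywhere.
 */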
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	struct rdt_domain *d;
	u32 idx;

	lockdep_assert_held(&rdtgroup_mutex);

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);

	entry->busy = 0;
	list_for_each_entry(d, &r->domains, list) {
		/*
		 * For the first limbo RMID in the domain,
		 * set up the limbo worker.
		 */
		if (!has_busy_rmid(d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
						RESCTRL_PICK_ANY_CPU);
		set_bit(idx, d->rmid_busy_llc);
		entry->busy++;
	}

	rmid_limbo_count++;
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]++;
}

void free_rmid(u32 closid, u32 rmid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	/*
	 * Do not allow the default rmid to be freed. Comparing by index
	 * allows architectures that ignore the closid parameter to avoid an
	 * unnecessary check.
	 */
	if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
						RESCTRL_RESERVED_RMID))
		return;

	entry = __rmid_entry(idx);

	if (is_llc_occupancy_enabled())
		add_rmid_to_limbo(entry);
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}

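/*
 * Return the mbm_state cached by the filesystem for this event, or NULL for
 * events that have no such state (llc_occupancy).
 */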
static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
				       u32 rmid, enum resctrl_event_id evtid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);

	switch (evtid) {
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &d->mbm_total[idx];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &d->mbm_local[idx];
	default:
		return NULL;
	}
}

static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	struct mbm_state *m;
	u64 tval = 0;

	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
		if (m)
			memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}

	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
					 &tval, rr->arch_mon_ctx);
	if (rr->err)
		return rr->err;

	rr->val += tval;

	return 0;
}

/*
 * mbm_bw_count() - Update bw count from values previously read by
 *		    __mon_event_count().
 * @closid:	The closid used to identify the cached mbm_state.
 * @rmid:	The rmid used to identify the cached mbm_state.
 * @rr:		The struct rmid_read populated by __mon_event_count().
 *
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The chunks value previously read by
 * __mon_event_count() is compared with the chunks value from the previous
 * invocation. This must be called once per second to maintain values in MBps.
 */
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct mbm_state *m = &rr->d->mbm_local[idx];
	u64 cur_bw, bytes, cur_bytes;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}

/*
 * This is scheduled by mon_event_read() to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
	struct rdtgroup *rdtgrp, *entry;
	struct rmid_read *rr = info;
	struct list_head *head;
	int ret;

	rdtgrp = rr->rgrp;

	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);

	/*
	 * For Ctrl groups read data from child monitor groups and
	 * add them together. Count events which are read successfully.
	 * Discard the rmid_read's reporting errors.
	 */
	head = &rdtgrp->mon.crdtgrp_list;

	if (rdtgrp->type == RDTCTRL_GROUP) {
		list_for_each_entry(entry, head, mon.crdtgrp_list) {
			if (__mon_event_count(entry->closid, entry->mon.rmid,
					      rr) == 0)
				ret = 0;
		}
	}

	/*
	 * __mon_event_count() calls for newly created monitor groups may
	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
	 * Discard error if any of the monitor event reads succeeded.
	 */
	if (ret == 0)
		rr->err = 0;
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *   current bandwidth (cur_bw) < user specified bandwidth (user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having a 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth whereas MBM measures the
 * L3 external bandwidth, the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in initial
 * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
 * after some time rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values. To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_resource *r_mba;
	struct rdt_domain *dom_mba;
	u32 cur_bw, user_bw, idx;
	struct list_head *head;
	struct rdtgroup *entry;

	if (!is_mbm_local_enabled())
		return;

	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;

	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	pmbm_data = &dom_mbm->mbm_local[idx];

	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}

	cur_bw = pmbm_data->prev_bw;
	user_bw = dom_mba->mbps_val[closid];

	/* MBA resource doesn't support CDP */
	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
		cur_bw += cmbm_data->prev_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group. The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 * Always increase throttling if current bandwidth is above the
	 * target set by user.
	 * But avoid thrashing up and down on every poll by checking
	 * whether a decrease in throttling is likely to push the group
	 * back over target. E.g. if currently throttling to 30% of bandwidth
	 * on a system with 10% granularity steps, check whether moving to
	 * 40% would go past the limit by multiplying current bandwidth by
	 * "(30 + 10) / 30".
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		return;
	}

	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}

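/*
 * Read the enabled MBM events for @rmid on domain @d so that the cached
 * chunk counts stay ahead of hardware counter wrap-around, and update the
 * bandwidth value used by the MBA software controller when it is in use.
 */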
static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
		       u32 closid, u32 rmid)
{
	struct rmid_read rr;

	rr.first = false;
	rr.r = r;
	rr.d = d;

	/*
	 * This is protected from concurrent reads from user
	 * as both the user and we hold the global mutex.
	 */
	if (is_mbm_total_enabled()) {
		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
		rr.val = 0;
		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
		if (IS_ERR(rr.arch_mon_ctx)) {
			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
					    PTR_ERR(rr.arch_mon_ctx));
			return;
		}

		__mon_event_count(closid, rmid, &rr);

		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
	}
	if (is_mbm_local_enabled()) {
		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
		rr.val = 0;
		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
		if (IS_ERR(rr.arch_mon_ctx)) {
			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
					    PTR_ERR(rr.arch_mon_ctx));
			return;
		}

		__mon_event_count(closid, rmid, &rr);

		/*
		 * Call the MBA software controller only for the
		 * control groups and when user has enabled
		 * the software controller explicitly.
		 */
		if (is_mba_sc(NULL))
			mbm_bw_count(closid, rmid, &rr);

		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
	}
}

/*
 * Handler to scan the limbo list and move the RMIDs whose occupancy
 * < threshold_occupancy to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
	struct rdt_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	d = container_of(work, struct rdt_domain, cqm_limbo.work);

	__check_limbo(d, false);

	if (has_busy_rmid(d)) {
		d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
							   RESCTRL_PICK_ANY_CPU);
		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
					 delay);
	}

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
 *                             domain.
 * @dom:         The domain the limbo handler should run for.
 * @delay_ms:    How far in the future the handler should run.
 * @exclude_cpu: Which CPU the handler should not run on,
 *		 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
			     int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
	dom->cqm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

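/*
 * Handler for the MBM overflow worker: read the MBM counters for every
 * group on this domain once per MBM_OVERFLOW_INTERVAL so that counter
 * wrap-around between two reads can be accounted for by
 * mbm_overflow_count(), then re-arm the worker on a housekeeping CPU.
 */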
void mbm_handle_overflow(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
	struct rdtgroup *prgrp, *crgrp;
	struct list_head *head;
	struct rdt_resource *r;
	struct rdt_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/*
	 * If the filesystem has been unmounted this work no longer needs to
	 * run.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		goto out_unlock;

	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	d = container_of(work, struct rdt_domain, mbm_over.work);

	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);

		head = &prgrp->mon.crdtgrp_list;
		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);

		if (is_mba_sc(NULL))
			update_mba_bw(prgrp, d);
	}

	/*
	 * Re-check for housekeeping CPUs. This allows the overflow handler to
	 * move off a nohz_full CPU quickly.
	 */
	d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
						   RESCTRL_PICK_ANY_CPU);
	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
 *                                domain.
 * @dom:         The domain the overflow handler should run for.
 * @delay_ms:    How far in the future the handler should run.
 * @exclude_cpu: Which CPU the handler should not run on,
 *		 RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
				int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	/*
	 * When a domain comes online there is no guarantee the filesystem is
	 * mounted. If not, there is no need to catch counter overflow.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		return;
	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
	dom->mbm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

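/*
 * Allocate the rmid_ptrs[] array (and closid_num_dirty_rmid[] when RMID
 * allocation depends on the CLOSID), put every entry on the free list and
 * reserve the entry used by the default resource group.
 */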
static int dom_data_init(struct rdt_resource *r)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	u32 num_closid = resctrl_arch_get_num_closid(r);
	struct rmid_entry *entry = NULL;
	int err = 0, i;
	u32 idx;

	mutex_lock(&rdtgroup_mutex);
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		u32 *tmp;

		/*
		 * If the architecture hasn't provided a sanitised value here,
		 * this may result in larger arrays than necessary. Resctrl will
		 * use a smaller system wide value based on the resources in
		 * use.
		 */
		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
		if (!tmp) {
			err = -ENOMEM;
			goto out_unlock;
		}

		closid_num_dirty_rmid = tmp;
	}

	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
	if (!rmid_ptrs) {
		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
			kfree(closid_num_dirty_rmid);
			closid_num_dirty_rmid = NULL;
		}
		err = -ENOMEM;
		goto out_unlock;
	}

	for (i = 0; i < idx_limit; i++) {
		entry = &rmid_ptrs[i];
		INIT_LIST_HEAD(&entry->list);

		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
		list_add_tail(&entry->list, &rmid_free_lru);
	}

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
	 * are always allocated. These are used for the rdtgroup_default
	 * control group, which will be set up later in rdtgroup_init().
	 */
	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
					   RESCTRL_RESERVED_RMID);
	entry = __rmid_entry(idx);
	list_del(&entry->list);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return err;
}

static void __exit dom_data_exit(void)
{
	mutex_lock(&rdtgroup_mutex);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		kfree(closid_num_dirty_rmid);
		closid_num_dirty_rmid = NULL;
	}

	kfree(rmid_ptrs);
	rmid_ptrs = NULL;

	mutex_unlock(&rdtgroup_mutex);
}

static struct mon_evt llc_occupancy_event = {
	.name	= "llc_occupancy",
	.evtid	= QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
	.name	= "mbm_total_bytes",
	.evtid	= QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
	.name	= "mbm_local_bytes",
	.evtid	= QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of RDT_RESOURCE_L3 resource
 * because as per the SDM the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
	INIT_LIST_HEAD(&r->evt_list);

	if (is_llc_occupancy_enabled())
		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
	if (is_mbm_total_enabled())
		list_add_tail(&mbm_total_event.list, &r->evt_list);
	if (is_mbm_local_enabled())
		list_add_tail(&mbm_local_event.list, &r->evt_list);
}

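/*
 * Read the L3 monitoring properties reported by the hardware: counter
 * scale, number of RMIDs, MBM counter width and, when BMEC is supported,
 * which bandwidth sources each MBM event can be configured to track. Also
 * set up the reallocation threshold, RMID tracking arrays and event list.
 */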
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;
	int ret;

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	ret = dom_data_init(r);
	if (ret)
		return ret;

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;

		if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
			mbm_total_event.configurable = true;
			mbm_config_rftype_init("mbm_total_bytes_config");
		}
		if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
			mbm_local_event.configurable = true;
			mbm_config_rftype_init("mbm_local_bytes_config");
		}
	}

	l3_mon_evt_init(r);

	r->mon_capable = true;

	return 0;
}

void __exit rdt_put_mon_l3_config(void)
{
	dom_data_exit();
}

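/*
 * Select the MBM correction factor for this system based on the number of
 * RMIDs, using the core-count index described above mbm_cf_table[]. When
 * the index falls outside the table, mbm_cf_rmidthreshold stays at UINT_MAX
 * and no correction is applied.
 */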
void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}