// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology (RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the perf-based cqm.c, but we reuse a lot of
 * code and data structures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT can be found in the Intel(R) x86 Architecture
 * Software Developer Manual, June 2016, volume 3, section 17.17.
 */

#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>

#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>

#include "internal.h"
#include "trace.h"

/**
 * struct rmid_entry - dirty tracking for all RMID.
 * @closid:	The CLOSID for this entry.
 * @rmid:	The RMID for this entry.
 * @busy:	The number of domains with cached data using this RMID.
 * @list:	Member of the rmid_free_lru list when busy == 0.
 *
 * Depending on the architecture the correct monitor is accessed using
 * both @closid and @rmid, or @rmid only.
 *
 * Take the rdtgroup_mutex when accessing.
 */
struct rmid_entry {
	u32			closid;
	u32			rmid;
	int			busy;
	struct list_head	list;
};

/*
 * @rmid_free_lru - A least recently used list of free RMIDs.
 *	These RMIDs are guaranteed to have an occupancy less than the
 *	threshold occupancy.
 */
static LIST_HEAD(rmid_free_lru);

/*
 * @closid_num_dirty_rmid - The number of dirty RMID each CLOSID has.
 *	Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
 *	Indexed by CLOSID. Protected by rdtgroup_mutex.
 */
static u32 *closid_num_dirty_rmid;

/*
 * @rmid_limbo_count - count of currently unused but (potentially)
 *	dirty RMIDs.
 *	This counts RMIDs that no one is currently using but that
 *	may have an occupancy value > resctrl_rmid_realloc_threshold. User can
 *	change the threshold occupancy value.
 */
static unsigned int rmid_limbo_count;

/*
 * @rmid_ptrs - The entries in the limbo and free lists.
 */
static struct rmid_entry *rmid_ptrs;

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

/*
 * This is the threshold cache occupancy in bytes at which we will consider an
 * RMID available for re-allocation.
 */
unsigned int resctrl_rmid_realloc_threshold;

/*
 * This is the maximum value for the reallocation threshold, in bytes.
 */
unsigned int resctrl_rmid_realloc_limit;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

/*
 * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so no correction is done
 *    for that case.
 * 2. The MBM total and local correction table is indexed by a core count
 *    value equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate the corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

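/*
 * Worked example of the fixed-point correction above (numbers illustrative):
 * CF(1.066667) evaluates to 1118481, so for a raw value of 1048576 chunks
 * get_corrected_mbm_count() returns (1048576 * 1118481) >> 20 = 1118481,
 * i.e. the count scaled up by ~6.67%. RMIDs at or below mbm_cf_rmidthreshold
 * are returned unmodified.
 */
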
/*
 * x86 and arm64 differ in their handling of monitoring.
 * x86's RMID are independent numbers, there is only one source of traffic
 * with an RMID value of '1'.
 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
 * value is no longer unique.
 * To account for this, resctrl uses an index. On x86 this is just the RMID,
 * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
 *
 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
 * must accept an attempt to read every index.
 */
static inline struct rmid_entry *__rmid_entry(u32 idx)
{
	struct rmid_entry *entry;
	u32 closid, rmid;

	entry = &rmid_ptrs[idx];
	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);

	WARN_ON_ONCE(entry->closid != closid);
	WARN_ON_ONCE(entry->rmid != rmid);

	return entry;
}

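/*
 * Illustration of the index mapping (the CLOSID-dependent encoding shown
 * here is an assumption for illustration, not taken from any arch header):
 * on x86, resctrl_arch_rmid_idx_encode() ignores the CLOSID and the index is
 * simply the RMID, so idx 5 always means RMID 5. A CLOSID-dependent
 * architecture could instead encode something like
 * idx = closid * num_rmid + rmid, so (CLOSID 2, PMG 5) and (CLOSID 3, PMG 5)
 * map to different entries in rmid_ptrs[] and rmid_busy_llc.
 */
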
static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for a supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with a valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
	rdmsrl(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	}

	/* Never expect to get here */
	WARN_ON_ONCE(1);

	return NULL;
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		/* Record any initial, non-zero count value. */
		__rmid_read(rmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);

	if (is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	int ret;

	resctrl_arch_rmid_read_context_check();

	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
		return -EINVAL;

	ret = __rmid_read(rmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

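/*
 * Illustration of the wrap handling above (values assumed, not taken from
 * real hardware): with a 24-bit counter width, reading prev_msr = 0xfffff0
 * and then cur_msr = 0x000010 gives
 *
 *	mbm_overflow_count(0xfffff0, 0x000010, 24) == 0x20
 *
 * because both values are shifted up by 40 bits, so the subtraction wraps
 * modulo 2^24 instead of going negative. resctrl_arch_rmid_read() then
 * converts the accumulated chunks to bytes by multiplying with
 * hw_res->mon_scale, the upscaling factor reported by the hardware.
 */
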
static void limbo_release_entry(struct rmid_entry *entry)
{
	lockdep_assert_held(&rdtgroup_mutex);

	rmid_limbo_count--;
	list_add_tail(&entry->list, &rmid_free_lru);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]--;
}

/*
 * Check the RMIDs that are marked as busy for this domain. If the
 * reported LLC occupancy is below the threshold, clear the busy bit and
 * decrement the count. If the busy count gets to zero on an RMID, we
 * free the RMID.
 */
void __check_limbo(struct rdt_domain *d, bool force_free)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	struct rmid_entry *entry;
	u32 idx, cur_idx = 1;
	void *arch_mon_ctx;
	bool rmid_dirty;
	u64 val = 0;

	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
	if (IS_ERR(arch_mon_ctx)) {
		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
				    PTR_ERR(arch_mon_ctx));
		return;
	}

	/*
	 * Skip RMID 0 and start from RMID 1: check all the RMIDs that
	 * are marked as busy for occupancy < threshold. If the occupancy
	 * is less than the threshold, decrement the busy counter of the
	 * RMID and move it to the free list when the counter reaches 0.
	 */
	for (;;) {
		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
		if (idx >= idx_limit)
			break;

		entry = __rmid_entry(idx);
		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
					   QOS_L3_OCCUP_EVENT_ID, &val,
					   arch_mon_ctx)) {
			rmid_dirty = true;
		} else {
			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);

			/*
			 * x86's CLOSID and RMID are independent numbers, so the entry's
			 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
			 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
			 * used to select the configuration. It is thus necessary to track both
			 * CLOSID and RMID because there may be dependencies between them
			 * on some architectures.
			 */
			trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val);
		}

		if (force_free || !rmid_dirty) {
			clear_bit(idx, d->rmid_busy_llc);
			if (!--entry->busy)
				limbo_release_entry(entry);
		}
		cur_idx = idx + 1;
	}

	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}

bool has_busy_rmid(struct rdt_domain *d)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();

	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
}

static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
{
	struct rmid_entry *itr;
	u32 itr_idx, cmp_idx;

	if (list_empty(&rmid_free_lru))
		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);

	list_for_each_entry(itr, &rmid_free_lru, list) {
		/*
		 * Get the index of this free RMID, and the index it would need
		 * to be if it were used with this CLOSID.
		 * If the CLOSID is irrelevant on this architecture, the two
		 * index values are always the same on every entry and thus the
		 * very first entry will be returned.
		 */
		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);

		if (itr_idx == cmp_idx)
			return itr;
	}

	return ERR_PTR(-ENOSPC);
}

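/*
 * Example of the matching above (the x86 behaviour is real, the
 * CLOSID-dependent encoding is illustrative): on x86 both calls to
 * resctrl_arch_rmid_idx_encode() return itr->rmid, so the head of
 * rmid_free_lru is always chosen. With a CLOSID-dependent index, only a
 * free entry whose CLOSID matches the requested one produces
 * itr_idx == cmp_idx, so e.g. a request for CLOSID 3 skips free entries
 * that belong to CLOSID 2.
 */
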
/**
 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
 *                                  RMID are clean, or the CLOSID that has
 *                                  the most clean RMID.
 *
 * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
 * may not be able to allocate clean RMID. To avoid this the allocator will
 * choose the CLOSID with the most clean RMID.
 *
 * When the CLOSID and RMID are independent numbers, the first free CLOSID will
 * be returned.
 */
int resctrl_find_cleanest_closid(void)
{
	u32 cleanest_closid = ~0;
	int i = 0;

	lockdep_assert_held(&rdtgroup_mutex);

	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		return -EIO;

	for (i = 0; i < closids_supported(); i++) {
		int num_dirty;

		if (closid_allocated(i))
			continue;

		num_dirty = closid_num_dirty_rmid[i];
		if (num_dirty == 0)
			return i;

		if (cleanest_closid == ~0)
			cleanest_closid = i;

		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
			cleanest_closid = i;
	}

	if (cleanest_closid == ~0)
		return -ENOSPC;

	return cleanest_closid;
}

/*
 * For MPAM the RMID value is not unique, and has to be considered with
 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
 * allows all domains to be managed by a single free list.
 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
 */
int alloc_rmid(u32 closid)
{
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	entry = resctrl_find_free_rmid(closid);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	list_del(&entry->list);
	return entry->rmid;
}

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	struct rdt_domain *d;
	u32 idx;

	lockdep_assert_held(&rdtgroup_mutex);

	/* Walking r->domains, ensure it can't race with cpuhp */
	lockdep_assert_cpus_held();

	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);

	entry->busy = 0;
	list_for_each_entry(d, &r->domains, list) {
		/*
		 * For the first limbo RMID in the domain,
		 * set up the limbo worker.
		 */
		if (!has_busy_rmid(d))
			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
						RESCTRL_PICK_ANY_CPU);
		set_bit(idx, d->rmid_busy_llc);
		entry->busy++;
	}

	rmid_limbo_count++;
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
		closid_num_dirty_rmid[entry->closid]++;
}

void free_rmid(u32 closid, u32 rmid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct rmid_entry *entry;

	lockdep_assert_held(&rdtgroup_mutex);

	/*
	 * Do not allow the default rmid to be free'd. Comparing by index
	 * allows architectures that ignore the closid parameter to avoid an
	 * unnecessary check.
	 */
	if (!resctrl_arch_mon_capable() ||
	    idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
						RESCTRL_RESERVED_RMID))
		return;

	entry = __rmid_entry(idx);

	if (is_llc_occupancy_enabled())
		add_rmid_to_limbo(entry);
	else
		list_add_tail(&entry->list, &rmid_free_lru);
}

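/*
 * Summary of the limbo lifecycle implemented above: free_rmid() on an entry
 * with llc_occupancy enabled calls add_rmid_to_limbo(), which sets the
 * entry's bit in every domain's rmid_busy_llc and bumps rmid_limbo_count.
 * cqm_handle_limbo() then runs every CQM_LIMBOCHECK_INTERVAL and, via
 * __check_limbo(), clears the bit in each domain once that domain's
 * occupancy drops below resctrl_rmid_realloc_threshold. When the last
 * domain clears its bit, limbo_release_entry() returns the entry to
 * rmid_free_lru, where alloc_rmid() can hand it out again.
 */
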
static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
				       u32 rmid, enum resctrl_event_id evtid)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);

	switch (evtid) {
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &d->mbm_total[idx];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &d->mbm_local[idx];
	default:
		return NULL;
	}
}

static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	struct mbm_state *m;
	u64 tval = 0;

	if (rr->first) {
		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
		if (m)
			memset(m, 0, sizeof(struct mbm_state));
		return 0;
	}

	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
					 &tval, rr->arch_mon_ctx);
	if (rr->err)
		return rr->err;

	rr->val += tval;

	return 0;
}

/*
 * mbm_bw_count() - Update bw count from values previously read by
 *		    __mon_event_count().
 * @closid:	The closid used to identify the cached mbm_state.
 * @rmid:	The rmid used to identify the cached mbm_state.
 * @rr:		The struct rmid_read populated by __mon_event_count().
 *
 * Supporting function to calculate the memory bandwidth
 * and delta bandwidth in MBps. The chunks value previously read by
 * __mon_event_count() is compared with the chunks value from the previous
 * invocation. This must be called once per second to maintain values in MBps.
 */
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	struct mbm_state *m = &rr->d->mbm_local[idx];
	u64 cur_bw, bytes, cur_bytes;

	cur_bytes = rr->val;
	bytes = cur_bytes - m->prev_bw_bytes;
	m->prev_bw_bytes = cur_bytes;

	cur_bw = bytes / SZ_1M;

	m->prev_bw = cur_bw;
}

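/*
 * Example of the MBps conversion above (numbers illustrative): if the
 * previous overflow pass recorded prev_bw_bytes = 1 GiB and the current
 * read returns rr->val = 1 GiB + 314572800 bytes, then bytes = 314572800
 * and cur_bw = 314572800 / SZ_1M = 300 MBps. The result is only meaningful
 * because the overflow handler drives this once per second.
 */
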
/*
 * This is scheduled by mon_event_read() to read the CQM/MBM counters
 * on a domain.
 */
void mon_event_count(void *info)
{
	struct rdtgroup *rdtgrp, *entry;
	struct rmid_read *rr = info;
	struct list_head *head;
	int ret;

	rdtgrp = rr->rgrp;

	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);

	/*
	 * For Ctrl groups read data from child monitor groups and
	 * add them together. Count events which are read successfully.
	 * Discard the rmid_read's reporting errors.
	 */
	head = &rdtgrp->mon.crdtgrp_list;

	if (rdtgrp->type == RDTCTRL_GROUP) {
		list_for_each_entry(entry, head, mon.crdtgrp_list) {
			if (__mon_event_count(entry->closid, entry->mon.rmid,
					      rr) == 0)
				ret = 0;
		}
	}

	/*
	 * __mon_event_count() calls for newly created monitor groups may
	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
	 * Discard error if any of the monitor event reads succeeded.
	 */
	if (ret == 0)
		rr->err = 0;
}

/*
 * Feedback loop for MBA software controller (mba_sc)
 *
 * mba_sc is a feedback loop where we periodically read MBM counters and
 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
 * that:
 *
 *	current bandwidth (cur_bw) < user specified bandwidth (user_bw)
 *
 * This uses the MBM counters to measure the bandwidth and MBA throttle
 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
 * fact that resctrl rdtgroups have both monitoring and control.
 *
 * The frequency of the checks is 1s and we just tag along the MBM overflow
 * timer. Having a 1s interval makes the calculation of bandwidth simpler.
 *
 * Although MBA's goal is to restrict the bandwidth to a maximum, there may
 * be a need to increase the bandwidth to avoid unnecessarily restricting
 * the L2 <-> L3 traffic.
 *
 * Since MBA controls the L2 external bandwidth, whereas MBM measures the
 * L3 external bandwidth, the following sequence could lead to such a
 * situation.
 *
 * Consider an rdtgroup which had high L3 <-> memory traffic in initial
 * phases -> mba_sc kicks in and reduces the bandwidth percentage values ->
 * but after some time the rdtgroup has mostly L2 <-> L3 traffic.
 *
 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
 * throttle MSRs already have low percentage values. To avoid
 * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
 */
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
	u32 closid, rmid, cur_msr_val, new_msr_val;
	struct mbm_state *pmbm_data, *cmbm_data;
	struct rdt_resource *r_mba;
	struct rdt_domain *dom_mba;
	u32 cur_bw, user_bw, idx;
	struct list_head *head;
	struct rdtgroup *entry;

	if (!is_mbm_local_enabled())
		return;

	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;

	closid = rgrp->closid;
	rmid = rgrp->mon.rmid;
	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
	pmbm_data = &dom_mbm->mbm_local[idx];

	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
	if (!dom_mba) {
		pr_warn_once("Failure to get domain for MBA update\n");
		return;
	}

	cur_bw = pmbm_data->prev_bw;
	user_bw = dom_mba->mbps_val[closid];

	/* MBA resource doesn't support CDP */
	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);

	/*
	 * For Ctrl groups read data from child monitor groups.
	 */
	head = &rgrp->mon.crdtgrp_list;
	list_for_each_entry(entry, head, mon.crdtgrp_list) {
		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
		cur_bw += cmbm_data->prev_bw;
	}

	/*
	 * Scale up/down the bandwidth linearly for the ctrl group. The
	 * bandwidth step is the bandwidth granularity specified by the
	 * hardware.
	 * Always increase throttling if current bandwidth is above the
	 * target set by user.
	 * But avoid thrashing up and down on every poll by checking
	 * whether a decrease in throttling is likely to push the group
	 * back over target. E.g. if currently throttling to 30% of bandwidth
	 * on a system with 10% granularity steps, check whether moving to
	 * 40% would go past the limit by multiplying current bandwidth by
	 * "(30 + 10) / 30".
	 */
	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
	} else if (cur_msr_val < MAX_MBA_BW &&
		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
	} else {
		return;
	}

	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}

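/*
 * Numeric walk-through of the decision above, assuming min_bw = bw_gran = 10:
 * with cur_msr_val = 30%, user_bw = 2000 MBps and a measured
 * cur_bw = 2400 MBps, the first branch applies and throttling tightens to
 * 20%. If instead cur_bw = 1200 MBps, throttling is loosened to 40% only
 * because 2000 > 1200 * (30 + 10) / 30 = 1600, i.e. even the next step up
 * is not expected to push the group past the user's target.
 */
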
static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
		       u32 closid, u32 rmid)
{
	struct rmid_read rr;

	rr.first = false;
	rr.r = r;
	rr.d = d;

	/*
	 * This is protected from concurrent reads from user space as both
	 * the user and we hold the global mutex.
	 */
	if (is_mbm_total_enabled()) {
		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
		rr.val = 0;
		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
		if (IS_ERR(rr.arch_mon_ctx)) {
			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
					    PTR_ERR(rr.arch_mon_ctx));
			return;
		}

		__mon_event_count(closid, rmid, &rr);

		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
	}
	if (is_mbm_local_enabled()) {
		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
		rr.val = 0;
		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
		if (IS_ERR(rr.arch_mon_ctx)) {
			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
					    PTR_ERR(rr.arch_mon_ctx));
			return;
		}

		__mon_event_count(closid, rmid, &rr);

		/*
		 * Call the MBA software controller only for the
		 * control groups and when user has enabled
		 * the software controller explicitly.
		 */
		if (is_mba_sc(NULL))
			mbm_bw_count(closid, rmid, &rr);

		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
	}
}

/*
 * Handler to scan the limbo list and move the RMIDs whose
 * occupancy < threshold_occupancy to the free list.
 */
void cqm_handle_limbo(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
	struct rdt_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	d = container_of(work, struct rdt_domain, cqm_limbo.work);

	__check_limbo(d, false);

	if (has_busy_rmid(d)) {
		d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
							   RESCTRL_PICK_ANY_CPU);
		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
					 delay);
	}

	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
 *                             domain.
 * @dom:		The domain the limbo handler should run for.
 * @delay_ms:		How far in the future the handler should run.
 * @exclude_cpu:	Which CPU the handler should not run on,
 *			RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
			     int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
	dom->cqm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}

void mbm_handle_overflow(struct work_struct *work)
{
	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
	struct rdtgroup *prgrp, *crgrp;
	struct list_head *head;
	struct rdt_resource *r;
	struct rdt_domain *d;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	/*
	 * If the filesystem has been unmounted this work no longer needs to
	 * run.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		goto out_unlock;

	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	d = container_of(work, struct rdt_domain, mbm_over.work);

	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);

		head = &prgrp->mon.crdtgrp_list;
		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);

		if (is_mba_sc(NULL))
			update_mba_bw(prgrp, d);
	}

	/*
	 * Re-check for housekeeping CPUs. This allows the overflow handler to
	 * move off a nohz_full CPU quickly.
	 */
	d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
						   RESCTRL_PICK_ANY_CPU);
	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}

/**
 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
 *                                domain.
 * @dom:		The domain the overflow handler should run for.
 * @delay_ms:		How far in the future the handler should run.
 * @exclude_cpu:	Which CPU the handler should not run on,
 *			RESCTRL_PICK_ANY_CPU to pick any CPU.
 */
void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
				int exclude_cpu)
{
	unsigned long delay = msecs_to_jiffies(delay_ms);
	int cpu;

	/*
	 * When a domain comes online there is no guarantee the filesystem is
	 * mounted. If not, there is no need to catch counter overflow.
	 */
	if (!resctrl_mounted || !resctrl_arch_mon_capable())
		return;
	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
	dom->mbm_work_cpu = cpu;

	if (cpu < nr_cpu_ids)
		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}

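/*
 * Note on cadence: the overflow work above re-arms itself every
 * MBM_OVERFLOW_INTERVAL, the 1s tick the rest of this file assumes.
 * mbm_bw_count() divides the byte delta by SZ_1M to get MBps only because
 * successive reads are one second apart, and the mba_sc feedback loop in
 * update_mba_bw() piggybacks on the same timer.
 */
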
static int dom_data_init(struct rdt_resource *r)
{
	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
	u32 num_closid = resctrl_arch_get_num_closid(r);
	struct rmid_entry *entry = NULL;
	int err = 0, i;
	u32 idx;

	mutex_lock(&rdtgroup_mutex);
	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		u32 *tmp;

		/*
		 * If the architecture hasn't provided a sanitised value here,
		 * this may result in larger arrays than necessary. Resctrl will
		 * use a smaller system wide value based on the resources in
		 * use.
		 */
		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
		if (!tmp) {
			err = -ENOMEM;
			goto out_unlock;
		}

		closid_num_dirty_rmid = tmp;
	}

	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
	if (!rmid_ptrs) {
		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
			kfree(closid_num_dirty_rmid);
			closid_num_dirty_rmid = NULL;
		}
		err = -ENOMEM;
		goto out_unlock;
	}

	for (i = 0; i < idx_limit; i++) {
		entry = &rmid_ptrs[i];
		INIT_LIST_HEAD(&entry->list);

		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
		list_add_tail(&entry->list, &rmid_free_lru);
	}

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
	 * are always allocated. These are used for the rdtgroup_default
	 * control group, which will be set up later in rdtgroup_init().
	 */
	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
					   RESCTRL_RESERVED_RMID);
	entry = __rmid_entry(idx);
	list_del(&entry->list);

out_unlock:
	mutex_unlock(&rdtgroup_mutex);

	return err;
}

static void __exit dom_data_exit(void)
{
	mutex_lock(&rdtgroup_mutex);

	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
		kfree(closid_num_dirty_rmid);
		closid_num_dirty_rmid = NULL;
	}

	kfree(rmid_ptrs);
	rmid_ptrs = NULL;

	mutex_unlock(&rdtgroup_mutex);
}

static struct mon_evt llc_occupancy_event = {
	.name		= "llc_occupancy",
	.evtid		= QOS_L3_OCCUP_EVENT_ID,
};

static struct mon_evt mbm_total_event = {
	.name		= "mbm_total_bytes",
	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
};

static struct mon_evt mbm_local_event = {
	.name		= "mbm_local_bytes",
	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
};

/*
 * Initialize the event list for the resource.
 *
 * Note that MBM events are also part of the RDT_RESOURCE_L3 resource
 * because, as per the SDM, the total and local memory bandwidth
 * are enumerated as part of L3 monitoring.
 */
static void l3_mon_evt_init(struct rdt_resource *r)
{
	INIT_LIST_HEAD(&r->evt_list);

	if (is_llc_occupancy_enabled())
		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
	if (is_mbm_total_enabled())
		list_add_tail(&mbm_total_event.list, &r->evt_list);
	if (is_mbm_local_enabled())
		list_add_tail(&mbm_local_event.list, &r->evt_list);
}

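/*
 * The events registered above surface as per-domain files once resctrl is
 * mounted. With the default mount point, reading the local bandwidth count
 * for L3 domain 0 looks roughly like (path illustrative):
 *
 *	cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
 *
 * with llc_occupancy and mbm_total_bytes alongside it when those events
 * are enabled.
 */
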
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;
	int ret;

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	ret = dom_data_init(r);
	if (ret)
		return ret;

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;

		if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
			mbm_total_event.configurable = true;
			mbm_config_rftype_init("mbm_total_bytes_config");
		}
		if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
			mbm_local_event.configurable = true;
			mbm_config_rftype_init("mbm_local_bytes_config");
		}
	}

	l3_mon_evt_init(r);

	r->mon_capable = true;

	return 0;
}

void __exit rdt_put_mon_l3_config(void)
{
	dom_data_exit();
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}
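
/*
 * Worked examples for the two calculations above (all numbers assumed, not
 * measured): for the 35MB LLC with 56 RMIDs mentioned earlier, taking 35MB
 * as 35840 KB, resctrl_rmid_realloc_limit = 35840 * 1024 = 36700160 bytes
 * and the unrounded threshold is 36700160 / 56 = 655360 bytes (~1.8% of the
 * LLC), which is then rounded to a multiple of mon_scale. For the quirk, a
 * part reporting x86_cache_max_rmid = 175 gives cf_index = 176 / 8 - 1 = 21,
 * selecting that row of mbm_cf_table.
 */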