// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <reinette.chatre@intel.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/cacheflush.h>
#include <linux/cpu.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS

#include "pseudo_lock_trace.h"

/*
 * The bits needed to disable hardware prefetching vary based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under a variety of load conditions
 * as well as that these pseudo-locked regions can maintain their low cache
 * miss rates under a variety of load conditions for significant lengths of
 * time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency()
 *
 * Return:
 * If platform is supported, the bits to disable hardware prefetchers, 0
 * if platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    Reserved
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 63:3 Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while the core is running
 * with the class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is, we
	 * will get a cache hit in the loop below from outside of the
	 * pseudo-locked region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase the likelihood that the allocated cache portion will be
	 * filled with the associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. Disable interrupts to
	 * ensure that we will not be preempted during this critical section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into the cache region associated with the just activated
	 * plr->closid. Loop over the data twice:
	 * - In the first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and the
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory: the speed of
 * access is a good indication of how close to the CPU the data was. Better
 * still, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after, hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check for any possible error state of the events by performing
	 * one local read of each.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

	/* Initialize rest of local variables */
	/*
	 * The performance events have been validated right before this with
	 * interrupts disabled - it is thus safe to read the counter index.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read the counters twice - first to load the needed instructions
	 * into the L1 cache, second to capture an accurate value that does
	 * not include cache misses incurred because of the instruction loads.
	 */
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * From the SDM: back-to-back fast reads are not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_after = rdpmc(hit_pmcnum);
	miss_after = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after = miss_after;
	counts->hits_after = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT  02H
	 *     L2_MISS 10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding, the
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On the Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *       REFERENCE 4FH
	 *       MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding, the
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "miss" counter is a bit more than the
		 * references, for example, x references but x + 1 misses.
		 * To not report an invalid (negative) hit count in that
		 * case we treat the misses as equal to the references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}
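
/*
 * Illustrative user-space sketch (kept inside this comment, not compiled as
 * part of this file) of the measurement idea documented for
 * resctrl_arch_measure_cycles_lat_fn(): with hardware prefetchers disabled,
 * reading a buffer at a stride of half a cache line makes misses easy to
 * spot, because the access that brings a new line in is noticeably slower
 * than the following access that hits the second half of the same line.
 * The buffer size, the plain __rdtsc() timing and printing to stdout are
 * assumptions of this sketch; the kernel code above times the pseudo-locked
 * region itself with rdtsc_ordered() and reports through a tracepoint.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <x86intrin.h>
 *
 *	#define BUF_SIZE	(256 * 1024)	// assumed buffer size
 *	#define STRIDE		32		// half of a 64-byte cache line
 *
 *	int main(void)
 *	{
 *		volatile unsigned char *buf = malloc(BUF_SIZE);
 *		uint64_t start, end;
 *		size_t i;
 *
 *		if (!buf)
 *			return 1;
 *
 *		for (i = 0; i < BUF_SIZE; i += STRIDE) {
 *			start = __rdtsc();
 *			(void)buf[i];	// single load, mirrors the asm mov above
 *			end = __rdtsc();
 *			// Large deltas at line-aligned offsets with small
 *			// deltas mid-line suggest the first access missed and
 *			// the second hit the line it brought in.
 *			printf("%zu %llu\n", i, (unsigned long long)(end - start));
 *		}
 *
 *		free((void *)buf);
 *		return 0;
 *	}
 */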