xref: /linux/arch/x86/kernel/cpu/resctrl/pseudo_lock.c (revision 664a231d90aa450f9f6f029bee3a94dd08e1aac6)
// SPDX-License-Identifier: GPL-2.0
/*
 * Resource Director Technology (RDT)
 *
 * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Author: Reinette Chatre <reinette.chatre@intel.com>
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/cacheflush.h>
#include <linux/cpu.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"

#define CREATE_TRACE_POINTS

#include "pseudo_lock_trace.h"

/*
 * The bits needed to disable hardware prefetching vary based on the
 * platform. During initialization we will discover which bits to use.
 */
static u64 prefetch_disable_bits;

/**
 * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
 *                                          platforms
 * @void: It takes no parameters.
 *
 * Capture the list of platforms that have been validated to support
 * pseudo-locking. This includes testing to ensure pseudo-locked regions
 * with low cache miss rates can be created under a variety of load
 * conditions as well as that these pseudo-locked regions can maintain
 * their low cache miss rates under a variety of load conditions for
 * significant lengths of time.
 *
 * After a platform has been validated to support pseudo-locking its
 * hardware prefetch disable bits are included here as they are documented
 * in the SDM.
 *
 * When adding a platform here also add support for its cache events to
 * resctrl_arch_measure_l*_residency().
 *
 * Return:
 * If the platform is supported, the bits to disable hardware prefetchers;
 * 0 if the platform is not supported.
 */
u64 resctrl_arch_get_prefetch_disable_bits(void)
{
	prefetch_disable_bits = 0;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return 0;

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0    L2 Hardware Prefetcher Disable (R/W)
		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
		 * 2    DCU Hardware Prefetcher Disable (R/W)
		 * 3    DCU IP Prefetcher Disable (R/W)
		 * 63:4 Reserved
		 */
		prefetch_disable_bits = 0xF;
		break;
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		/*
		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
		 * as:
		 * 0     L2 Hardware Prefetcher Disable (R/W)
		 * 1     Reserved
		 * 2     DCU Hardware Prefetcher Disable (R/W)
		 * 63:3  Reserved
		 */
		prefetch_disable_bits = 0x5;
		break;
	}

	return prefetch_disable_bits;
}

/**
 * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
 * @_plr: the pseudo-lock region descriptor
 *
 * This is the core pseudo-locking flow.
 *
 * First we ensure that the kernel memory cannot be found in the cache.
 * Then, while taking care that there will be as little interference as
 * possible, the memory to be loaded is accessed while the core is running
 * with its class of service set to the bitmask of the pseudo-locked region.
 * After this is complete no future CAT allocations will be allowed to
 * overlap with this bitmask.
 *
 * Local register variables are utilized to ensure that the memory region
 * to be locked is the only memory access made during the critical locking
 * loop.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_pseudo_lock_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 rmid_p, closid_p;
	unsigned long i;
	u64 saved_msr;
#ifdef CONFIG_KASAN
	/*
	 * The registers used for local register variables are also used
	 * when KASAN is active. When KASAN is active we use a regular
	 * variable to ensure we always use a valid pointer, but the cost
	 * is that this variable will enter the cache through evicting the
	 * memory we are trying to lock into the cache. Thus expect lower
	 * pseudo-locking success rate when KASAN is active.
	 */
	unsigned int line_size;
	unsigned int size;
	void *mem_r;
#else
	register unsigned int line_size asm("esi");
	register unsigned int size asm("edi");
	register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */

	/*
	 * Make sure none of the allocated memory is cached. If it is we
	 * will get a cache hit in the loop below from outside of the
	 * pseudo-locked region.
	 * wbinvd (as opposed to clflush/clflushopt) is required to
	 * increase the likelihood that the allocated cache portion will be
	 * filled with associated memory.
	 */
	wbinvd();

	/*
	 * Always called with interrupts enabled. By disabling interrupts
	 * we ensure that we will not be preempted during this critical
	 * section.
	 */
	local_irq_disable();

	/*
	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
	 * clobbering local register variables or affecting cache accesses.
	 *
	 * Disable the hardware prefetcher so that when the end of the memory
	 * being pseudo-locked is reached the hardware will not read beyond
	 * the buffer and evict pseudo-locked memory read earlier from the
	 * cache.
	 */
	saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
	native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	closid_p = this_cpu_read(pqr_state.cur_closid);
	rmid_p = this_cpu_read(pqr_state.cur_rmid);
	mem_r = plr->kmem;
	size = plr->size;
	line_size = plr->line_size;
	/*
	 * Critical section begin: start by writing the closid associated
	 * with the capacity bitmask of the cache region being
	 * pseudo-locked followed by reading of kernel memory to load it
	 * into the cache.
	 */
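	/*
	 * IA32_PQR_ASSOC takes the RMID in its low 32 bits and the CLOSID
	 * in its high 32 bits, hence rmid_p as the low word here.
	 */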
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);

	/*
	 * Cache was flushed earlier. Now access kernel memory to read it
	 * into cache region associated with just activated plr->closid.
	 * Loop over data twice:
	 * - In first loop the cache region is shared with the page walker
	 *   as it populates the paging structure caches (including TLB).
	 * - In the second loop the paging structure caches are used and
	 *   cache region is populated with the memory being referenced.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
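		/*
		 * Read a dword at mem_r + i via base+index addressing. The
		 * loaded value is discarded; the access only serves to pull
		 * the data (and, in this first loop, its translation) into
		 * the cache hierarchy.
		 */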
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			:
			: "r" (mem_r), "r" (i)
			: "%eax", "memory");
	}
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			:
			: "r" (mem_r), "r" (i)
			: "%eax", "memory");
	}
	/*
	 * Critical section end: restore closid with capacity bitmask that
	 * does not overlap with pseudo-locked region.
	 */
	native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);

	/* Re-enable the hardware prefetcher(s) */
	wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
	local_irq_enable();

	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/**
 * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
 *                                      pseudo-locked memory
 * @_plr: pseudo-lock region to measure
 *
 * There is no deterministic way to test if a memory region is cached. One
 * way is to measure how long it takes to read the memory: the speed of
 * access is a good indication of how close to the CPU the data was. Even
 * more, if the prefetcher is disabled and the memory is read at a stride
 * of half the cache line, then a cache miss will be easy to spot since the
 * read of the first half would be significantly slower than the read of
 * the second half.
 *
 * Return: 0. Waiter on waitqueue will be woken on completion.
 */
int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	u32 saved_low, saved_high;
	unsigned long i;
	u64 start, end;
	void *mem_r;

	local_irq_disable();
	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
	mem_r = READ_ONCE(plr->kmem);
	/*
	 * Dummy execute of the time measurement to load the needed
	 * instructions into the L1 instruction cache.
	 */
	start = rdtsc_ordered();
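	/*
	 * Stride of 32 bytes - half of a (typically 64 byte) cache line -
	 * so that, with the prefetchers disabled, a miss on the first half
	 * of a line stands out against the hit on its second half.
	 */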
	for (i = 0; i < plr->size; i += 32) {
		start = rdtsc_ordered();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
		end = rdtsc_ordered();
		trace_pseudo_lock_mem_latency((u32)(end - start));
	}
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

/*
 * Create a perf_event_attr for the hit and miss perf events that will
 * be used during the performance measurement. A perf_event maintains
 * a pointer to its perf_event_attr so a unique attribute structure is
 * created for each perf_event.
 *
 * The actual configuration of the event is set right before use in order
 * to use the X86_CONFIG macro.
 */
static struct perf_event_attr perf_miss_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

static struct perf_event_attr perf_hit_attr = {
	.type		= PERF_TYPE_RAW,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 0,
	.exclude_user	= 1,
};

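/*
 * Performance counter snapshots taken by measure_residency_fn() before
 * and after traversing the pseudo-locked memory.
 */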
struct residency_counts {
	u64 miss_before, hits_before;
	u64 miss_after,  hits_after;
};

static int measure_residency_fn(struct perf_event_attr *miss_attr,
				struct perf_event_attr *hit_attr,
				struct pseudo_lock_region *plr,
				struct residency_counts *counts)
{
	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
	struct perf_event *miss_event, *hit_event;
	int hit_pmcnum, miss_pmcnum;
	u32 saved_low, saved_high;
	unsigned int line_size;
	unsigned int size;
	unsigned long i;
	void *mem_r;
	u64 tmp;

	miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
						      NULL, NULL, NULL);
	if (IS_ERR(miss_event))
		goto out;

	hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
						     NULL, NULL, NULL);
	if (IS_ERR(hit_event))
		goto out_miss;

	local_irq_disable();
	/*
	 * Check any possible error state of the events used by performing
	 * one local read.
	 */
	if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}
	if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
		local_irq_enable();
		goto out_hit;
	}

	/*
	 * Disable hardware prefetchers.
	 */
	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);

	/*
	 * Initialize the rest of the local variables. The performance
	 * events have been validated right above with interrupts
	 * disabled - it is thus safe to read the counter indices.
	 */
	miss_pmcnum = x86_perf_rdpmc_index(miss_event);
	hit_pmcnum = x86_perf_rdpmc_index(hit_event);
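	/*
	 * The counters are read directly with rdpmc() in the measurement
	 * code below; going through the perf API there would itself access
	 * memory and could evict parts of the pseudo-locked region.
	 */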
	line_size = READ_ONCE(plr->line_size);
	mem_r = READ_ONCE(plr->kmem);
	size = READ_ONCE(plr->size);

	/*
	 * Read the counters twice - first to load the instructions used
	 * into the L1 cache, second to capture an accurate value that does
	 * not include cache misses incurred because of instruction loads.
	 */
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * From SDM: performing back-to-back fast reads is not guaranteed
	 * to be monotonic.
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_before = rdpmc(hit_pmcnum);
	miss_before = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	for (i = 0; i < size; i += line_size) {
		/*
		 * Add a barrier to prevent speculative execution of this
		 * loop reading beyond the end of the buffer.
		 */
		rmb();
		asm volatile("mov (%0,%1,1), %%eax\n\t"
			     :
			     : "r" (mem_r), "r" (i)
			     : "%eax", "memory");
	}
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	hits_after = rdpmc(hit_pmcnum);
	miss_after = rdpmc(miss_pmcnum);
	/*
	 * Use LFENCE to ensure all previous instructions are retired
	 * before proceeding.
	 */
	rmb();
	/* Re-enable hardware prefetchers */
	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
	local_irq_enable();
out_hit:
	perf_event_release_kernel(hit_event);
out_miss:
	perf_event_release_kernel(miss_event);
out:
	/*
	 * All counts will be zero on failure.
	 */
	counts->miss_before = miss_before;
	counts->hits_before = hits_before;
	counts->miss_after  = miss_after;
	counts->hits_after  = hits_after;
	return 0;
}

int resctrl_arch_measure_l2_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * Non-architectural event for the Goldmont Microarchitecture
	 * from Intel x86 Architecture Software Developer Manual (SDM):
	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
	 * Umask values:
	 *     L2_HIT   02H
	 *     L2_MISS  10H
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_ATOM_GOLDMONT:
	case INTEL_ATOM_GOLDMONT_PLUS:
		perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
						   .umask = 0x10);
		perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
						  .umask = 0x2);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */
	trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
			     counts.miss_after - counts.miss_before);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}

int resctrl_arch_measure_l3_residency(void *_plr)
{
	struct pseudo_lock_region *plr = _plr;
	struct residency_counts counts = {0};

	/*
	 * On the Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
	 * this platform the following events are used instead:
	 * LONGEST_LAT_CACHE 2EH (Documented in SDM)
	 *       REFERENCE 4FH
	 *       MISS      41H
	 */

	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL_X:
		/* On BDW the hit event counts references, not hits */
		perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
						  .umask = 0x4f);
		perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
						   .umask = 0x41);
		break;
	default:
		goto out;
	}

	measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
	/*
	 * If a failure prevented the measurements from succeeding,
	 * tracepoints will still be written and all counts will be zero.
	 */

	counts.miss_after -= counts.miss_before;
	if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
		/*
		 * On BDW references and misses are counted, need to adjust.
		 * Sometimes the "miss" counter reads a bit more than the
		 * references, for example, x references but x + 1 misses.
		 * To not report invalid hit values in this case we treat
		 * that as misses equal to references.
		 */
		/* First compute the number of cache references measured */
		counts.hits_after -= counts.hits_before;
		/* Next convert references to cache hits */
		counts.hits_after -= min(counts.miss_after, counts.hits_after);
	} else {
		counts.hits_after -= counts.hits_before;
	}

	trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
out:
	plr->thread_done = 1;
	wake_up_interruptible(&plr->lock_thread_wq);
	return 0;
}
518