// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <linux/sizes.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../perf_event.h"

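/*
 * Per-CPU BTS tracing context: the AUX output handle for the transaction in
 * flight, a backup of the DS area fields that the driver rewrites while
 * tracing (restored in bts_event_stop()), and this CPU's current BTS_STATE_*.
 */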
struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

static struct bts_ctx __percpu *bts_ctx;

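/*
 * A BTS record is three 8-byte fields (branch-from, branch-to, flags).
 * BTS_SAFETY_MARGIN is how far before the end of the usable space the PMI
 * threshold is placed, so that records written while the interrupt is being
 * delivered still fit.
 */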
#define BTS_RECORD_SIZE		24
#define BTS_SAFETY_MARGIN	4080

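/*
 * One physically contiguous chunk of the AUX buffer: @page is the first page
 * of the (possibly high-order) allocation, @offset is its position within the
 * AUX buffer, @displacement is the number of bytes skipped at its start to
 * keep records BTS_RECORD_SIZE-aligned across chunks, and @size is the number
 * of usable bytes that follow.
 */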
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};

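/*
 * Software state of one AUX buffer: the usable size (rounded down to whole
 * records), the write position (@head, an offset into the AUX buffer), the
 * amount of newly written data not yet handed to perf_aux_output_end()
 * (@data_size), the offset at which tracing must stop (@end), and the array
 * of physical chunks with the index of the one currently in use (@cur_buf).
 */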
struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;
	unsigned int	nr_bufs;
	unsigned int	cur_buf;
	bool		snapshot;
	local_t		data_size;
	local_t		head;
	unsigned long	end;
	void		**data_pages;
	struct bts_phys	buf[] __counted_by(nr_bufs);
};

static struct pmu bts_pmu;

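/*
 * AUX pages may come from high-order allocations; in that case the head page
 * carries PagePrivate and page_private() holds the allocation order. These
 * helpers return the page count and byte size of such a chunk.
 */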
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}

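/*
 * PMU ::setup_aux callback: group the AUX pages into physically contiguous
 * chunks and precompute each chunk's offset, displacement and usable size so
 * that BTS records never straddle a chunk boundary. Overwrite (snapshot) mode
 * only works with a single chunk, since switching chunks requires a PMI.
 */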
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *bb;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nr_buf, pad;

	/* count all the high order buffers */
	for (pg = 0, nr_buf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nr_buf++;
	}

	/*
	 * To avoid interrupts in overwrite mode, only allow one physical
	 * buffer.
	 */
	if (overwrite && nr_buf > 1)
		return NULL;

	bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node);
	if (!bb)
		return NULL;

	bb->nr_pages = nr_pages;
	bb->nr_bufs = nr_buf;
	bb->snapshot = overwrite;
	bb->data_pages = pages;
	bb->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		bb->buf[nr_buf].page = page;
		bb->buf[nr_buf].offset = offset;
		bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement;
		pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE;
		bb->buf[nr_buf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return bb;
}

static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx)
{
	return bb->buf[idx].offset + bb->buf[idx].displacement;
}

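/*
 * Program the DS area for the current chunk: base, write index, absolute
 * maximum and PMI threshold. In non-snapshot mode the threshold is placed
 * BTS_SAFETY_MARGIN (or at least one record) before the end; in snapshot mode
 * it is pushed past the absolute maximum so that no PMI is ever raised.
 */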
static void
bts_config_buffer(struct bts_buffer *bb)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &bb->buf[bb->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&bb->head);

	if (!bb->snapshot) {
		if (bb->end < phys->offset + buf_size(page))
			end = bb->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !bb->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

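/* Zero-fill the unused tail of a chunk before tracing skips over it. */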
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}

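/*
 * Sync the software view with the hardware write pointer: advance bb::head
 * from the DS write index, account the newly written bytes in bb::data_size
 * (or the absolute head in snapshot mode) and flag truncation if the hardware
 * ran into the absolute maximum.
 */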
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!bb)
		return;

	head = index + bts_buffer_offset(bb, bb->cur_buf);
	old = local_xchg(&bb->head, head);

	if (!bb->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
			                     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &bb->data_size);
	} else {
		local_set(&bb->data_size, head);
	}

	/*
	 * Since BTS is coherent, a compiler barrier is enough to ensure that
	 * the BTS update is ordered against bts::handle::event.
	 */
	barrier();
}

static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

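/*
 * Program the DS buffer for the current chunk, mark the context ACTIVE and
 * enable BTS, honouring the event's exclude_kernel/exclude_user settings and
 * requesting a PMI at the threshold unless we are in snapshot mode.
 */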
static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!bb->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(bb);

	/*
	 * Local barrier to make sure the DS configuration is in place
	 * before we enable BTS and bts::state goes ACTIVE.
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);
}

static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb;

	bb = perf_aux_output_begin(&bts->handle, event);
	if (!bb)
		goto fail_stop;

	if (bts_buffer_reset(bb, &bts->handle))
		goto fail_end_stop;

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		bb = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (bb) {
			if (bb->snapshot)
				bts->handle.head =
					local_xchg(&bb->data_size,
						   bb->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&bb->data_size, 0));
		}

		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

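/*
 * intel_bts_enable_local()/intel_bts_disable_local() pause and resume BTS
 * tracing on this CPU without ending the AUX transaction; they are called
 * from the core Intel PMU enable/disable paths.
 */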
void intel_bts_enable_local(void)
{
	struct bts_ctx *bts;
	int state;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);
	state = READ_ONCE(bts->state);
	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
	struct bts_ctx *bts;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

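/*
 * Pick the region of the AUX buffer that the next run of tracing may write
 * into: compute the space left in the current chunk from handle::head, skip
 * (and zero-pad) to the next chunk if too little remains, clamp against the
 * wakeup watermark and record the stop offset in bb::end. Returns -ENOSPC
 * when there is no usable space left.
 */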
static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (bb->snapshot)
		return 0;

	head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1);

	phys = &bb->buf[bb->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = bb->cur_buf + 1;
		if (next_buf >= bb->nr_bufs)
			next_buf = 0;
		next_phys = &bb->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
		      next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				bb->cur_buf = next_buf;
				local_set(&bb->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	bb->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}

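/*
 * Called from the PMI handler: claim the NMI if the DS write pointer crossed
 * the threshold, fold the new data into the current AUX transaction, close it
 * and try to start a new one; if that fails, mark the context STOPPED.
 */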
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts;
	struct perf_event *event;
	struct bts_buffer *bb;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	if (!bts_ctx)
		return 0;

	bts = this_cpu_ptr(bts_ctx);
	event = bts->handle.event;
	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * This is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED.
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	bb = perf_get_aux(&bts->handle);
	if (!bb)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving.
	 */
	if (bb->snapshot)
		return 0;

	old_head = local_read(&bb->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&bb->head))
		return handled;

	perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0));

	bb = perf_aux_output_begin(&bts->handle, event);
	if (bb)
		err = bts_buffer_reset(bb, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (bb) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * handle::event is cleared.
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}


static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow the intel_bts driver for unprivileged
	 * users on paranoid systems, since it provides trace data
	 * to the user in a zero-copy fashion.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel();
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

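/*
 * Register the intel_bts PMU: requires DTES64 and BTS support and a per-CPU
 * context; the driver is disabled entirely when page table isolation is in
 * use (see the comment below).
 */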
static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64))
		return -ENODEV;

	x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
	if (!x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map, so we must
		 * either use the kernel physical map or the user mapping of
		 * the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task
		 * inherit, we cannot use the user mapping since it will not be
		 * available if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's
		 * not there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

	bts_ctx = alloc_percpu(struct bts_ctx);
	if (!bts_ctx)
		return -ENOMEM;

	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
				  PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);