// SPDX-License-Identifier: GPL-2.0-only
/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>

#include <linux/sizes.h>
#include <asm/perf_event.h>
#include <asm/msr.h>

#include "../perf_event.h"

struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				state;
};

/* BTS context states: */
enum {
	/* no ongoing AUX transactions */
	BTS_STATE_STOPPED = 0,
	/* AUX transaction is on, BTS tracing is disabled */
	BTS_STATE_INACTIVE,
	/* AUX transaction is on, BTS tracing is running */
	BTS_STATE_ACTIVE,
};

static struct bts_ctx __percpu *bts_ctx;

#define BTS_RECORD_SIZE		24
#define BTS_SAFETY_MARGIN	4080

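/*
 * A bts_phys describes one physically contiguous chunk of the AUX buffer;
 * bts_buffer ties the whole AUX mapping together and tracks the current
 * chunk, the software head and the amount of data collected so far.
 */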
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};

struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;
	unsigned int	nr_bufs;
	unsigned int	cur_buf;
	bool		snapshot;
	local_t		data_size;
	local_t		head;
	unsigned long	end;
	void		**data_pages;
	struct bts_phys	buf[] __counted_by(nr_bufs);
};

static struct pmu bts_pmu;

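/*
 * The AUX area may be backed by high-order allocations; such a page carries
 * its allocation order in page_private() with PagePrivate set, while a plain
 * single page has neither.
 */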
static int buf_nr_pages(struct page *page)
{
	if (!PagePrivate(page))
		return 1;

	return 1 << page_private(page);
}

static size_t buf_size(struct page *page)
{
	return buf_nr_pages(page) * PAGE_SIZE;
}

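/*
 * AUX setup callback: walk the pages perf allocated for the AUX area, group
 * them into physically contiguous chunks and trim each chunk to a whole
 * number of BTS records, tracking the unusable bytes as displacement/pad.
 */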
static void *
bts_buffer_setup_aux(struct perf_event *event, void **pages,
		     int nr_pages, bool overwrite)
{
	struct bts_buffer *bb;
	struct page *page;
	int cpu = event->cpu;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nr_buf, pad;

	/* count all the high order buffers */
	for (pg = 0, nr_buf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		pg += buf_nr_pages(page);
		nr_buf++;
	}

	/*
	 * To avoid interrupts in overwrite mode, only allow one physical
	 * buffer.
	 */
	if (overwrite && nr_buf > 1)
		return NULL;

	bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node);
	if (!bb)
		return NULL;

	bb->nr_pages = nr_pages;
	bb->nr_bufs = nr_buf;
	bb->snapshot = overwrite;
	bb->data_pages = pages;
	bb->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) {
		unsigned int __nr_pages;

		page = virt_to_page(pages[pg]);
		__nr_pages = buf_nr_pages(page);
		bb->buf[nr_buf].page = page;
		bb->buf[nr_buf].offset = offset;
		bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement;
		pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE;
		bb->buf[nr_buf].size -= pad;

		pg += __nr_pages;
		offset += __nr_pages << PAGE_SHIFT;
	}

	return bb;
}

static void bts_buffer_free_aux(void *data)
{
	kfree(data);
}

static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx)
{
	return bb->buf[idx].offset + bb->buf[idx].displacement;
}

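/*
 * Program the DS area for the current chunk: base, write index, absolute
 * maximum and interrupt threshold. In snapshot mode the threshold is placed
 * past the absolute maximum so the hardware never raises a PMI.
 */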
static void
bts_config_buffer(struct bts_buffer *bb)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &bb->buf[bb->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&bb->head);

	if (!bb->snapshot) {
		if (bb->end < phys->offset + buf_size(page))
			end = bb->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !bb->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}

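/*
 * Zero-fill the unused tail of the current chunk so that consumers don't
 * mistake stale bytes for BTS records.
 */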
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}

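/*
 * Fold the hardware write pointer from the DS area back into the AUX
 * buffer's software head and accumulated data_size.
 */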
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!bb)
		return;

	head = index + bts_buffer_offset(bb, bb->cur_buf);
	old = local_xchg(&bb->head, head);

	if (!bb->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			perf_aux_output_flag(&bts->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &bb->data_size);
	} else {
		local_set(&bb->data_size, head);
	}

	/*
	 * Since BTS is coherent, a compiler barrier is enough to ensure that
	 * BTS updating is ordered against bts::handle::event.
	 */
	barrier();
}

static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle);

/*
 * Ordering PMU callbacks wrt themselves and the PMI is done by means
 * of bts::state, which:
 *  - is set when bts::handle::event is valid, that is, between
 *    perf_aux_output_begin() and perf_aux_output_end();
 *  - is zero otherwise;
 *  - is ordered against bts::handle::event with a compiler barrier.
 */

static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!bb->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(bb);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS and bts::state goes ACTIVE
	 */
	wmb();

	/* INACTIVE/STOPPED -> ACTIVE */
	WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);

	intel_pmu_enable_bts(config);
}

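/*
 * PMU ::start callback: begin an AUX transaction, save the original DS
 * pointers so ::stop can restore them, then kick off tracing proper.
 */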
static void bts_event_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb;

	bb = perf_aux_output_begin(&bts->handle, event);
	if (!bb)
		goto fail_stop;

	if (bts_buffer_reset(bb, &bts->handle))
		goto fail_end_stop;

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	perf_event_itrace_started(event);
	event->hw.state = 0;

	__bts_event_start(event);

	return;

fail_end_stop:
	perf_aux_output_end(&bts->handle, 0);

fail_stop:
	event->hw.state = PERF_HES_STOPPED;
}

static void __bts_event_stop(struct perf_event *event, int state)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);

	/* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
	WRITE_ONCE(bts->state, state);

	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();
}

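/*
 * PMU ::stop callback: disable tracing, and with PERF_EF_UPDATE also close
 * the AUX transaction and restore the DS pointers saved at ::start time.
 */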
static void bts_event_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct bts_buffer *bb = NULL;
	int state = READ_ONCE(bts->state);

	if (state == BTS_STATE_ACTIVE)
		__bts_event_stop(event, BTS_STATE_STOPPED);

	if (state != BTS_STATE_STOPPED)
		bb = perf_get_aux(&bts->handle);

	event->hw.state |= PERF_HES_STOPPED;

	if (flags & PERF_EF_UPDATE) {
		bts_update(bts);

		if (bb) {
			if (bb->snapshot)
				bts->handle.head =
					local_xchg(&bb->data_size,
						   bb->nr_pages << PAGE_SHIFT);
			perf_aux_output_end(&bts->handle,
					    local_xchg(&bb->data_size, 0));
		}

		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
		cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
		cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
	}
}

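/*
 * Re-enable tracing on this CPU after the PMI path disabled it:
 * INACTIVE -> ACTIVE, unless the interrupt handler already stopped the
 * transaction.
 */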
void intel_bts_enable_local(void)
{
	struct bts_ctx *bts;
	int state;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);
	state = READ_ONCE(bts->state);
	/*
	 * Here we transition from INACTIVE to ACTIVE;
	 * if we instead are STOPPED from the interrupt handler,
	 * stay that way. Can't be ACTIVE here though.
	 */
	if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
		return;

	if (state == BTS_STATE_STOPPED)
		return;

	if (bts->handle.event)
		__bts_event_start(bts->handle.event);
}

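/*
 * Pause tracing on this CPU for the duration of the PMI:
 * ACTIVE -> INACTIVE; STOPPED and INACTIVE are left alone.
 */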
void intel_bts_disable_local(void)
{
	struct bts_ctx *bts;

	if (!bts_ctx)
		return;

	bts = this_cpu_ptr(bts_ctx);

	/*
	 * Here we transition from ACTIVE to INACTIVE;
	 * do nothing for STOPPED or INACTIVE.
	 */
	if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
		return;

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
}

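/*
 * Pick the next stretch of AUX space to trace into: stay in the current
 * physical chunk if it has room, otherwise pad it out and advance to the
 * next one, and clamp the end so we don't run far past the wakeup point.
 */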
static int
bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (bb->snapshot)
		return 0;

	head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1);

	phys = &bb->buf[bb->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = bb->cur_buf + 1;
		if (next_buf >= bb->nr_bufs)
			next_buf = 0;
		next_phys = &bb->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
			next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				bb->cur_buf = next_buf;
				local_set(&bb->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	bb->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}

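/*
 * PMI handler for BTS: claim the NMI if the write pointer crossed the
 * threshold, publish the data collected so far and start a new AUX
 * transaction, or stop if the buffer is out of space.
 */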
int intel_bts_interrupt(void)
{
	struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
	struct bts_ctx *bts;
	struct perf_event *event;
	struct bts_buffer *bb;
	s64 old_head;
	int err = -ENOSPC, handled = 0;

	if (!bts_ctx)
		return 0;

	bts = this_cpu_ptr(bts_ctx);
	event = bts->handle.event;
	/*
	 * The only surefire way of knowing if this NMI is ours is by checking
	 * the write ptr against the PMI threshold.
	 */
	if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
		handled = 1;

	/*
	 * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
	 * so we can only be INACTIVE or STOPPED
	 */
	if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
		return handled;

	bb = perf_get_aux(&bts->handle);
	if (!bb)
		return handled;

	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving
	 */
	if (bb->snapshot)
		return 0;

	old_head = local_read(&bb->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&bb->head))
		return handled;

	perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0));

	bb = perf_aux_output_begin(&bts->handle, event);
	if (bb)
		err = bts_buffer_reset(bb, &bts->handle);

	if (err) {
		WRITE_ONCE(bts->state, BTS_STATE_STOPPED);

		if (bb) {
			/*
			 * BTS_STATE_STOPPED should be visible before
			 * handle::event is cleared
			 */
			barrier();
			perf_aux_output_end(&bts->handle, 0);
		}
	}

	return 1;
}

static void bts_event_del(struct perf_event *event, int mode)
{
	bts_event_stop(event, PERF_EF_UPDATE);
}

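/*
 * Only one BTS event can be scheduled per CPU at a time; reject the add if
 * the fixed BTS counter is already taken or a transaction is in flight.
 */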
static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_ctx *bts = this_cpu_ptr(bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED)
			return -EINVAL;
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}

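/*
 * BTS is mutually exclusive with LBR users, so the lbr_exclusive slot is
 * taken here and released again in bts_event_destroy().
 */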
static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 */
	if (event->attr.exclude_kernel) {
		ret = perf_allow_kernel();
		if (ret)
			return ret;
	}

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}

static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64))
		return -ENODEV;

	x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
	if (!x86_pmu.bts)
		return -ENODEV;

	if (boot_cpu_has(X86_FEATURE_PTI)) {
		/*
		 * BTS hardware writes through a virtual memory map; we must
		 * either use the kernel physical map, or the user mapping of
		 * the AUX buffer.
		 *
		 * However, since this driver supports per-CPU and per-task inherit
		 * we cannot use the user mapping since it will not be available
		 * if we're not running the owning process.
		 *
		 * With PTI we can't use the kernel map either, because it's not
		 * there when we run userspace.
		 *
		 * For now, disable this driver when using PTI.
		 */
		return -ENODEV;
	}

	bts_ctx = alloc_percpu(struct bts_ctx);
	if (!bts_ctx)
		return -ENOMEM;

	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
				  PERF_PMU_CAP_EXCLUSIVE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
early_initcall(bts_init);