1 /*
2 * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7 /*
8 * hwt(4) Intel Processor Trace (PT) backend
9 *
10 * Driver Design Overview
11 *
12 * - Since PT is configured on a per-core basis, the driver uses
13 * 'smp_rendezvous' to start and disable tracing on each target core.
14 * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
15 * each traced CPU core or thread. Upon initialization, a ToPA configuration
16 * is generated for each 'pt_ctx' structure using the HWT tracing buffers.
17 * The HWT tracing buffer is split into 4K ToPA entries. Currently, each
18 * 4K ToPA entry is configured to trigger an interrupt after it is filled.
19 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
20 * relevant PT registers. Every time a traced thread is switched
21 * out or in, its state will be saved to or loaded from its corresponding
22 * 'pt_ctx' context.
23 * - When tracing starts, the PT hardware will start writing data into the
24 * tracing buffer. When a TOPA_INT entry is filled, it will trigger an
25 * interrupt before continuing. The interrupt handler will then fetch the
26 * last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
27 * The driver is currently configured to use the NMI interrupt line.
28 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
29 * and uses the offsets to decode data from the tracing buffer.
30 *
31 * Future improvements and limitations
32 *
33 * - We currently configure the PT hardware to trigger an interrupt whenever
34 * a 4K ToPA entry is filled. While this is fine when tracing smaller
35 * functions or infrequent code paths, this will generate too much interrupt
36 * traffic when tracing hotter functions. A proper solution for this issue
37 * should estimate the amount of data generated by the current configuration
38 * and use it to determine interrupt frequency.
39 *
40 * - Support for more tracing options and PT features.
41 *
42 */
43
44 #include <sys/systm.h>
45 #include <sys/hwt.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/module.h>
50 #include <sys/mutex.h>
51 #include <sys/sdt.h>
52 #include <sys/smp.h>
53 #include <sys/taskqueue.h>
54
55 #include <vm/vm.h>
56 #include <vm/vm_page.h>
57
58 #include <machine/atomic.h>
59 #include <machine/cpufunc.h>
60 #include <machine/fpu.h>
61 #include <machine/smp.h>
62 #include <machine/specialreg.h>
63
64 #include <x86/apicvar.h>
65 #include <x86/x86_var.h>
66
67 #include <dev/hwt/hwt_context.h>
68 #include <dev/hwt/hwt_vm.h>
69 #include <dev/hwt/hwt_backend.h>
70 #include <dev/hwt/hwt_config.h>
71 #include <dev/hwt/hwt_cpu.h>
72 #include <dev/hwt/hwt_record.h>
73 #include <dev/hwt/hwt_thread.h>
74
75 #include <amd64/pt/pt.h>
76
/*
 * Debug tracing helper: forwards to printf() when the driver is built with
 * PT_DEBUG defined and expands to nothing otherwise.
 */
#ifdef PT_DEBUG
#define	dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define	dprintf(fmt, ...)
#endif

/* RTIT_CTL bits accepted from userspace; all other bits are masked off. */
#define	PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)

/* XCR0 bits that must be set while XSAVES/XRSTORS is executed. */
#define	PT_XSAVE_MASK	(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
/* Feature set used to size and lay out the per-context XSAVE area. */
#define	PT_XSTATE_BV	(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
/*
 * NOTE(review): not referenced in this file; pt_configure_ranges() clamps
 * against PT_IP_FILTER_MAX_RANGES instead.
 */
#define	PT_MAX_IP_RANGES	2

/* Initial value stored in the saved RTIT_OUTPUT_MASK_PTRS image. */
#define	PT_TOPA_MASK_PTRS	0x7f
/* Mask/shift used to extract the current ToPA entry index. */
#define	PT_TOPA_PAGE_MASK	0xffffff80
#define	PT_TOPA_PAGE_SHIFT	7

/* CPUID leaf queried for Processor Trace capabilities. */
#define	CPUID_PT_LEAF	0x14
94
95 MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");
96
97 SDT_PROVIDER_DEFINE(pt);
98 SDT_PROBE_DEFINE(pt, , , topa__intr);
99
100 TASKQUEUE_FAST_DEFINE_THREAD(pt);
101
102 static void pt_send_buffer_record(void *arg, int pending __unused);
103 static int pt_topa_intr(struct trapframe *tf);
104
/*
 * Intel Processor Trace XSAVE-managed state.
 *
 * This structure is overlaid on a context's save area at
 * 'pt_info.pt_xsave_offset' (see pt_ctx_get_ext_area()). The driver
 * pre-populates it in pt_backend_configure() and pt_configure_ranges()
 * before the state is loaded with XRSTORS.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;		/* control flags, incl. TOPA/TRACEEN */
	uint64_t rtit_output_base;	/* physical address of ToPA table */
	uint64_t rtit_output_mask_ptrs;	/* ToPA table index / output offset */
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;		/* IP filter range 0 start */
	uint64_t rtit_addr0_b;		/* IP filter range 0 end */
	uint64_t rtit_addr1_a;		/* IP filter range 1 start */
	uint64_t rtit_addr1_b;		/* IP filter range 1 end */
};
119
120 struct pt_buffer {
121 uint64_t *topa_hw; /* ToPA table entries. */
122 size_t size;
123 struct mtx lock; /* Lock for fields below. */
124 vm_offset_t offset;
125 uint64_t wrap_count;
126 int curpage;
127 };
128
129 struct pt_ctx {
130 int id;
131 struct pt_buffer buf; /* ToPA buffer metadata */
132 struct task task; /* ToPA buffer notification task */
133 struct hwt_context *hwt_ctx;
134 uint8_t *save_area; /* PT XSAVE area */
135 };
136 /* PT tracing contexts used for CPU mode. */
137 static struct pt_ctx *pt_pcpu_ctx;
138
/*
 * Per-CPU tracing state.
 *
 * PT_DISABLED is the zero value of a freshly allocated or reset slot,
 * PT_STOPPED is set by pt_backend_configure() and pt_cpu_stop(), and
 * PT_ACTIVE is set by pt_cpu_start().
 */
enum pt_cpu_state {
	PT_DISABLED = 0,
	PT_STOPPED,
	PT_ACTIVE
};

/*
 * Per-CPU driver bookkeeping, allocated in pt_init() and indexed by CPU id.
 */
static struct pt_cpu {
	struct pt_ctx *ctx;	 /* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;
149
150 /*
151 * PT-related CPUID bits.
152 */
153 static struct pt_cpu_info {
154 uint32_t l0_eax;
155 uint32_t l0_ebx;
156 uint32_t l0_ecx;
157 uint32_t l1_eax;
158 uint32_t l1_ebx;
159 size_t xsave_area_size;
160 size_t xstate_hdr_offset;
161 size_t pt_xsave_offset;
162 } pt_info __read_mostly;
163
/* Set once pt_init() completes; checked by pt_deinit(). */
static bool initialized = false;
/*
 * Non-zero while a CPU-mode tracing session is running.
 * Only accessed through atomic_swap_32(), hence the fixed-width
 * unsigned type matching that primitive's operand.
 */
static uint32_t cpu_mode_ctr = 0;
166
167 static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)168 pt_cpu_get_state(int cpu_id)
169 {
170 return (atomic_load_int(&pt_pcpu[cpu_id].state));
171 }
172
173 static __inline void
pt_cpu_set_state(int cpu_id,enum pt_cpu_state state)174 pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
175 {
176 atomic_store_int(&pt_pcpu[cpu_id].state, state);
177 }
178
179 static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx * ctx)180 pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
181 {
182 return ((struct xstate_hdr *)(ctx->save_area +
183 pt_info.xstate_hdr_offset));
184 }
185
186
187 static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx * ctx)188 pt_ctx_get_ext_area(struct pt_ctx *ctx)
189 {
190 return ((struct pt_ext_area *)(ctx->save_area +
191 pt_info.pt_xsave_offset));
192 }
193
194 /*
195 * Updates current trace buffer offset from the
196 * ToPA MSRs. Records if the trace buffer wrapped.
197 */
198 static __inline void
pt_update_buffer(struct pt_buffer * buf)199 pt_update_buffer(struct pt_buffer *buf)
200 {
201 uint64_t reg;
202 int curpage;
203
204 /* Update buffer offset. */
205 reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
206 curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
207 mtx_lock_spin(&buf->lock);
208 /* Check if the output wrapped. */
209 if (buf->curpage > curpage)
210 buf->wrap_count++;
211 buf->curpage = curpage;
212 buf->offset = reg >> 32;
213 mtx_unlock_spin(&buf->lock);
214
215 dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
216 buf->wrap_count, buf->curpage, buf->offset);
217 }
218
219 static __inline void
pt_fill_buffer_record(int id,struct pt_buffer * buf,struct hwt_record_entry * rec)220 pt_fill_buffer_record(int id, struct pt_buffer *buf,
221 struct hwt_record_entry *rec)
222 {
223 rec->record_type = HWT_RECORD_BUFFER;
224 rec->buf_id = id;
225 rec->curpage = buf->curpage;
226 rec->offset = buf->offset + (buf->wrap_count * buf->size);
227 }
228
229 /*
230 * Enables or disables tracing on curcpu
231 * using the XSAVE/XRSTOR PT extensions.
232 */
233 static void
pt_cpu_toggle_local(uint8_t * save_area,bool enable)234 pt_cpu_toggle_local(uint8_t *save_area, bool enable)
235 {
236 u_long xcr0, cr0;
237 u_long xss;
238
239 cr0 = rcr0();
240 if (cr0 & CR0_TS)
241 clts();
242 xcr0 = rxcr(XCR0);
243 if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
244 load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
245 xss = rdmsr(MSR_IA32_XSS);
246 wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);
247
248 if (!enable) {
249 KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
250 ("%s: PT is disabled", __func__));
251 xsaves(save_area, XFEATURE_ENABLED_PT);
252 } else {
253 KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
254 ("%s: PT is enabled", __func__));
255 xrstors(save_area, XFEATURE_ENABLED_PT);
256 }
257 wrmsr(MSR_IA32_XSS, xss);
258 if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
259 load_xcr(XCR0, xcr0);
260 if (cr0 & CR0_TS)
261 load_cr0(cr0);
262 }
263
264 /*
265 * Starts PT tracing on 'curcpu'.
266 */
267 static void
pt_cpu_start(void * dummy)268 pt_cpu_start(void *dummy)
269 {
270 struct pt_cpu *cpu;
271
272 cpu = &pt_pcpu[curcpu];
273 MPASS(cpu->ctx != NULL);
274
275 dprintf("%s: curcpu %d\n", __func__, curcpu);
276 load_cr4(rcr4() | CR4_XSAVE);
277 wrmsr(MSR_IA32_RTIT_STATUS, 0);
278 pt_cpu_set_state(curcpu, PT_ACTIVE);
279 pt_cpu_toggle_local(cpu->ctx->save_area, true);
280 }
281
282 /*
283 * Stops PT tracing on 'curcpu'.
284 * Updates trace buffer offset to ensure
285 * any data generated between the last interrupt
286 * and the trace stop gets picked up by userspace.
287 */
288 static void
pt_cpu_stop(void * dummy)289 pt_cpu_stop(void *dummy)
290 {
291 struct pt_cpu *cpu;
292 struct pt_ctx *ctx;
293
294 /* Shutdown may occur before PT gets properly configured. */
295 if (pt_cpu_get_state(curcpu) == PT_DISABLED)
296 return;
297
298 cpu = &pt_pcpu[curcpu];
299 ctx = cpu->ctx;
300 MPASS(ctx != NULL);
301 dprintf("%s: curcpu %d\n", __func__, curcpu);
302
303 pt_cpu_set_state(curcpu, PT_STOPPED);
304 pt_cpu_toggle_local(cpu->ctx->save_area, false);
305 pt_update_buffer(&ctx->buf);
306 }
307
308 /*
309 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
310 * The HWT trace buffer is split into 4K ToPA table entries and used
311 * as a circular buffer, meaning that the last ToPA entry points to
312 * the first ToPA entry. Each entry is configured to raise an
313 * interrupt after being filled.
314 */
315 static int
pt_topa_prepare(struct pt_ctx * ctx,struct hwt_vm * vm)316 pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
317 {
318 struct pt_buffer *buf;
319 size_t topa_size;
320 int i;
321
322 topa_size = TOPA_SIZE_4K;
323 buf = &ctx->buf;
324
325 KASSERT(buf->topa_hw == NULL,
326 ("%s: ToPA info already exists", __func__));
327 buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
328 M_ZERO | M_WAITOK);
329 dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
330 buf->size = vm->npages * PAGE_SIZE;
331 for (i = 0; i < vm->npages; i++) {
332 buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
333 /*
334 * XXX: TOPA_INT should ideally be set according to
335 * expected amount of incoming trace data. Too few TOPA_INT
336 * entries will not trigger interrupts often enough when tracing
337 * smaller functions.
338 */
339 buf->topa_hw[i] |= TOPA_INT;
340 }
341 buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;
342
343 return (0);
344 }
345
346 /*
347 * Configures IP filtering for trace generation.
348 * A maximum of 2 ranges can be specified due to
349 * limitations imposed by the XSAVE/XRSTOR PT extensions.
350 */
351 static int
pt_configure_ranges(struct pt_ctx * ctx,struct pt_cpu_config * cfg)352 pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
353 {
354 struct pt_ext_area *pt_ext;
355 int nranges_supp, n, error = 0;
356
357 pt_ext = pt_ctx_get_ext_area(ctx);
358 if (pt_info.l0_ebx & CPUPT_IPF) {
359 nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
360 CPUPT_NADDR_S;
361
362 if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
363 nranges_supp = PT_IP_FILTER_MAX_RANGES;
364 n = cfg->nranges;
365 if (n > nranges_supp) {
366 printf("%s: %d IP filtering ranges requested, CPU "
367 "supports %d, truncating\n",
368 __func__, n, nranges_supp);
369 n = nranges_supp;
370 }
371
372 switch (n) {
373 case 2:
374 pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
375 pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
376 pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
377 case 1:
378 pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
379 pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
380 pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
381 break;
382 default:
383 error = (EINVAL);
384 break;
385 };
386 } else
387 error = (ENXIO);
388
389 return (error);
390 }
391
392 static int
pt_init_ctx(struct pt_ctx * pt_ctx,struct hwt_vm * vm,int ctx_id)393 pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
394 {
395
396 dprintf("%s: ctx id %d\n", __func__, ctx_id);
397
398 KASSERT(pt_ctx->buf.topa_hw == NULL,
399 ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));
400
401 memset(pt_ctx, 0, sizeof(struct pt_ctx));
402 mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
403 pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
404 M_PT, M_NOWAIT | M_ZERO);
405 if (pt_ctx->save_area == NULL)
406 return (ENOMEM);
407 dprintf("%s: preparing ToPA buffer\n", __func__);
408 if (pt_topa_prepare(pt_ctx, vm) != 0) {
409 dprintf("%s: failed to prepare ToPA buffer\n", __func__);
410 free(pt_ctx->save_area, M_PT);
411 return (ENOMEM);
412 }
413
414 pt_ctx->id = ctx_id;
415 TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);
416
417 return (0);
418 }
419
420 static void
pt_deinit_ctx(struct pt_ctx * pt_ctx)421 pt_deinit_ctx(struct pt_ctx *pt_ctx)
422 {
423
424 if (pt_ctx->buf.topa_hw != NULL)
425 free(pt_ctx->buf.topa_hw, M_PT);
426 if (pt_ctx->save_area != NULL)
427 free(pt_ctx->save_area, M_PT);
428 memset(pt_ctx, 0, sizeof(*pt_ctx));
429 pt_ctx->buf.topa_hw = NULL;
430 }
431
432 /*
433 * HWT backend configuration method.
434 *
435 * Checks and translates the user-defined configuration to a
436 * set of PT tracing features. Uses the feature set to initialize
437 * the tracing context for the target CPU or thread.
438 */
439 static int
pt_backend_configure(struct hwt_context * ctx,int cpu_id,int thread_id)440 pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
441 {
442 struct hwt_cpu *hwt_cpu;
443 struct hwt_thread *thr;
444 struct pt_ctx *pt_ctx;
445 struct pt_cpu_config *cfg;
446 struct pt_ext_area *pt_ext;
447 struct xstate_hdr *hdr;
448 int error;
449
450 dprintf("%s\n", __func__);
451
452 cfg = (struct pt_cpu_config *)ctx->config;
453 pt_ctx = NULL;
454
455 /* Clear any flags we don't support yet. */
456 cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
457 if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
458 if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
459 printf("%s: CPU does not support generating MTC "
460 "packets\n", __func__);
461 return (ENXIO);
462 }
463 }
464
465 if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
466 if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
467 printf("%s: CPU does not support CR3 filtering\n",
468 __func__);
469 return (ENXIO);
470 }
471 }
472
473 if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
474 if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
475 printf("%s: CPU does not support TNT\n", __func__);
476 return (ENXIO);
477 }
478 }
479 /* TODO: support for more config bits. */
480
481 if (ctx->mode == HWT_MODE_CPU) {
482 TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
483 if (hwt_cpu->cpu_id != cpu_id)
484 continue;
485 pt_ctx = &pt_pcpu_ctx[cpu_id];
486 break;
487 }
488 } else {
489 TAILQ_FOREACH(thr, &ctx->threads, next) {
490 if (thr->thread_id != thread_id)
491 continue;
492 KASSERT(thr->private != NULL,
493 ("%s: hwt thread private"
494 " not set, thr %p",
495 __func__, thr));
496 pt_ctx = (struct pt_ctx *)thr->private;
497 break;
498 }
499 }
500 if (pt_ctx == NULL)
501 return (ENOENT);
502
503 dprintf("%s: preparing MSRs\n", __func__);
504 pt_ext = pt_ctx_get_ext_area(pt_ctx);
505 hdr = pt_ctx_get_xstate_hdr(pt_ctx);
506
507 pt_ext->rtit_ctl |= cfg->rtit_ctl;
508 if (cfg->nranges != 0) {
509 dprintf("%s: preparing IPF ranges\n", __func__);
510 if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
511 return (error);
512 }
513 pt_ctx->hwt_ctx = ctx;
514 pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
515 pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
516 pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
517 hdr->xstate_bv = XFEATURE_ENABLED_PT;
518 hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
519 XSTATE_XCOMP_BV_COMPACT;
520 pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
521 pt_pcpu[cpu_id].ctx = pt_ctx;
522 pt_cpu_set_state(cpu_id, PT_STOPPED);
523
524 return (0);
525 }
526
527 /*
528 * hwt backend trace start operation. CPU affine.
529 */
530 static void
pt_backend_enable(struct hwt_context * ctx,int cpu_id)531 pt_backend_enable(struct hwt_context *ctx, int cpu_id)
532 {
533 if (ctx->mode == HWT_MODE_CPU)
534 return;
535
536 KASSERT(curcpu == cpu_id,
537 ("%s: attempting to start PT on another cpu", __func__));
538 pt_cpu_start(NULL);
539 CPU_SET(cpu_id, &ctx->cpu_map);
540 }
541
542 /*
543 * hwt backend trace stop operation. CPU affine.
544 */
545 static void
pt_backend_disable(struct hwt_context * ctx,int cpu_id)546 pt_backend_disable(struct hwt_context *ctx, int cpu_id)
547 {
548 struct pt_cpu *cpu;
549
550 if (ctx->mode == HWT_MODE_CPU)
551 return;
552
553 KASSERT(curcpu == cpu_id,
554 ("%s: attempting to disable PT on another cpu", __func__));
555 pt_cpu_stop(NULL);
556 CPU_CLR(cpu_id, &ctx->cpu_map);
557 cpu = &pt_pcpu[cpu_id];
558 cpu->ctx = NULL;
559 }
560
561 /*
562 * hwt backend trace start operation for remote CPUs.
563 */
564 static int
pt_backend_enable_smp(struct hwt_context * ctx)565 pt_backend_enable_smp(struct hwt_context *ctx)
566 {
567
568 dprintf("%s\n", __func__);
569 if (ctx->mode == HWT_MODE_CPU &&
570 atomic_swap_32(&cpu_mode_ctr, 1) != 0)
571 return (-1);
572
573 KASSERT(ctx->mode == HWT_MODE_CPU,
574 ("%s: should only be used for CPU mode", __func__));
575 smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
576
577 return (0);
578 }
579
580 /*
581 * hwt backend trace stop operation for remote CPUs.
582 */
583 static int
pt_backend_disable_smp(struct hwt_context * ctx)584 pt_backend_disable_smp(struct hwt_context *ctx)
585 {
586
587 dprintf("%s\n", __func__);
588 if (ctx->mode == HWT_MODE_CPU &&
589 atomic_swap_32(&cpu_mode_ctr, 0) == 0)
590 return (-1);
591
592 if (CPU_EMPTY(&ctx->cpu_map)) {
593 dprintf("%s: empty cpu map\n", __func__);
594 return (-1);
595 }
596 smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);
597
598 return (0);
599 }
600
601 /*
602 * HWT backend initialization method.
603 *
604 * Installs the ToPA interrupt handler and initializes
605 * the tracing contexts used for HWT_MODE_CPU.
606 */
607 static int
pt_backend_init(struct hwt_context * ctx)608 pt_backend_init(struct hwt_context *ctx)
609 {
610 struct hwt_cpu *hwt_cpu;
611 int error;
612
613 dprintf("%s\n", __func__);
614 if (ctx->mode == HWT_MODE_CPU) {
615 TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
616 error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
617 hwt_cpu->vm, hwt_cpu->cpu_id);
618 if (error)
619 return (error);
620 }
621 }
622
623 return (0);
624 }
625
626 /*
627 * HWT backend teardown method.
628 *
629 * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
630 * and releases all previously allocated ToPA metadata.
631 */
632 static int
pt_backend_deinit(struct hwt_context * ctx)633 pt_backend_deinit(struct hwt_context *ctx)
634 {
635 struct pt_ctx *pt_ctx;
636 struct hwt_thread *thr;
637 int cpu_id;
638
639 dprintf("%s\n", __func__);
640
641 pt_backend_disable_smp(ctx);
642 if (ctx->mode == HWT_MODE_THREAD) {
643 TAILQ_FOREACH(thr, &ctx->threads, next) {
644 KASSERT(thr->private != NULL,
645 ("%s: thr->private not set", __func__));
646 pt_ctx = (struct pt_ctx *)thr->private;
647 pt_deinit_ctx(pt_ctx);
648 }
649 } else {
650 CPU_FOREACH(cpu_id) {
651 if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
652 continue;
653 if (pt_pcpu[cpu_id].ctx != NULL) {
654 KASSERT(pt_pcpu[cpu_id].ctx ==
655 &pt_pcpu_ctx[cpu_id],
656 ("%s: CPU mode tracing with non-cpu mode PT"
657 "context active",
658 __func__));
659 pt_pcpu[cpu_id].ctx = NULL;
660 }
661 pt_ctx = &pt_pcpu_ctx[cpu_id];
662 pt_deinit_ctx(pt_ctx);
663 memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
664 }
665 }
666
667 return (0);
668 }
669
670 /*
671 * Fetches current offset into the tracing buffer.
672 */
673 static int
pt_backend_read(struct hwt_vm * vm,int * curpage,vm_offset_t * curpage_offset,uint64_t * data)674 pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
675 uint64_t *data)
676 {
677 struct pt_buffer *buf;
678
679 if (vm->ctx->mode == HWT_MODE_THREAD)
680 buf = &((struct pt_ctx *)vm->thr->private)->buf;
681 else
682 buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
683 mtx_lock_spin(&buf->lock);
684 *curpage = buf->curpage;
685 *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
686 mtx_unlock_spin(&buf->lock);
687
688 return (0);
689 }
690
691 /*
692 * HWT thread creation hook.
693 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
694 */
695 static int
pt_backend_alloc_thread(struct hwt_thread * thr)696 pt_backend_alloc_thread(struct hwt_thread *thr)
697 {
698 struct pt_ctx *pt_ctx;
699 int error;
700
701 /* Omit M_WAITOK since this might get invoked a non-sleepable context */
702 pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
703 if (pt_ctx == NULL)
704 return (ENOMEM);
705
706 error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
707 if (error)
708 return (error);
709
710 thr->private = pt_ctx;
711 return (0);
712 }
713 /*
714 * HWT thread teardown hook.
715 */
716 static void
pt_backend_free_thread(struct hwt_thread * thr)717 pt_backend_free_thread(struct hwt_thread *thr)
718 {
719 struct pt_ctx *ctx;
720
721 ctx = (struct pt_ctx *)thr->private;
722
723 pt_deinit_ctx(ctx);
724 free(ctx, M_PT);
725 }
726
/*
 * HWT backend dump method. Intentionally a no-op: this backend has
 * nothing to dump for 'cpu_id'.
 */
static void
pt_backend_dump(int cpu_id)
{
}
731
732 static struct hwt_backend_ops pt_ops = {
733 .hwt_backend_init = pt_backend_init,
734 .hwt_backend_deinit = pt_backend_deinit,
735
736 .hwt_backend_configure = pt_backend_configure,
737
738 .hwt_backend_enable = pt_backend_enable,
739 .hwt_backend_disable = pt_backend_disable,
740
741 #ifdef SMP
742 .hwt_backend_enable_smp = pt_backend_enable_smp,
743 .hwt_backend_disable_smp = pt_backend_disable_smp,
744 #endif
745
746 .hwt_backend_read = pt_backend_read,
747 .hwt_backend_dump = pt_backend_dump,
748
749 .hwt_backend_thread_alloc = pt_backend_alloc_thread,
750 .hwt_backend_thread_free = pt_backend_free_thread,
751 };
752
753 static struct hwt_backend backend = {
754 .ops = &pt_ops,
755 .name = "pt",
756 .kva_req = 1,
757 };
758
759 /*
760 * Reads the latest valid trace buffer offset and enqueues
761 * a HWT_RECORD_BUFFER record.
762 * Used as a taskqueue routine from the ToPA interrupt handler.
763 */
764 static void
pt_send_buffer_record(void * arg,int pending __unused)765 pt_send_buffer_record(void *arg, int pending __unused)
766 {
767 struct hwt_record_entry record;
768 struct pt_ctx *ctx = (struct pt_ctx *)arg;
769
770 /* Prepare buffer record. */
771 mtx_lock_spin(&ctx->buf.lock);
772 pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
773 mtx_unlock_spin(&ctx->buf.lock);
774 hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
775 }
776 static void
pt_topa_status_clear(void)777 pt_topa_status_clear(void)
778 {
779 uint64_t reg;
780
781 reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
782 reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
783 reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
784 wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
785 }
786
787 /*
788 * ToPA PMI handler.
789 *
790 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
791 * Uses taskqueue to enqueue a buffer record for userspace.
792 * Re-enables the PC interrupt line as long as tracing is active.
793 */
794 static int
pt_topa_intr(struct trapframe * tf)795 pt_topa_intr(struct trapframe *tf)
796 {
797 struct pt_buffer *buf;
798 struct pt_ctx *ctx;
799 uint64_t reg;
800
801 SDT_PROBE0(pt, , , topa__intr);
802
803 if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
804 return (0);
805 }
806 reg = rdmsr(MSR_IA_GLOBAL_STATUS);
807 if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
808 /* ACK spurious or leftover interrupt. */
809 pt_topa_status_clear();
810 return (1);
811 }
812
813 ctx = pt_pcpu[curcpu].ctx;
814 buf = &ctx->buf;
815 KASSERT(buf->topa_hw != NULL,
816 ("%s: ToPA PMI interrupt with invalid buffer", __func__));
817
818 pt_cpu_toggle_local(ctx->save_area, false);
819 pt_update_buffer(buf);
820 pt_topa_status_clear();
821 taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
822 TASKQUEUE_FAIL_IF_PENDING);
823
824 if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
825 pt_cpu_toggle_local(ctx->save_area, true);
826 lapic_reenable_pcint();
827 }
828 return (1);
829 }
830
831 /*
832 * Module initialization.
833 *
834 * Saves all PT-related cpuid info, registers itself as a HWT backend,
835 * and allocates metadata required to keep track of tracing operations
836 * on each CPU.
837 */
838 static int
pt_init(void)839 pt_init(void)
840 {
841 u_int cp[4];
842 int error;
843
844 dprintf("pt: Enumerating part 1\n");
845 cpuid_count(CPUID_PT_LEAF, 0, cp);
846 dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
847 dprintf("pt: ebx %x\n", cp[1]);
848 dprintf("pt: ecx %x\n", cp[2]);
849
850 pt_info.l0_eax = cp[0];
851 pt_info.l0_ebx = cp[1];
852 pt_info.l0_ecx = cp[2];
853
854 dprintf("pt: Enumerating part 2\n");
855 cpuid_count(CPUID_PT_LEAF, 1, cp);
856 dprintf("pt: eax %x\n", cp[0]);
857 dprintf("pt: ebx %x\n", cp[1]);
858
859 pt_info.l1_eax = cp[0];
860 pt_info.l1_ebx = cp[1];
861
862 error = hwt_backend_register(&backend);
863 if (error != 0) {
864 printf("pt: unable to register hwt backend, error %d\n", error);
865 return (error);
866 }
867 pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
868 M_ZERO | M_WAITOK);
869 pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
870 M_ZERO | M_WAITOK);
871
872 nmi_register_handler(pt_topa_intr);
873 if (!lapic_enable_pcint()) {
874 nmi_remove_handler(pt_topa_intr);
875 hwt_backend_unregister(&backend);
876 free(pt_pcpu, M_PT);
877 free(pt_pcpu_ctx, M_PT);
878 pt_pcpu = NULL;
879 pt_pcpu_ctx = NULL;
880 printf("pt: failed to setup interrupt line\n");
881 return (error);
882 }
883 initialized = true;
884
885 return (0);
886 }
887
888 /*
889 * Checks whether the CPU support Intel PT and
890 * initializes XSAVE area info.
891 *
892 * The driver relies on XSAVE/XRSTOR PT extensions,
893 * Table of Physical Addresses (ToPA) support, and
894 * support for multiple ToPA entries.
895 */
896 static bool
pt_supported(void)897 pt_supported(void)
898 {
899 u_int cp[4];
900
901 if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
902 printf("pt: CPU does not support Intel Processor Trace\n");
903 return (false);
904 }
905 if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
906 printf("pt: XSAVE is not supported\n");
907 return (false);
908 }
909 if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
910 printf("pt: CPU does not support managing PT state using XSAVE\n");
911 return (false);
912 }
913 if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
914 printf("pt: XSAVE compaction is not supported\n");
915 return (false);
916 }
917 if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
918 printf("pt: CPU does not support XSAVES/XRSTORS\n");
919 return (false);
920 }
921
922 /* Require ToPA support. */
923 cpuid_count(CPUID_PT_LEAF, 0, cp);
924 if ((cp[2] & CPUPT_TOPA) == 0) {
925 printf("pt: ToPA is not supported\n");
926 return (false);
927 }
928 if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
929 printf("pt: multiple ToPA outputs are not supported\n");
930 return (false);
931 }
932
933 pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
934 pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
935 pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
936 XFEATURE_ENABLED_PT, true, true);
937
938 return (true);
939 }
940
941 static void
pt_deinit(void)942 pt_deinit(void)
943 {
944 if (!initialized)
945 return;
946 nmi_remove_handler(pt_topa_intr);
947 lapic_disable_pcint();
948 hwt_backend_unregister(&backend);
949 free(pt_pcpu, M_PT);
950 free(pt_pcpu_ctx, M_PT);
951 pt_pcpu = NULL;
952 initialized = false;
953 }
954
955 static int
pt_modevent(module_t mod,int type,void * data)956 pt_modevent(module_t mod, int type, void *data)
957 {
958 switch (type) {
959 case MOD_LOAD:
960 if (!pt_supported() || pt_init() != 0) {
961 return (ENXIO);
962 }
963 break;
964 case MOD_UNLOAD:
965 pt_deinit();
966 break;
967 default:
968 break;
969 }
970
971 return (0);
972 }
973
974 static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };
975
976 DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
977 MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
978 MODULE_VERSION(intel_pt, 1);
979