/*
 * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and stop tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
 *   each traced CPU core or thread. Upon initialization, a ToPA configuration
 *   is generated for each 'pt_ctx' structure using the HWT tracing buffers.
 *   The HWT tracing buffer is split into 4K ToPA entries. Currently, each
 *   4K ToPA entry is configured to trigger an interrupt after it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *   A rough mapping of these steps onto the driver's hwt(4) entry points is
 *   sketched below.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 *
 */
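
/*
 * Rough mapping of the steps above onto the hwt(4) backend hooks implemented
 * in this file (informational summary only):
 *
 *  - pt_backend_init() / pt_backend_configure(): prepare the 'pt_ctx', build
 *    the ToPA table and stage the RTIT_* register state in the context's
 *    XSAVE area (thread contexts are allocated in pt_backend_alloc_thread()).
 *  - pt_backend_enable() / pt_backend_enable_smp(): load the staged state
 *    with XRSTORS on the target CPU(s), which starts packet generation.
 *  - pt_topa_intr() -> pt_send_buffer_record(): the ToPA PMI handler saves
 *    the current buffer offset and schedules a software interrupt that
 *    enqueues a HWT_RECORD_BUFFER record for userspace.
 *  - pt_backend_read() / pt_backend_disable() / pt_backend_deinit(): report
 *    the final offsets, stop tracing and release per-context resources.
 */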

#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/hwt.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif
#define PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define PT_MAX_IP_RANGES 2

#define PT_TOPA_MASK_PTRS 0x7f
#define PT_TOPA_PAGE_MASK 0xffffff80
#define PT_TOPA_PAGE_SHIFT 7
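
/*
 * These masks decode MSR_IA32_RTIT_OUTPUT_MASK_PTRS when ToPA output is in
 * use: bits 6:0 hold the lower mask pointers (always 0x7f for ToPA),
 * bits 31:7 identify the current ToPA table entry (its index within the
 * table), and bits 63:32 hold the byte offset into that entry's output
 * region; see pt_update_buffer().
 */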

#define CPUID_PT_LEAF	0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};
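
/*
 * Note: the field order above matches the layout of the PT state component
 * saved and restored by XSAVES/XRSTORS, so the structure can be overlaid
 * directly onto the save area at pt_info.pt_xsave_offset.
 */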

struct pt_buffer {
	uint64_t *topa_hw; /* ToPA table entries. */
	size_t size;
	struct mtx lock; /* Lock for fields below. */
	vm_offset_t offset;
};

struct pt_ctx {
	int id;
	struct pt_buffer buf; /* ToPA buffer metadata */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area; /* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };

static struct pt_cpu {
	struct pt_ctx *ctx;	 /* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
	void *swi_cookie;	 /* Software interrupt handler context */
	int in_pcint_handler;
} *pt_pcpu;

/*
 * PT-related CPUID and XSAVE area information.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}

/*
 * Updates the current trace buffer offset from the ToPA MSRs.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	uint64_t offset;

	/* Update buffer offset. */
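	/*
	 * With ToPA output, bits 31:7 of the mask pointers MSR hold the
	 * index of the current ToPA entry and bits 63:32 hold the byte
	 * offset within that entry's 4K output region.  Illustrative
	 * example (values hypothetical): an entry index of 3 and an
	 * output offset of 0x120 decode to 3 * PAGE_SIZE + 0x120 bytes
	 * into the trace buffer.
	 */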
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
	offset += (reg >> 32);

	atomic_store_rel_64(&buf->offset, offset);
}

static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	vm_offset_t offset;

	offset = atomic_load_acq_64(&buf->offset);

	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = offset / PAGE_SIZE;
	rec->offset = offset & PAGE_MASK;
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
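/*
 * Note: XSAVES saves the PT state component and then clears
 * IA32_RTIT_CTL.TraceEn, while XRSTORS reloads TraceEn from the save area,
 * so these two instructions are what actually pause and resume packet
 * generation here.
 */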
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	/* Shutdown may occur before PT gets properly configured. */
	if (ctx == NULL) {
		dprintf("%s: missing context on cpu %d; bailing\n", __func__,
		    curcpu);
		return;
	}
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when tracing
		 * smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
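	/*
	 * The final entry is an END entry that points back at the physical
	 * address of the table itself, so the output hardware wraps around
	 * to the first entry once the last 4K region has been filled.
	 */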
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}

/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			       "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

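		/*
		 * Illustrative mapping (for example, with cfg->nranges == 2):
		 * ip_ranges[1] is loaded into the ADDR1_A/ADDR1_B pair and
		 * ip_ranges[0] into ADDR0_A/ADDR0_B; setting the ADDRn_CFG
		 * field in RTIT_CTL to 1 enables instruction-pointer
		 * filtering on that range.  The fall-through from case 2 to
		 * case 1 below is intentional.
		 */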
		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
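			/* FALLTHROUGH */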
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = (EINVAL);
			break;
		}
	} else
		error = (ENXIO);

	return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;

	return (0);
}

static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
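	/*
	 * Advertise only the PT state component in the XSAVE header and mark
	 * the area as compacted: XRSTORS requires the compacted format and
	 * will load the staged RTIT_* values above into the MSRs when
	 * tracing is enabled.
	 */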
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;

	return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;
	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));

	cpu = &pt_pcpu[cpu_id];

	dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
	    cpu_id);
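	/*
	 * Mark the CPU inactive before stopping the trace so that a
	 * concurrent ToPA PMI will neither re-enable tracing nor reschedule
	 * the software interrupt; the loop below then synchronizes with a
	 * handler that may still be running on this CPU.
	 */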
	pt_cpu_set_state(cpu_id, PT_INACTIVE);
	while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
		;

	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
	dprintf("%s\n", __func__);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
	struct pt_cpu *cpu;
	int cpu_id;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		cpu = &pt_pcpu[cpu_id];
		dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
		    __func__, cpu_id);
		pt_cpu_set_state(cpu_id, PT_INACTIVE);
		while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
			;
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Initializes the per-CPU tracing contexts used for HWT_MODE_CPU.
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode != HWT_MODE_CPU)
		return (0);
	TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
		error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
		    hwt_cpu->cpu_id);
		if (error)
			return (error);
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated tracing contexts and ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
			if (pt_pcpu[cpu_id].ctx == NULL)
				continue;
			KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
			    ("%s: CPU mode tracing with non-CPU mode PT "
			     "context active", __func__));
			pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
			pt_pcpu[cpu_id].ctx = NULL;
			atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;
	uint64_t offset;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	offset = atomic_load_acq_64(&buf->offset);
	*curpage = offset / PAGE_SIZE;
	*curpage_offset = offset & PAGE_MASK;

	return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/*
	 * Omit M_WAITOK since this might get invoked in a non-sleepable
	 * context.
	 */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Scheduled as a software interrupt from the ToPA PMI handler.
 */
static void
pt_send_buffer_record(void *arg)
{
	struct pt_cpu *cpu = (struct pt_cpu *)arg;
	struct pt_ctx *ctx = cpu->ctx;
	struct hwt_record_entry record;

	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
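
/*
 * Clears the ToPA PMI bit in the global status register.  The status-reset
 * MSR uses write-1-to-clear semantics, which is why the flag is set in the
 * value written back.
 */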
static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Schedules a software interrupt to enqueue a buffer record for userspace.
 * Re-enables the LAPIC performance counter interrupt line as long as
 * tracing is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;
	uint64_t reg;

	cpu = &pt_pcpu[curcpu];
	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		pt_topa_status_clear();
		return (0);
	}

	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
		return (1);
	}
	atomic_set_int(&cpu->in_pcint_handler, 1);

	ctx = cpu->ctx;
	KASSERT(ctx != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt without an active context",
		__func__, curcpu));
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
		curcpu));
	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		swi_sched(cpu->swi_cookie, SWI_FROMNMI);
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	atomic_set_int(&cpu->in_pcint_handler, 0);
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as an HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error, i;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n", error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	for (i = 0; i < mp_ncpus; i++) {
		error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
		    &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
		    &pt_pcpu[i].swi_cookie);
		if (error != 0) {
			dprintf(
			    "%s: failed to add software interrupt handler, error %d\n",
			    __func__, error);
			goto err;
		}
	}

	nmi_register_handler(pt_topa_intr);
	if (lapic_enable_pcint()) {
		initialized = true;
		return (0);
	} else {
		printf("pt: failed to set up interrupt line\n");
		error = ENXIO;
	}
err:
	nmi_remove_handler(pt_topa_intr);
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		if (pt_pcpu[i].swi_cookie != NULL)
			swi_remove(pt_pcpu[i].swi_cookie);
	}
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;

	return (error);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on the XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

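	/*
	 * Size a compacted XSAVE area holding the legacy x87/SSE state plus
	 * the PT state component, and record the offsets of the XSAVE header
	 * and of the PT component within it; pt_init_ctx() allocates one such
	 * area per tracing context.
	 */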
	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

static void
pt_deinit(void)
{
	int i;
	struct pt_cpu *cpu;

	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		cpu = &pt_pcpu[i];
		swi_remove(cpu->swi_cookie);
	}

	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);