/*
 * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and disable tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt after
 *   it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a ToPA entry with TOPA_INT set is filled, it will
 *   trigger an interrupt before continuing. The interrupt handler will then
 *   fetch the last valid tracing buffer offset and enqueue a
 *   HWT_RECORD_BUFFER record. The driver is currently configured to use the
 *   NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 *
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/hwt.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif
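
/*
 * RTIT_CTL bits the driver currently supports; any other bits are
 * cleared from the user-supplied configuration.
 */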
#define PT_SUPPORTED_FLAGS						\
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |	\
	 RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define PT_XSAVE_MASK		(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define PT_XSTATE_BV		(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define PT_MAX_IP_RANGES	2

#define PT_TOPA_MASK_PTRS	0x7f
#define PT_TOPA_PAGE_MASK	0xffffff80
#define PT_TOPA_PAGE_SHIFT	7

#define CPUID_PT_LEAF		0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};

struct pt_buffer {
	uint64_t *topa_hw;	/* ToPA table entries. */
	size_t size;
	struct mtx lock;	/* Lock for fields below. */
	vm_offset_t offset;
};

struct pt_ctx {
	int id;
	struct pt_buffer buf;	/* ToPA buffer metadata */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area;	/* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };

static struct pt_cpu {
	struct pt_ctx *ctx;	/* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
	void *swi_cookie;	/* Software interrupt handler context */
	int in_pcint_handler;
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}

/*
 * Updates current trace buffer offset from the ToPA MSRs.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	uint64_t offset;

	/* Update buffer offset. */
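	/*
	 * MSR_IA32_RTIT_OUTPUT_MASK_PTRS holds the index of the current
	 * ToPA table entry in bits 31:7 and the offset within that entry's
	 * 4K region in bits 63:32; combined they give the absolute offset
	 * into the trace buffer.
	 */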
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
	offset += (reg >> 32);

	atomic_store_rel_64(&buf->offset, offset);
}

static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	vm_offset_t offset;

	offset = atomic_load_acq_64(&buf->offset);

	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = offset / PAGE_SIZE;
	rec->offset = offset & PAGE_MASK;
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	/* Shutdown may occur before PT gets properly configured. */
	if (ctx == NULL) {
		dprintf("%s: missing context on cpu %d; bailing\n", __func__,
		    curcpu);
		return;
	}
	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
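 *
 * The ToPA table itself is an array of 'vm->npages + 1' 64-bit entries:
 * one entry per trace buffer page, plus a final TOPA_END entry that
 * points back to the physical address of the table.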
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when
		 * tracing smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}

/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			    "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
			/* FALLTHROUGH */
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = (EINVAL);
			break;
		}
	} else
		error = (ENXIO);

	return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;

	return (0);
}

static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
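 *
 * The requested RTIT_CTL bits, ToPA output base and mask pointers, and
 * any IP filter ranges are staged in the context's XSAVE area and are
 * loaded into hardware by XRSTORS when tracing starts.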
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support disabling TNT "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;

	return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;
	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));

	cpu = &pt_pcpu[cpu_id];

	dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
	    cpu_id);
	pt_cpu_set_state(cpu_id, PT_INACTIVE);
	while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
		;

	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
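 *
 * Runs 'pt_cpu_start' on every CPU in the context's cpu map using
 * 'smp_rendezvous_cpus'; 'cpu_mode_ctr' ensures that CPU-mode tracing
 * is started at most once.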
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
	dprintf("%s\n", __func__);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
	struct pt_cpu *cpu;
	int cpu_id;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		cpu = &pt_pcpu[cpu_id];
		dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
		    __func__, cpu_id);
		pt_cpu_set_state(cpu_id, PT_INACTIVE);
		while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
			;
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Installs the ToPA interrupt handler and initializes
 * the tracing contexts used for HWT_MODE_CPU.
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode != HWT_MODE_CPU)
		return (0);
	TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
		error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
		    hwt_cpu->cpu_id);
		if (error)
			return (error);
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Removes the ToPA interrupt handler, stops tracing on all active CPUs,
 * and releases all previously allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
			if (pt_pcpu[cpu_id].ctx == NULL)
				continue;
			KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
			    ("%s: CPU mode tracing with non-cpu mode PT "
			    "context active", __func__));
			pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
			pt_pcpu[cpu_id].ctx = NULL;
			atomic_store_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;
	uint64_t offset;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	offset = atomic_load_acq_64(&buf->offset);
	*curpage = offset / PAGE_SIZE;
	*curpage_offset = offset & PAGE_MASK;

	return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
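 * The allocated context is released by pt_backend_free_thread when the
 * thread is torn down.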
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/* Omit M_WAITOK since this might get invoked in a non-sleepable context. */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Runs as a software interrupt (swi) handler scheduled from the ToPA
 * PMI handler.
 */
static void
pt_send_buffer_record(void *arg)
{
	struct pt_cpu *cpu = (struct pt_cpu *)arg;
	struct pt_ctx *ctx = cpu->ctx;
	struct hwt_record_entry record;

	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}

static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Schedules a software interrupt to enqueue a buffer record for
 * userspace.
 * Re-enables the PC interrupt line as long as tracing is active.
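 *
 * Runs in NMI context: tracing is paused with XSAVES while the buffer
 * offset is captured and resumed with XRSTORS before the handler returns.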
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;
	uint64_t reg;

	cpu = &pt_pcpu[curcpu];
	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		pt_topa_status_clear();
		return (0);
	}

	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
		return (1);
	}
	atomic_set_int(&cpu->in_pcint_handler, 1);

	ctx = cpu->ctx;
	KASSERT(ctx != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt without an active context",
	    __func__, curcpu));
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
	    curcpu));
	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		swi_sched(cpu->swi_cookie, SWI_FROMNMI);
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	atomic_store_int(&cpu->in_pcint_handler, 0);
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error, i;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n",
		    error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	for (i = 0; i < mp_ncpus; i++) {
		error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
		    &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
		    &pt_pcpu[i].swi_cookie);
		if (error != 0) {
			dprintf(
			    "%s: failed to add swi handler for cpu %d, error %d\n",
			    __func__, i, error);
			goto err;
		}
	}

	nmi_register_handler(pt_topa_intr);
	if (lapic_enable_pcint()) {
		initialized = true;
		return (0);
	}
	printf("pt: failed to setup interrupt line\n");
	error = ENXIO;
err:
	nmi_remove_handler(pt_topa_intr);
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		if (pt_pcpu[i].swi_cookie != NULL)
			swi_remove(pt_pcpu[i].swi_cookie);
	}
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;

	return (error);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
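 *
 * The computed XSAVE area size and offsets are cached in 'pt_info' and
 * later used to allocate per-context save areas.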
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

static void
pt_deinit(void)
{
	int i;
	struct pt_cpu *cpu;

	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);

	for (i = 0; i < mp_ncpus; i++) {
		cpu = &pt_pcpu[i];
		swi_remove(cpu->swi_cookie);
	}

	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);