/*
 * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and stop tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt after
 *   it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 */

#include <sys/systm.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define	dprintf(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define	dprintf(fmt, ...)
#endif

#define	PT_SUPPORTED_FLAGS \
	(RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT | \
	    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define	PT_XSAVE_MASK		(XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define	PT_XSTATE_BV		(PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define	PT_MAX_IP_RANGES	2

#define	PT_TOPA_MASK_PTRS	0x7f
#define	PT_TOPA_PAGE_MASK	0xffffff80
#define	PT_TOPA_PAGE_SHIFT	7

#define	CPUID_PT_LEAF		0x14
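
/*
 * Illustrative summary of how the PT_TOPA_* masks above are used (see the
 * Intel SDM for the authoritative IA32_RTIT_OUTPUT_MASK_PTRS layout):
 *
 *	bits  6:0  - lower mask bits, always 0x7f when ToPA is used
 *	bits 31:7  - index of the current ToPA table entry
 *	bits 63:32 - output offset within the current ToPA region
 *
 * pt_update_buffer() recovers the current page and in-page offset as
 *
 *	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
 *	offset = reg >> 32;
 */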

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

SDT_PROVIDER_DEFINE(pt);
SDT_PROBE_DEFINE(pt, , , topa__intr);

TASKQUEUE_FAST_DEFINE_THREAD(pt);

static void pt_send_buffer_record(void *arg, int pending __unused);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
	uint64_t rtit_ctl;
	uint64_t rtit_output_base;
	uint64_t rtit_output_mask_ptrs;
	uint64_t rtit_status;
	uint64_t rtit_cr3_match;
	uint64_t rtit_addr0_a;
	uint64_t rtit_addr0_b;
	uint64_t rtit_addr1_a;
	uint64_t rtit_addr1_b;
};

struct pt_buffer {
	uint64_t *topa_hw;	/* ToPA table entries. */
	size_t size;
	struct mtx lock;	/* Lock for fields below. */
	vm_offset_t offset;
	uint64_t wrap_count;
	int curpage;
};

struct pt_ctx {
	int id;
	struct pt_buffer buf;	/* ToPA buffer metadata */
	struct task task;	/* ToPA buffer notification task */
	struct hwt_context *hwt_ctx;
	uint8_t *save_area;	/* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state {
	PT_DISABLED = 0,
	PT_STOPPED,
	PT_ACTIVE
};

static struct pt_cpu {
	struct pt_ctx *ctx;	 /* active PT tracing context */
	enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
	uint32_t l0_eax;
	uint32_t l0_ebx;
	uint32_t l0_ecx;
	uint32_t l1_eax;
	uint32_t l1_ebx;
	size_t xsave_area_size;
	size_t xstate_hdr_offset;
	size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
	return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
	atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
	return ((struct xstate_hdr *)(ctx->save_area +
	    pt_info.xstate_hdr_offset));
}

static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
	return ((struct pt_ext_area *)(ctx->save_area +
	    pt_info.pt_xsave_offset));
}
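
/*
 * A sketch of the per-context 'save_area' layout assumed by the accessors
 * above (compacted XSAVE format; the offsets are computed from CPUID at
 * module load, the layout below is only an illustration):
 *
 *	save_area + 0			legacy XSAVE region (x87/SSE)
 *	save_area + xstate_hdr_offset	struct xstate_hdr (XSTATE_BV, XCOMP_BV)
 *	save_area + pt_xsave_offset	struct pt_ext_area (PT MSR state)
 *
 * pt_ctx_get_xstate_hdr() and pt_ctx_get_ext_area() simply add the cached
 * pt_info offsets to the context's save area pointer.
 */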

/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. Records if the trace buffer wrapped.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
	uint64_t reg;
	int curpage;

	/* Update buffer offset. */
	reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
	curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
	mtx_lock_spin(&buf->lock);
	/* Check if the output wrapped. */
	if (buf->curpage > curpage)
		buf->wrap_count++;
	buf->curpage = curpage;
	buf->offset = reg >> 32;
	mtx_unlock_spin(&buf->lock);

	dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
	    buf->wrap_count, buf->curpage, buf->offset);
}

static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
	rec->record_type = HWT_RECORD_BUFFER;
	rec->buf_id = id;
	rec->curpage = buf->curpage;
	rec->offset = buf->offset + (buf->wrap_count * buf->size);
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
	u_long xcr0, cr0;
	u_long xss;

	cr0 = rcr0();
	if (cr0 & CR0_TS)
		clts();
	xcr0 = rxcr(XCR0);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
	xss = rdmsr(MSR_IA32_XSS);
	wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

	if (!enable) {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
		    ("%s: PT is disabled", __func__));
		xsaves(save_area, XFEATURE_ENABLED_PT);
	} else {
		KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
		    ("%s: PT is enabled", __func__));
		xrstors(save_area, XFEATURE_ENABLED_PT);
	}
	wrmsr(MSR_IA32_XSS, xss);
	if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
		load_xcr(XCR0, xcr0);
	if (cr0 & CR0_TS)
		load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
	struct pt_cpu *cpu;

	cpu = &pt_pcpu[curcpu];
	MPASS(cpu->ctx != NULL);

	dprintf("%s: curcpu %d\n", __func__, curcpu);
	load_cr4(rcr4() | CR4_XSAVE);
	wrmsr(MSR_IA32_RTIT_STATUS, 0);
	pt_cpu_set_state(curcpu, PT_ACTIVE);
	pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
	struct pt_cpu *cpu;
	struct pt_ctx *ctx;

	/* Shutdown may occur before PT gets properly configured. */
	if (pt_cpu_get_state(curcpu) == PT_DISABLED)
		return;

	cpu = &pt_pcpu[curcpu];
	ctx = cpu->ctx;
	MPASS(ctx != NULL);
	dprintf("%s: curcpu %d\n", __func__, curcpu);

	pt_cpu_set_state(curcpu, PT_STOPPED);
	pt_cpu_toggle_local(cpu->ctx->save_area, false);
	pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
	struct pt_buffer *buf;
	size_t topa_size;
	int i;

	topa_size = TOPA_SIZE_4K;
	buf = &ctx->buf;

	KASSERT(buf->topa_hw == NULL,
	    ("%s: ToPA info already exists", __func__));
	buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
	    M_ZERO | M_WAITOK);
	dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
	buf->size = vm->npages * PAGE_SIZE;
	for (i = 0; i < vm->npages; i++) {
		buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
		/*
		 * XXX: TOPA_INT should ideally be set according to
		 * expected amount of incoming trace data. Too few TOPA_INT
		 * entries will not trigger interrupts often enough when
		 * tracing smaller functions.
		 */
		buf->topa_hw[i] |= TOPA_INT;
	}
	buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

	return (0);
}
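
/*
 * For illustration only: with a hypothetical 4-page (16 KB) HWT buffer,
 * pt_topa_prepare() above would build a 5-entry table along these lines:
 *
 *	topa_hw[0] = phys(page 0) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[1] = phys(page 1) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[2] = phys(page 2) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[3] = phys(page 3) | TOPA_SIZE_4K | TOPA_INT
 *	topa_hw[4] = phys(topa_hw) | TOPA_END
 *
 * The final TOPA_END entry redirects the hardware back to the start of the
 * table, which is what makes the output buffer circular.
 */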

/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
	struct pt_ext_area *pt_ext;
	int nranges_supp, n, error = 0;

	pt_ext = pt_ctx_get_ext_area(ctx);
	if (pt_info.l0_ebx & CPUPT_IPF) {
		nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
		    CPUPT_NADDR_S;

		if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
			nranges_supp = PT_IP_FILTER_MAX_RANGES;
		n = cfg->nranges;
		if (n > nranges_supp) {
			printf("%s: %d IP filtering ranges requested, CPU "
			    "supports %d, truncating\n",
			    __func__, n, nranges_supp);
			n = nranges_supp;
		}

		switch (n) {
		case 2:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
			pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
			pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
			/* FALLTHROUGH */
		case 1:
			pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
			pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
			pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
			break;
		default:
			error = (EINVAL);
			break;
		}
	} else
		error = (ENXIO);

	return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

	dprintf("%s: ctx id %d\n", __func__, ctx_id);

	KASSERT(pt_ctx->buf.topa_hw == NULL,
	    ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

	memset(pt_ctx, 0, sizeof(struct pt_ctx));
	mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
	pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
	    M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx->save_area == NULL)
		return (ENOMEM);
	dprintf("%s: preparing ToPA buffer\n", __func__);
	if (pt_topa_prepare(pt_ctx, vm) != 0) {
		dprintf("%s: failed to prepare ToPA buffer\n", __func__);
		free(pt_ctx->save_area, M_PT);
		return (ENOMEM);
	}

	pt_ctx->id = ctx_id;
	TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);

	return (0);
}

static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

	if (pt_ctx->buf.topa_hw != NULL)
		free(pt_ctx->buf.topa_hw, M_PT);
	if (pt_ctx->save_area != NULL)
		free(pt_ctx->save_area, M_PT);
	memset(pt_ctx, 0, sizeof(*pt_ctx));
	pt_ctx->buf.topa_hw = NULL;
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
	struct hwt_cpu *hwt_cpu;
	struct hwt_thread *thr;
	struct pt_ctx *pt_ctx;
	struct pt_cpu_config *cfg;
	struct pt_ext_area *pt_ext;
	struct xstate_hdr *hdr;
	int error;

	dprintf("%s\n", __func__);

	cfg = (struct pt_cpu_config *)ctx->config;
	pt_ctx = NULL;

	/* Clear any flags we don't support yet. */
	cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
	if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
		if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
			printf("%s: CPU does not support generating MTC "
			    "packets\n", __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
		if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
			printf("%s: CPU does not support CR3 filtering\n",
			    __func__);
			return (ENXIO);
		}
	}

	if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
		if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
			printf("%s: CPU does not support TNT\n", __func__);
			return (ENXIO);
		}
	}
	/* TODO: support for more config bits. */

	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			if (hwt_cpu->cpu_id != cpu_id)
				continue;
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			break;
		}
	} else {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			if (thr->thread_id != thread_id)
				continue;
			KASSERT(thr->private != NULL,
			    ("%s: hwt thread private not set, thr %p",
			    __func__, thr));
			pt_ctx = (struct pt_ctx *)thr->private;
			break;
		}
	}
	if (pt_ctx == NULL)
		return (ENOENT);

	dprintf("%s: preparing MSRs\n", __func__);
	pt_ext = pt_ctx_get_ext_area(pt_ctx);
	hdr = pt_ctx_get_xstate_hdr(pt_ctx);

	pt_ext->rtit_ctl |= cfg->rtit_ctl;
	if (cfg->nranges != 0) {
		dprintf("%s: preparing IPF ranges\n", __func__);
		if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
			return (error);
	}
	pt_ctx->hwt_ctx = ctx;
	pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
	pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
	pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
	hdr->xstate_bv = XFEATURE_ENABLED_PT;
	hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
	    XSTATE_XCOMP_BV_COMPACT;
	pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
	pt_pcpu[cpu_id].ctx = pt_ctx;
	pt_cpu_set_state(cpu_id, PT_STOPPED);

	return (0);
}
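
/*
 * A hypothetical configuration, for illustration only (field names follow
 * the usage in pt_backend_configure() and pt_configure_ranges() above; the
 * values are made up):
 *
 *	struct pt_cpu_config cfg = {
 *		.rtit_ctl = RTIT_CTL_USER | RTIT_CTL_BRANCHEN,
 *		.nranges = 1,
 *		.ip_ranges[0] = { .start = 0x400000, .end = 0x500000 },
 *	};
 *
 * pt_backend_configure() would merge these bits into the context's saved
 * RTIT_CTL image and program one IP filter range via pt_configure_ranges().
 */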

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to start PT on another cpu", __func__));
	pt_cpu_start(NULL);
	CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
	struct pt_cpu *cpu;

	if (ctx->mode == HWT_MODE_CPU)
		return;

	KASSERT(curcpu == cpu_id,
	    ("%s: attempting to disable PT on another cpu", __func__));
	pt_cpu_stop(NULL);
	CPU_CLR(cpu_id, &ctx->cpu_map);
	cpu = &pt_pcpu[cpu_id];
	cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 1) != 0)
		return (-1);

	KASSERT(ctx->mode == HWT_MODE_CPU,
	    ("%s: should only be used for CPU mode", __func__));
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

	return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU &&
	    atomic_swap_32(&cpu_mode_ctr, 0) == 0)
		return (-1);

	if (CPU_EMPTY(&ctx->cpu_map)) {
		dprintf("%s: empty cpu map\n", __func__);
		return (-1);
	}
	smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

	return (0);
}

/*
 * HWT backend initialization method.
 *
 * Initializes the tracing contexts used for HWT_MODE_CPU; the ToPA
 * interrupt handler itself is installed at module load time in pt_init().
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
	struct hwt_cpu *hwt_cpu;
	int error;

	dprintf("%s\n", __func__);
	if (ctx->mode == HWT_MODE_CPU) {
		TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
			error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
			    hwt_cpu->vm, hwt_cpu->cpu_id);
			if (error)
				return (error);
		}
	}

	return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated ToPA metadata.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
	struct pt_ctx *pt_ctx;
	struct hwt_thread *thr;
	int cpu_id;

	dprintf("%s\n", __func__);

	pt_backend_disable_smp(ctx);
	if (ctx->mode == HWT_MODE_THREAD) {
		TAILQ_FOREACH(thr, &ctx->threads, next) {
			KASSERT(thr->private != NULL,
			    ("%s: thr->private not set", __func__));
			pt_ctx = (struct pt_ctx *)thr->private;
			pt_deinit_ctx(pt_ctx);
		}
	} else {
		CPU_FOREACH(cpu_id) {
			if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
				continue;
			if (pt_pcpu[cpu_id].ctx != NULL) {
				KASSERT(pt_pcpu[cpu_id].ctx ==
				    &pt_pcpu_ctx[cpu_id],
				    ("%s: CPU mode tracing with non-cpu mode "
				    "PT context active", __func__));
				pt_pcpu[cpu_id].ctx = NULL;
			}
			pt_ctx = &pt_pcpu_ctx[cpu_id];
			pt_deinit_ctx(pt_ctx);
			memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
		}
	}

	return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
	struct pt_buffer *buf;

	if (vm->ctx->mode == HWT_MODE_THREAD)
		buf = &((struct pt_ctx *)vm->thr->private)->buf;
	else
		buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
	mtx_lock_spin(&buf->lock);
	*curpage = buf->curpage;
	*curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
	mtx_unlock_spin(&buf->lock);

	return (0);
}
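
/*
 * Worked example (illustrative numbers only): with a 16 MB trace buffer
 * (vm->ctx->bufsize == 0x1000000), wrap_count == 2, and a current in-page
 * offset of 0x123, pt_backend_read() reports
 *
 *	*curpage_offset = 0x123 + 2 * 0x1000000 = 0x2000123
 *
 * Together with *curpage, this lets the userspace backend account for
 * buffer wrap-arounds when deciding how much new trace data to decode.
 */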

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
	struct pt_ctx *pt_ctx;
	int error;

	/*
	 * Omit M_WAITOK since this might get invoked from a
	 * non-sleepable context.
	 */
	pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
	if (pt_ctx == NULL)
		return (ENOMEM);

	error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
	if (error) {
		free(pt_ctx, M_PT);
		return (error);
	}

	thr->private = pt_ctx;
	return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
	struct pt_ctx *ctx;

	ctx = (struct pt_ctx *)thr->private;

	pt_deinit_ctx(ctx);
	free(ctx, M_PT);
}

static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
	.hwt_backend_init = pt_backend_init,
	.hwt_backend_deinit = pt_backend_deinit,

	.hwt_backend_configure = pt_backend_configure,

	.hwt_backend_enable = pt_backend_enable,
	.hwt_backend_disable = pt_backend_disable,

#ifdef SMP
	.hwt_backend_enable_smp = pt_backend_enable_smp,
	.hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

	.hwt_backend_read = pt_backend_read,
	.hwt_backend_dump = pt_backend_dump,

	.hwt_backend_thread_alloc = pt_backend_alloc_thread,
	.hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
	.ops = &pt_ops,
	.name = "pt",
	.kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg, int pending __unused)
{
	struct hwt_record_entry record;
	struct pt_ctx *ctx = (struct pt_ctx *)arg;

	/* Prepare buffer record. */
	mtx_lock_spin(&ctx->buf.lock);
	pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
	mtx_unlock_spin(&ctx->buf.lock);
	hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
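
/*
 * Note (descriptive only): pt_topa_intr() below runs as an NMI handler,
 * where taking regular mutexes or allocating memory is not safe. This is
 * presumably why the buffer record above is built and submitted from the
 * 'pt' taskqueue rather than directly from interrupt context.
 */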

static void
pt_topa_status_clear(void)
{
	uint64_t reg;

	reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
	reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
	wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Uses taskqueue to enqueue a buffer record for userspace.
 * Re-enables the performance counter interrupt line as long as
 * tracing is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
	struct pt_buffer *buf;
	struct pt_ctx *ctx;
	uint64_t reg;

	SDT_PROBE0(pt, , , topa__intr);

	if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
		return (0);
	}
	reg = rdmsr(MSR_IA_GLOBAL_STATUS);
	if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
		/* ACK spurious or leftover interrupt. */
		pt_topa_status_clear();
		return (1);
	}

	ctx = pt_pcpu[curcpu].ctx;
	buf = &ctx->buf;
	KASSERT(buf->topa_hw != NULL,
	    ("%s: ToPA PMI interrupt with invalid buffer", __func__));

	pt_cpu_toggle_local(ctx->save_area, false);
	pt_update_buffer(buf);
	pt_topa_status_clear();
	taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
	    TASKQUEUE_FAIL_IF_PENDING);

	if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
		pt_cpu_toggle_local(ctx->save_area, true);
		lapic_reenable_pcint();
	}
	return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
	u_int cp[4];
	int error;

	dprintf("pt: Enumerating part 1\n");
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);
	dprintf("pt: ecx %x\n", cp[2]);

	pt_info.l0_eax = cp[0];
	pt_info.l0_ebx = cp[1];
	pt_info.l0_ecx = cp[2];

	dprintf("pt: Enumerating part 2\n");
	cpuid_count(CPUID_PT_LEAF, 1, cp);
	dprintf("pt: eax %x\n", cp[0]);
	dprintf("pt: ebx %x\n", cp[1]);

	pt_info.l1_eax = cp[0];
	pt_info.l1_ebx = cp[1];

	error = hwt_backend_register(&backend);
	if (error != 0) {
		printf("pt: unable to register hwt backend, error %d\n", error);
		return (error);
	}
	pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
	    M_ZERO | M_WAITOK);
	pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
	    M_ZERO | M_WAITOK);

	nmi_register_handler(pt_topa_intr);
	if (!lapic_enable_pcint()) {
		nmi_remove_handler(pt_topa_intr);
		hwt_backend_unregister(&backend);
		free(pt_pcpu, M_PT);
		free(pt_pcpu_ctx, M_PT);
		pt_pcpu = NULL;
		pt_pcpu_ctx = NULL;
		printf("pt: failed to setup interrupt line\n");
		return (ENXIO);
	}
	initialized = true;

	return (0);
}

/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
	u_int cp[4];

	if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
		printf("pt: CPU does not support Intel Processor Trace\n");
		return (false);
	}
	if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
		printf("pt: XSAVE is not supported\n");
		return (false);
	}
	if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
		printf("pt: CPU does not support managing PT state using XSAVE\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
		printf("pt: XSAVE compaction is not supported\n");
		return (false);
	}
	if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
		printf("pt: CPU does not support XSAVES/XRSTORS\n");
		return (false);
	}

	/* Require ToPA support. */
	cpuid_count(CPUID_PT_LEAF, 0, cp);
	if ((cp[2] & CPUPT_TOPA) == 0) {
		printf("pt: ToPA is not supported\n");
		return (false);
	}
	if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
		printf("pt: multiple ToPA outputs are not supported\n");
		return (false);
	}

	pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
	pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
	pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
	    XFEATURE_ENABLED_PT, true, true);

	return (true);
}

static void
pt_deinit(void)
{
	if (!initialized)
		return;
	nmi_remove_handler(pt_topa_intr);
	lapic_disable_pcint();
	hwt_backend_unregister(&backend);
	free(pt_pcpu, M_PT);
	free(pt_pcpu_ctx, M_PT);
	pt_pcpu = NULL;
	pt_pcpu_ctx = NULL;
	initialized = false;
}

static int
pt_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		if (!pt_supported() || pt_init() != 0) {
			return (ENXIO);
		}
		break;
	case MOD_UNLOAD:
		pt_deinit();
		break;
	default:
		break;
	}

	return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);