// SPDX-License-Identifier: GPL-2.0-only
/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/bits.h>
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/sysfs.h>

#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
#include <asm/cpu_device_id.h>

#include "../perf_event.h"
#include "pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

/*
 * Capabilities of Intel PT hardware, such as number of address bits or
 * supported output schemes, are cached and exported to userspace as "caps"
 * attribute group of pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
 * relevant bits together with intel_pt traces.
 *
 * These are necessary for both trace decoding (payloads_lip, contains address
 * width encoded in IP-related packets), and event configuration (bitmasks with
 * permitted values for certain bit fields).
45 */ 46 #define PT_CAP(_n, _l, _r, _m) \ 47 [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ 48 .reg = _r, .mask = _m } 49 50 static struct pt_cap_desc { 51 const char *name; 52 u32 leaf; 53 u8 reg; 54 u32 mask; 55 } pt_caps[] = { 56 PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff), 57 PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)), 58 PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)), 59 PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)), 60 PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), 61 PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), 62 PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), 63 PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)), 64 PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)), 65 PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), 66 PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), 67 PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), 68 PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), 69 PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), 70 PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7), 71 PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), 72 PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), 73 PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), 74 }; 75 76 u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) 77 { 78 struct pt_cap_desc *cd = &pt_caps[capability]; 79 u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; 80 unsigned int shift = __ffs(cd->mask); 81 82 return (c & cd->mask) >> shift; 83 } 84 EXPORT_SYMBOL_GPL(intel_pt_validate_cap); 85 86 u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) 87 { 88 return intel_pt_validate_cap(pt_pmu.caps, cap); 89 } 90 EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); 91 92 static ssize_t pt_cap_show(struct device *cdev, 93 struct device_attribute *attr, 94 char *buf) 95 { 96 struct dev_ext_attribute *ea = 97 container_of(attr, struct dev_ext_attribute, attr); 98 enum pt_capabilities cap = (long)ea->var; 99 100 return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap)); 101 } 102 103 static struct attribute_group pt_cap_group __ro_after_init = { 104 
.name = "caps", 105 }; 106 107 PMU_FORMAT_ATTR(pt, "config:0" ); 108 PMU_FORMAT_ATTR(cyc, "config:1" ); 109 PMU_FORMAT_ATTR(pwr_evt, "config:4" ); 110 PMU_FORMAT_ATTR(fup_on_ptw, "config:5" ); 111 PMU_FORMAT_ATTR(mtc, "config:9" ); 112 PMU_FORMAT_ATTR(tsc, "config:10" ); 113 PMU_FORMAT_ATTR(noretcomp, "config:11" ); 114 PMU_FORMAT_ATTR(ptw, "config:12" ); 115 PMU_FORMAT_ATTR(branch, "config:13" ); 116 PMU_FORMAT_ATTR(event, "config:31" ); 117 PMU_FORMAT_ATTR(notnt, "config:55" ); 118 PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); 119 PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); 120 PMU_FORMAT_ATTR(psb_period, "config:24-27" ); 121 122 static struct attribute *pt_formats_attr[] = { 123 &format_attr_pt.attr, 124 &format_attr_cyc.attr, 125 &format_attr_pwr_evt.attr, 126 &format_attr_event.attr, 127 &format_attr_notnt.attr, 128 &format_attr_fup_on_ptw.attr, 129 &format_attr_mtc.attr, 130 &format_attr_tsc.attr, 131 &format_attr_noretcomp.attr, 132 &format_attr_ptw.attr, 133 &format_attr_branch.attr, 134 &format_attr_mtc_period.attr, 135 &format_attr_cyc_thresh.attr, 136 &format_attr_psb_period.attr, 137 NULL, 138 }; 139 140 static struct attribute_group pt_format_group = { 141 .name = "format", 142 .attrs = pt_formats_attr, 143 }; 144 145 static ssize_t 146 pt_timing_attr_show(struct device *dev, struct device_attribute *attr, 147 char *page) 148 { 149 struct perf_pmu_events_attr *pmu_attr = 150 container_of(attr, struct perf_pmu_events_attr, attr); 151 152 switch (pmu_attr->id) { 153 case 0: 154 return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio); 155 case 1: 156 return sprintf(page, "%u:%u\n", 157 pt_pmu.tsc_art_num, 158 pt_pmu.tsc_art_den); 159 default: 160 break; 161 } 162 163 return -EINVAL; 164 } 165 166 PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0, 167 pt_timing_attr_show); 168 PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1, 169 pt_timing_attr_show); 170 171 static struct attribute *pt_timing_attr[] = { 172 
&timing_attr_max_nonturbo_ratio.attr.attr, 173 &timing_attr_tsc_art_ratio.attr.attr, 174 NULL, 175 }; 176 177 static struct attribute_group pt_timing_group = { 178 .attrs = pt_timing_attr, 179 }; 180 181 static const struct attribute_group *pt_attr_groups[] = { 182 &pt_cap_group, 183 &pt_format_group, 184 &pt_timing_group, 185 NULL, 186 }; 187 188 static int __init pt_pmu_hw_init(void) 189 { 190 struct dev_ext_attribute *de_attrs; 191 struct attribute **attrs; 192 size_t size; 193 u64 reg; 194 int ret; 195 long i; 196 197 rdmsrl(MSR_PLATFORM_INFO, reg); 198 pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8; 199 200 /* 201 * if available, read in TSC to core crystal clock ratio, 202 * otherwise, zero for numerator stands for "not enumerated" 203 * as per SDM 204 */ 205 if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) { 206 u32 eax, ebx, ecx, edx; 207 208 cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx); 209 210 pt_pmu.tsc_art_num = ebx; 211 pt_pmu.tsc_art_den = eax; 212 } 213 214 /* model-specific quirks */ 215 switch (boot_cpu_data.x86_vfm) { 216 case INTEL_BROADWELL: 217 case INTEL_BROADWELL_D: 218 case INTEL_BROADWELL_G: 219 case INTEL_BROADWELL_X: 220 /* not setting BRANCH_EN will #GP, erratum BDM106 */ 221 pt_pmu.branch_en_always_on = true; 222 break; 223 default: 224 break; 225 } 226 227 if (boot_cpu_has(X86_FEATURE_VMX)) { 228 /* 229 * Intel SDM, 36.5 "Tracing post-VMXON" says that 230 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace 231 * post-VMXON. 
232 */ 233 rdmsrl(MSR_IA32_VMX_MISC, reg); 234 if (reg & BIT(14)) 235 pt_pmu.vmx = true; 236 } 237 238 for (i = 0; i < PT_CPUID_LEAVES; i++) { 239 cpuid_count(20, i, 240 &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], 241 &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM], 242 &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM], 243 &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]); 244 } 245 246 ret = -ENOMEM; 247 size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); 248 attrs = kzalloc(size, GFP_KERNEL); 249 if (!attrs) 250 goto fail; 251 252 size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); 253 de_attrs = kzalloc(size, GFP_KERNEL); 254 if (!de_attrs) 255 goto fail; 256 257 for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { 258 struct dev_ext_attribute *de_attr = de_attrs + i; 259 260 de_attr->attr.attr.name = pt_caps[i].name; 261 262 sysfs_attr_init(&de_attr->attr.attr); 263 264 de_attr->attr.attr.mode = S_IRUGO; 265 de_attr->attr.show = pt_cap_show; 266 de_attr->var = (void *)i; 267 268 attrs[i] = &de_attr->attr.attr; 269 } 270 271 pt_cap_group.attrs = attrs; 272 273 return 0; 274 275 fail: 276 kfree(attrs); 277 278 return ret; 279 } 280 281 #define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ 282 RTIT_CTL_CYC_THRESH | \ 283 RTIT_CTL_PSB_FREQ) 284 285 #define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ 286 RTIT_CTL_MTC_RANGE) 287 288 #define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \ 289 RTIT_CTL_FUP_ON_PTW) 290 291 /* 292 * Bit 0 (TraceEn) in the attr.config is meaningless as the 293 * corresponding bit in the RTIT_CTL can only be controlled 294 * by the driver; therefore, repurpose it to mean: pass 295 * through the bit that was previously assumed to be always 296 * on for PT, thereby allowing the user to *not* set it if 297 * they so wish. See also pt_event_valid() and pt_config(). 
298 */ 299 #define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN 300 301 #define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \ 302 RTIT_CTL_TSC_EN | \ 303 RTIT_CTL_DISRETC | \ 304 RTIT_CTL_BRANCH_EN | \ 305 RTIT_CTL_CYC_PSB | \ 306 RTIT_CTL_MTC | \ 307 RTIT_CTL_PWR_EVT_EN | \ 308 RTIT_CTL_EVENT_EN | \ 309 RTIT_CTL_NOTNT | \ 310 RTIT_CTL_FUP_ON_PTW | \ 311 RTIT_CTL_PTW_EN) 312 313 static bool pt_event_valid(struct perf_event *event) 314 { 315 u64 config = event->attr.config; 316 u64 allowed, requested; 317 318 if ((config & PT_CONFIG_MASK) != config) 319 return false; 320 321 if (config & RTIT_CTL_CYC_PSB) { 322 if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc)) 323 return false; 324 325 allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods); 326 requested = (config & RTIT_CTL_PSB_FREQ) >> 327 RTIT_CTL_PSB_FREQ_OFFSET; 328 if (requested && (!(allowed & BIT(requested)))) 329 return false; 330 331 allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds); 332 requested = (config & RTIT_CTL_CYC_THRESH) >> 333 RTIT_CTL_CYC_THRESH_OFFSET; 334 if (requested && (!(allowed & BIT(requested)))) 335 return false; 336 } 337 338 if (config & RTIT_CTL_MTC) { 339 /* 340 * In the unlikely case that CPUID lists valid mtc periods, 341 * but not the mtc capability, drop out here. 342 * 343 * Spec says that setting mtc period bits while mtc bit in 344 * CPUID is 0 will #GP, so better safe than sorry. 
345 */ 346 if (!intel_pt_validate_hw_cap(PT_CAP_mtc)) 347 return false; 348 349 allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods); 350 if (!allowed) 351 return false; 352 353 requested = (config & RTIT_CTL_MTC_RANGE) >> 354 RTIT_CTL_MTC_RANGE_OFFSET; 355 356 if (!(allowed & BIT(requested))) 357 return false; 358 } 359 360 if (config & RTIT_CTL_PWR_EVT_EN && 361 !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) 362 return false; 363 364 if (config & RTIT_CTL_EVENT_EN && 365 !intel_pt_validate_hw_cap(PT_CAP_event_trace)) 366 return false; 367 368 if (config & RTIT_CTL_NOTNT && 369 !intel_pt_validate_hw_cap(PT_CAP_tnt_disable)) 370 return false; 371 372 if (config & RTIT_CTL_PTW) { 373 if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) 374 return false; 375 376 /* FUPonPTW without PTW doesn't make sense */ 377 if ((config & RTIT_CTL_FUP_ON_PTW) && 378 !(config & RTIT_CTL_PTW_EN)) 379 return false; 380 } 381 382 /* 383 * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config 384 * clears the assumption that BranchEn must always be enabled, 385 * as was the case with the first implementation of PT. 386 * If this bit is not set, the legacy behavior is preserved 387 * for compatibility with the older userspace. 388 * 389 * Re-using bit 0 for this purpose is fine because it is never 390 * directly set by the user; previous attempts at setting it in 391 * the attr.config resulted in -EINVAL. 392 */ 393 if (config & RTIT_CTL_PASSTHROUGH) { 394 /* 395 * Disallow not setting BRANCH_EN where BRANCH_EN is 396 * always required. 397 */ 398 if (pt_pmu.branch_en_always_on && 399 !(config & RTIT_CTL_BRANCH_EN)) 400 return false; 401 } else { 402 /* 403 * Disallow BRANCH_EN without the PASSTHROUGH. 
404 */ 405 if (config & RTIT_CTL_BRANCH_EN) 406 return false; 407 } 408 409 return true; 410 } 411 412 /* 413 * PT configuration helpers 414 * These all are cpu affine and operate on a local PT 415 */ 416 417 static void pt_config_start(struct perf_event *event) 418 { 419 struct pt *pt = this_cpu_ptr(&pt_ctx); 420 u64 ctl = event->hw.aux_config; 421 422 if (READ_ONCE(event->hw.aux_paused)) 423 return; 424 425 ctl |= RTIT_CTL_TRACEEN; 426 if (READ_ONCE(pt->vmx_on)) 427 perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); 428 else 429 wrmsrl(MSR_IA32_RTIT_CTL, ctl); 430 431 WRITE_ONCE(event->hw.aux_config, ctl); 432 } 433 434 /* Address ranges and their corresponding msr configuration registers */ 435 static const struct pt_address_range { 436 unsigned long msr_a; 437 unsigned long msr_b; 438 unsigned int reg_off; 439 } pt_address_ranges[] = { 440 { 441 .msr_a = MSR_IA32_RTIT_ADDR0_A, 442 .msr_b = MSR_IA32_RTIT_ADDR0_B, 443 .reg_off = RTIT_CTL_ADDR0_OFFSET, 444 }, 445 { 446 .msr_a = MSR_IA32_RTIT_ADDR1_A, 447 .msr_b = MSR_IA32_RTIT_ADDR1_B, 448 .reg_off = RTIT_CTL_ADDR1_OFFSET, 449 }, 450 { 451 .msr_a = MSR_IA32_RTIT_ADDR2_A, 452 .msr_b = MSR_IA32_RTIT_ADDR2_B, 453 .reg_off = RTIT_CTL_ADDR2_OFFSET, 454 }, 455 { 456 .msr_a = MSR_IA32_RTIT_ADDR3_A, 457 .msr_b = MSR_IA32_RTIT_ADDR3_B, 458 .reg_off = RTIT_CTL_ADDR3_OFFSET, 459 } 460 }; 461 462 static u64 pt_config_filters(struct perf_event *event) 463 { 464 struct pt_filters *filters = event->hw.addr_filters; 465 struct pt *pt = this_cpu_ptr(&pt_ctx); 466 unsigned int range = 0; 467 u64 rtit_ctl = 0; 468 469 if (!filters) 470 return 0; 471 472 perf_event_addr_filters_sync(event); 473 474 for (range = 0; range < filters->nr_filters; range++) { 475 struct pt_filter *filter = &filters->filter[range]; 476 477 /* 478 * Note, if the range has zero start/end addresses due 479 * to its dynamic object not being loaded yet, we just 480 * go ahead and program zeroed range, which will simply 481 * produce no data. 
Note^2: if executable code at 0x0 482 * is a concern, we can set up an "invalid" configuration 483 * such as msr_b < msr_a. 484 */ 485 486 /* avoid redundant msr writes */ 487 if (pt->filters.filter[range].msr_a != filter->msr_a) { 488 wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); 489 pt->filters.filter[range].msr_a = filter->msr_a; 490 } 491 492 if (pt->filters.filter[range].msr_b != filter->msr_b) { 493 wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); 494 pt->filters.filter[range].msr_b = filter->msr_b; 495 } 496 497 rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off; 498 } 499 500 return rtit_ctl; 501 } 502 503 static void pt_config(struct perf_event *event) 504 { 505 struct pt *pt = this_cpu_ptr(&pt_ctx); 506 struct pt_buffer *buf = perf_get_aux(&pt->handle); 507 u64 reg; 508 509 /* First round: clear STATUS, in particular the PSB byte counter. */ 510 if (!event->hw.aux_config) { 511 perf_event_itrace_started(event); 512 wrmsrl(MSR_IA32_RTIT_STATUS, 0); 513 } 514 515 reg = pt_config_filters(event); 516 reg |= RTIT_CTL_TRACEEN; 517 if (!buf->single) 518 reg |= RTIT_CTL_TOPA; 519 520 /* 521 * Previously, we had BRANCH_EN on by default, but now that PT has 522 * grown features outside of branch tracing, it is useful to allow 523 * the user to disable it. Setting bit 0 in the event's attr.config 524 * allows BRANCH_EN to pass through instead of being always on. See 525 * also the comment in pt_event_valid(). 526 */ 527 if (event->attr.config & BIT(0)) { 528 reg |= event->attr.config & RTIT_CTL_BRANCH_EN; 529 } else { 530 reg |= RTIT_CTL_BRANCH_EN; 531 } 532 533 if (!event->attr.exclude_kernel) 534 reg |= RTIT_CTL_OS; 535 if (!event->attr.exclude_user) 536 reg |= RTIT_CTL_USR; 537 538 reg |= (event->attr.config & PT_CONFIG_MASK); 539 540 event->hw.aux_config = reg; 541 542 /* 543 * Allow resume before starting so as not to overwrite a value set by a 544 * PMI. 
545 */ 546 barrier(); 547 WRITE_ONCE(pt->resume_allowed, 1); 548 /* Configuration is complete, it is now OK to handle an NMI */ 549 barrier(); 550 WRITE_ONCE(pt->handle_nmi, 1); 551 barrier(); 552 pt_config_start(event); 553 barrier(); 554 /* 555 * Allow pause after starting so its pt_config_stop() doesn't race with 556 * pt_config_start(). 557 */ 558 WRITE_ONCE(pt->pause_allowed, 1); 559 } 560 561 static void pt_config_stop(struct perf_event *event) 562 { 563 struct pt *pt = this_cpu_ptr(&pt_ctx); 564 u64 ctl = READ_ONCE(event->hw.aux_config); 565 566 /* may be already stopped by a PMI */ 567 if (!(ctl & RTIT_CTL_TRACEEN)) 568 return; 569 570 ctl &= ~RTIT_CTL_TRACEEN; 571 if (!READ_ONCE(pt->vmx_on)) 572 wrmsrl(MSR_IA32_RTIT_CTL, ctl); 573 574 WRITE_ONCE(event->hw.aux_config, ctl); 575 576 /* 577 * A wrmsr that disables trace generation serializes other PT 578 * registers and causes all data packets to be written to memory, 579 * but a fence is required for the data to become globally visible. 580 * 581 * The below WMB, separating data store and aux_head store matches 582 * the consumer's RMB that separates aux_head load and data load. 
583 */ 584 wmb(); 585 } 586 587 /** 588 * struct topa - ToPA metadata 589 * @list: linkage to struct pt_buffer's list of tables 590 * @offset: offset of the first entry in this table in the buffer 591 * @size: total size of all entries in this table 592 * @last: index of the last initialized entry in this table 593 * @z_count: how many times the first entry repeats 594 */ 595 struct topa { 596 struct list_head list; 597 u64 offset; 598 size_t size; 599 int last; 600 unsigned int z_count; 601 }; 602 603 /* 604 * Keep ToPA table-related metadata on the same page as the actual table, 605 * taking up a few words from the top 606 */ 607 608 #define TENTS_PER_PAGE \ 609 ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry)) 610 611 /** 612 * struct topa_page - page-sized ToPA table with metadata at the top 613 * @table: actual ToPA table entries, as understood by PT hardware 614 * @topa: metadata 615 */ 616 struct topa_page { 617 struct topa_entry table[TENTS_PER_PAGE]; 618 struct topa topa; 619 }; 620 621 static inline struct topa_page *topa_to_page(struct topa *topa) 622 { 623 return container_of(topa, struct topa_page, topa); 624 } 625 626 static inline struct topa_page *topa_entry_to_page(struct topa_entry *te) 627 { 628 return (struct topa_page *)((unsigned long)te & PAGE_MASK); 629 } 630 631 static inline phys_addr_t topa_pfn(struct topa *topa) 632 { 633 return PFN_DOWN(virt_to_phys(topa_to_page(topa))); 634 } 635 636 /* make -1 stand for the last table entry */ 637 #define TOPA_ENTRY(t, i) \ 638 ((i) == -1 \ 639 ? 
&topa_to_page(t)->table[(t)->last] \ 640 : &topa_to_page(t)->table[(i)]) 641 #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size)) 642 #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size) 643 644 static void pt_config_buffer(struct pt_buffer *buf) 645 { 646 struct pt *pt = this_cpu_ptr(&pt_ctx); 647 u64 reg, mask; 648 void *base; 649 650 if (buf->single) { 651 base = buf->data_pages[0]; 652 mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7; 653 } else { 654 base = topa_to_page(buf->cur)->table; 655 mask = (u64)buf->cur_idx; 656 } 657 658 reg = virt_to_phys(base); 659 if (pt->output_base != reg) { 660 pt->output_base = reg; 661 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg); 662 } 663 664 reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32); 665 if (pt->output_mask != reg) { 666 pt->output_mask = reg; 667 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); 668 } 669 } 670 671 /** 672 * topa_alloc() - allocate page-sized ToPA table 673 * @cpu: CPU on which to allocate. 674 * @gfp: Allocation flags. 675 * 676 * Return: On success, return the pointer to ToPA table page. 677 */ 678 static struct topa *topa_alloc(int cpu, gfp_t gfp) 679 { 680 int node = cpu_to_node(cpu); 681 struct topa_page *tp; 682 struct page *p; 683 684 p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); 685 if (!p) 686 return NULL; 687 688 tp = page_address(p); 689 tp->topa.last = 0; 690 691 /* 692 * In case of singe-entry ToPA, always put the self-referencing END 693 * link as the 2nd entry in the table 694 */ 695 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { 696 TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT; 697 TOPA_ENTRY(&tp->topa, 1)->end = 1; 698 } 699 700 return &tp->topa; 701 } 702 703 /** 704 * topa_free() - free a page-sized ToPA table 705 * @topa: Table to deallocate. 
706 */ 707 static void topa_free(struct topa *topa) 708 { 709 free_page((unsigned long)topa); 710 } 711 712 /** 713 * topa_insert_table() - insert a ToPA table into a buffer 714 * @buf: PT buffer that's being extended. 715 * @topa: New topa table to be inserted. 716 * 717 * If it's the first table in this buffer, set up buffer's pointers 718 * accordingly; otherwise, add a END=1 link entry to @topa to the current 719 * "last" table and adjust the last table pointer to @topa. 720 */ 721 static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) 722 { 723 struct topa *last = buf->last; 724 725 list_add_tail(&topa->list, &buf->tables); 726 727 if (!buf->first) { 728 buf->first = buf->last = buf->cur = topa; 729 return; 730 } 731 732 topa->offset = last->offset + last->size; 733 buf->last = topa; 734 735 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) 736 return; 737 738 BUG_ON(last->last != TENTS_PER_PAGE - 1); 739 740 TOPA_ENTRY(last, -1)->base = topa_pfn(topa); 741 TOPA_ENTRY(last, -1)->end = 1; 742 } 743 744 /** 745 * topa_table_full() - check if a ToPA table is filled up 746 * @topa: ToPA table. 747 */ 748 static bool topa_table_full(struct topa *topa) 749 { 750 /* single-entry ToPA is a special case */ 751 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) 752 return !!topa->last; 753 754 return topa->last == TENTS_PER_PAGE - 1; 755 } 756 757 /** 758 * topa_insert_pages() - create a list of ToPA tables 759 * @buf: PT buffer being initialized. 760 * @cpu: CPU on which to allocate. 761 * @gfp: Allocation flags. 762 * 763 * This initializes a list of ToPA tables with entries from 764 * the data_pages provided by rb_alloc_aux(). 765 * 766 * Return: 0 on success or error code. 
767 */ 768 static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp) 769 { 770 struct topa *topa = buf->last; 771 int order = 0; 772 struct page *p; 773 774 p = virt_to_page(buf->data_pages[buf->nr_pages]); 775 if (PagePrivate(p)) 776 order = page_private(p); 777 778 if (topa_table_full(topa)) { 779 topa = topa_alloc(cpu, gfp); 780 if (!topa) 781 return -ENOMEM; 782 783 topa_insert_table(buf, topa); 784 } 785 786 if (topa->z_count == topa->last - 1) { 787 if (order == TOPA_ENTRY(topa, topa->last - 1)->size) 788 topa->z_count++; 789 } 790 791 TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; 792 TOPA_ENTRY(topa, -1)->size = order; 793 if (!buf->snapshot && 794 !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { 795 TOPA_ENTRY(topa, -1)->intr = 1; 796 TOPA_ENTRY(topa, -1)->stop = 1; 797 } 798 799 topa->last++; 800 topa->size += sizes(order); 801 802 buf->nr_pages += 1ul << order; 803 804 return 0; 805 } 806 807 /** 808 * pt_topa_dump() - print ToPA tables and their entries 809 * @buf: PT buffer. 810 */ 811 static void pt_topa_dump(struct pt_buffer *buf) 812 { 813 struct topa *topa; 814 815 list_for_each_entry(topa, &buf->tables, list) { 816 struct topa_page *tp = topa_to_page(topa); 817 int i; 818 819 pr_debug("# table @%p, off %llx size %zx\n", tp->table, 820 topa->offset, topa->size); 821 for (i = 0; i < TENTS_PER_PAGE; i++) { 822 pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", 823 &tp->table[i], 824 (unsigned long)tp->table[i].base << TOPA_SHIFT, 825 sizes(tp->table[i].size), 826 tp->table[i].end ? 'E' : ' ', 827 tp->table[i].intr ? 'I' : ' ', 828 tp->table[i].stop ? 'S' : ' ', 829 *(u64 *)&tp->table[i]); 830 if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && 831 tp->table[i].stop) || 832 tp->table[i].end) 833 break; 834 if (!i && topa->z_count) 835 i += topa->z_count; 836 } 837 } 838 } 839 840 /** 841 * pt_buffer_advance() - advance to the next output region 842 * @buf: PT buffer. 
843 * 844 * Advance the current pointers in the buffer to the next ToPA entry. 845 */ 846 static void pt_buffer_advance(struct pt_buffer *buf) 847 { 848 buf->output_off = 0; 849 buf->cur_idx++; 850 851 if (buf->cur_idx == buf->cur->last) { 852 if (buf->cur == buf->last) { 853 buf->cur = buf->first; 854 buf->wrapped = true; 855 } else { 856 buf->cur = list_entry(buf->cur->list.next, struct topa, 857 list); 858 } 859 buf->cur_idx = 0; 860 } 861 } 862 863 /** 864 * pt_update_head() - calculate current offsets and sizes 865 * @pt: Per-cpu pt context. 866 * 867 * Update buffer's current write pointer position and data size. 868 */ 869 static void pt_update_head(struct pt *pt) 870 { 871 struct pt_buffer *buf = perf_get_aux(&pt->handle); 872 bool wrapped = buf->wrapped; 873 u64 topa_idx, base, old; 874 875 buf->wrapped = false; 876 877 if (buf->single) { 878 local_set(&buf->data_size, buf->output_off); 879 return; 880 } 881 882 /* offset of the first region in this table from the beginning of buf */ 883 base = buf->cur->offset + buf->output_off; 884 885 /* offset of the current output region within this table */ 886 for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) 887 base += TOPA_ENTRY_SIZE(buf->cur, topa_idx); 888 889 if (buf->snapshot) { 890 local_set(&buf->data_size, base); 891 } else { 892 old = (local64_xchg(&buf->head, base) & 893 ((buf->nr_pages << PAGE_SHIFT) - 1)); 894 if (base < old || (base == old && wrapped)) 895 base += buf->nr_pages << PAGE_SHIFT; 896 897 local_add(base - old, &buf->data_size); 898 } 899 } 900 901 /** 902 * pt_buffer_region() - obtain current output region's address 903 * @buf: PT buffer. 904 */ 905 static void *pt_buffer_region(struct pt_buffer *buf) 906 { 907 return phys_to_virt((phys_addr_t)TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT); 908 } 909 910 /** 911 * pt_buffer_region_size() - obtain current output region's size 912 * @buf: PT buffer. 
913 */ 914 static size_t pt_buffer_region_size(struct pt_buffer *buf) 915 { 916 return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx); 917 } 918 919 /** 920 * pt_handle_status() - take care of possible status conditions 921 * @pt: Per-cpu pt context. 922 */ 923 static void pt_handle_status(struct pt *pt) 924 { 925 struct pt_buffer *buf = perf_get_aux(&pt->handle); 926 int advance = 0; 927 u64 status; 928 929 rdmsrl(MSR_IA32_RTIT_STATUS, status); 930 931 if (status & RTIT_STATUS_ERROR) { 932 pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); 933 pt_topa_dump(buf); 934 status &= ~RTIT_STATUS_ERROR; 935 } 936 937 if (status & RTIT_STATUS_STOPPED) { 938 status &= ~RTIT_STATUS_STOPPED; 939 940 /* 941 * On systems that only do single-entry ToPA, hitting STOP 942 * means we are already losing data; need to let the decoder 943 * know. 944 */ 945 if (!buf->single && 946 (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || 947 buf->output_off == pt_buffer_region_size(buf))) { 948 perf_aux_output_flag(&pt->handle, 949 PERF_AUX_FLAG_TRUNCATED); 950 advance++; 951 } 952 } 953 954 /* 955 * Also on single-entry ToPA implementations, interrupt will come 956 * before the output reaches its output region's boundary. 957 */ 958 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && 959 !buf->snapshot && 960 pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { 961 void *head = pt_buffer_region(buf); 962 963 /* everything within this margin needs to be zeroed out */ 964 memset(head + buf->output_off, 0, 965 pt_buffer_region_size(buf) - 966 buf->output_off); 967 advance++; 968 } 969 970 if (advance) 971 pt_buffer_advance(buf); 972 973 wrmsrl(MSR_IA32_RTIT_STATUS, status); 974 } 975 976 /** 977 * pt_read_offset() - translate registers into buffer pointers 978 * @buf: PT buffer. 979 * 980 * Set buffer's output pointers from MSR values. 
981 */ 982 static void pt_read_offset(struct pt_buffer *buf) 983 { 984 struct pt *pt = this_cpu_ptr(&pt_ctx); 985 struct topa_page *tp; 986 987 if (!buf->single) { 988 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base); 989 tp = phys_to_virt(pt->output_base); 990 buf->cur = &tp->topa; 991 } 992 993 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask); 994 /* offset within current output region */ 995 buf->output_off = pt->output_mask >> 32; 996 /* index of current output region within this table */ 997 if (!buf->single) 998 buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7; 999 } 1000 1001 static struct topa_entry * 1002 pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg) 1003 { 1004 struct topa_page *tp; 1005 struct topa *topa; 1006 unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0; 1007 1008 /* 1009 * Indicates a bug in the caller. 1010 */ 1011 if (WARN_ON_ONCE(pg >= buf->nr_pages)) 1012 return NULL; 1013 1014 /* 1015 * First, find the ToPA table where @pg fits. With high 1016 * order allocations, there shouldn't be many of these. 1017 */ 1018 list_for_each_entry(topa, &buf->tables, list) { 1019 if (topa->offset + topa->size > (unsigned long)pg << PAGE_SHIFT) 1020 goto found; 1021 } 1022 1023 /* 1024 * Hitting this means we have a problem in the ToPA 1025 * allocation code. 1026 */ 1027 WARN_ON_ONCE(1); 1028 1029 return NULL; 1030 1031 found: 1032 /* 1033 * Indicates a problem in the ToPA allocation code. 1034 */ 1035 if (WARN_ON_ONCE(topa->last == -1)) 1036 return NULL; 1037 1038 tp = topa_to_page(topa); 1039 cur_pg = PFN_DOWN(topa->offset); 1040 if (topa->z_count) { 1041 z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1); 1042 start_idx = topa->z_count + 1; 1043 } 1044 1045 /* 1046 * Multiple entries at the beginning of the table have the same size, 1047 * ideally all of them; if @pg falls there, the search is done. 
1048 */ 1049 if (pg >= cur_pg && pg < cur_pg + z_pg) { 1050 idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0); 1051 return &tp->table[idx]; 1052 } 1053 1054 /* 1055 * Otherwise, slow path: iterate through the remaining entries. 1056 */ 1057 for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) { 1058 if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg) 1059 return &tp->table[idx]; 1060 1061 cur_pg += TOPA_ENTRY_PAGES(topa, idx); 1062 } 1063 1064 /* 1065 * Means we couldn't find a ToPA entry in the table that does match. 1066 */ 1067 WARN_ON_ONCE(1); 1068 1069 return NULL; 1070 } 1071 1072 static struct topa_entry * 1073 pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te) 1074 { 1075 unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1); 1076 struct topa_page *tp; 1077 struct topa *topa; 1078 1079 tp = (struct topa_page *)table; 1080 if (tp->table != te) 1081 return --te; 1082 1083 topa = &tp->topa; 1084 if (topa == buf->first) 1085 topa = buf->last; 1086 else 1087 topa = list_prev_entry(topa, list); 1088 1089 tp = topa_to_page(topa); 1090 1091 return &tp->table[topa->last - 1]; 1092 } 1093 1094 /** 1095 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer 1096 * @buf: PT buffer. 1097 * @handle: Current output handle. 1098 * 1099 * Place INT and STOP marks to prevent overwriting old data that the consumer 1100 * hasn't yet collected and waking up the consumer after a certain fraction of 1101 * the buffer has filled up. Only needed and sensible for non-snapshot counters. 1102 * 1103 * This obviously relies on buf::head to figure out buffer markers, so it has 1104 * to be called after pt_buffer_reset_offsets() and before the hardware tracing 1105 * is enabled. 
1106 */ 1107 static int pt_buffer_reset_markers(struct pt_buffer *buf, 1108 struct perf_output_handle *handle) 1109 1110 { 1111 unsigned long head = local64_read(&buf->head); 1112 unsigned long idx, npages, wakeup; 1113 1114 if (buf->single) 1115 return 0; 1116 1117 /* can't stop in the middle of an output region */ 1118 if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) { 1119 perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); 1120 return -EINVAL; 1121 } 1122 1123 1124 /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ 1125 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) 1126 return 0; 1127 1128 /* clear STOP and INT from current entry */ 1129 if (buf->stop_te) { 1130 buf->stop_te->stop = 0; 1131 buf->stop_te->intr = 0; 1132 } 1133 1134 if (buf->intr_te) 1135 buf->intr_te->intr = 0; 1136 1137 /* how many pages till the STOP marker */ 1138 npages = handle->size >> PAGE_SHIFT; 1139 1140 /* if it's on a page boundary, fill up one more page */ 1141 if (!offset_in_page(head + handle->size + 1)) 1142 npages++; 1143 1144 idx = (head >> PAGE_SHIFT) + npages; 1145 idx &= buf->nr_pages - 1; 1146 1147 if (idx != buf->stop_pos) { 1148 buf->stop_pos = idx; 1149 buf->stop_te = pt_topa_entry_for_page(buf, idx); 1150 buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te); 1151 } 1152 1153 wakeup = handle->wakeup >> PAGE_SHIFT; 1154 1155 /* in the worst case, wake up the consumer one page before hard stop */ 1156 idx = (head >> PAGE_SHIFT) + npages - 1; 1157 if (idx > wakeup) 1158 idx = wakeup; 1159 1160 idx &= buf->nr_pages - 1; 1161 if (idx != buf->intr_pos) { 1162 buf->intr_pos = idx; 1163 buf->intr_te = pt_topa_entry_for_page(buf, idx); 1164 buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te); 1165 } 1166 1167 buf->stop_te->stop = 1; 1168 buf->stop_te->intr = 1; 1169 buf->intr_te->intr = 1; 1170 1171 return 0; 1172 } 1173 1174 /** 1175 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head 1176 * @buf: 
PT buffer. 1177 * @head: Write pointer (aux_head) from AUX buffer. 1178 * 1179 * Find the ToPA table and entry corresponding to given @head and set buffer's 1180 * "current" pointers accordingly. This is done after we have obtained the 1181 * current aux_head position from a successful call to perf_aux_output_begin() 1182 * to make sure the hardware is writing to the right place. 1183 * 1184 * This function modifies buf::{cur,cur_idx,output_off} that will be programmed 1185 * into PT msrs when the tracing is enabled and buf::head and buf::data_size, 1186 * which are used to determine INT and STOP markers' locations by a subsequent 1187 * call to pt_buffer_reset_markers(). 1188 */ 1189 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) 1190 { 1191 struct topa_page *cur_tp; 1192 struct topa_entry *te; 1193 int pg; 1194 1195 if (buf->snapshot) 1196 head &= (buf->nr_pages << PAGE_SHIFT) - 1; 1197 1198 if (!buf->single) { 1199 pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); 1200 te = pt_topa_entry_for_page(buf, pg); 1201 1202 cur_tp = topa_entry_to_page(te); 1203 buf->cur = &cur_tp->topa; 1204 buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); 1205 buf->output_off = head & (pt_buffer_region_size(buf) - 1); 1206 } else { 1207 buf->output_off = head; 1208 } 1209 1210 local64_set(&buf->head, head); 1211 local_set(&buf->data_size, 0); 1212 } 1213 1214 /** 1215 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer 1216 * @buf: PT buffer. 1217 */ 1218 static void pt_buffer_fini_topa(struct pt_buffer *buf) 1219 { 1220 struct topa *topa, *iter; 1221 1222 if (buf->single) 1223 return; 1224 1225 list_for_each_entry_safe(topa, iter, &buf->tables, list) { 1226 /* 1227 * right now, this is in free_aux() path only, so 1228 * no need to unlink this table from the list 1229 */ 1230 topa_free(topa); 1231 } 1232 } 1233 1234 /** 1235 * pt_buffer_init_topa() - initialize ToPA table for pt buffer 1236 * @buf: PT buffer. 1237 * @cpu: CPU on which to allocate. 
1238 * @nr_pages: No. of pages to allocate. 1239 * @gfp: Allocation flags. 1240 * 1241 * Return: 0 on success or error code. 1242 */ 1243 static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, 1244 unsigned long nr_pages, gfp_t gfp) 1245 { 1246 struct topa *topa; 1247 int err; 1248 1249 topa = topa_alloc(cpu, gfp); 1250 if (!topa) 1251 return -ENOMEM; 1252 1253 topa_insert_table(buf, topa); 1254 1255 while (buf->nr_pages < nr_pages) { 1256 err = topa_insert_pages(buf, cpu, gfp); 1257 if (err) { 1258 pt_buffer_fini_topa(buf); 1259 return -ENOMEM; 1260 } 1261 } 1262 1263 /* link last table to the first one, unless we're double buffering */ 1264 if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { 1265 TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first); 1266 TOPA_ENTRY(buf->last, -1)->end = 1; 1267 } 1268 1269 pt_topa_dump(buf); 1270 return 0; 1271 } 1272 1273 static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages) 1274 { 1275 struct page *p = virt_to_page(buf->data_pages[0]); 1276 int ret = -ENOTSUPP, order = 0; 1277 1278 /* 1279 * We can use single range output mode 1280 * + in snapshot mode, where we don't need interrupts; 1281 * + if the hardware supports it; 1282 * + if the entire buffer is one contiguous allocation. 1283 */ 1284 if (!buf->snapshot) 1285 goto out; 1286 1287 if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output)) 1288 goto out; 1289 1290 if (PagePrivate(p)) 1291 order = page_private(p); 1292 1293 if (1 << order != nr_pages) 1294 goto out; 1295 1296 /* 1297 * Some processors cannot always support single range for more than 1298 * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might 1299 * also be affected, so for now rather than trying to keep track of 1300 * which ones, just disable it for all. 
1301 */ 1302 if (nr_pages > 1) 1303 goto out; 1304 1305 buf->single = true; 1306 buf->nr_pages = nr_pages; 1307 ret = 0; 1308 out: 1309 return ret; 1310 } 1311 1312 /** 1313 * pt_buffer_setup_aux() - set up topa tables for a PT buffer 1314 * @event: Performance event 1315 * @pages: Array of pointers to buffer pages passed from perf core. 1316 * @nr_pages: Number of pages in the buffer. 1317 * @snapshot: If this is a snapshot/overwrite counter. 1318 * 1319 * This is a pmu::setup_aux callback that sets up ToPA tables and all the 1320 * bookkeeping for an AUX buffer. 1321 * 1322 * Return: Our private PT buffer structure. 1323 */ 1324 static void * 1325 pt_buffer_setup_aux(struct perf_event *event, void **pages, 1326 int nr_pages, bool snapshot) 1327 { 1328 struct pt_buffer *buf; 1329 int node, ret, cpu = event->cpu; 1330 1331 if (!nr_pages) 1332 return NULL; 1333 1334 /* 1335 * Only support AUX sampling in snapshot mode, where we don't 1336 * generate NMIs. 1337 */ 1338 if (event->attr.aux_sample_size && !snapshot) 1339 return NULL; 1340 1341 if (cpu == -1) 1342 cpu = raw_smp_processor_id(); 1343 node = cpu_to_node(cpu); 1344 1345 buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node); 1346 if (!buf) 1347 return NULL; 1348 1349 buf->snapshot = snapshot; 1350 buf->data_pages = pages; 1351 buf->stop_pos = -1; 1352 buf->intr_pos = -1; 1353 1354 INIT_LIST_HEAD(&buf->tables); 1355 1356 ret = pt_buffer_try_single(buf, nr_pages); 1357 if (!ret) 1358 return buf; 1359 1360 ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL); 1361 if (ret) { 1362 kfree(buf); 1363 return NULL; 1364 } 1365 1366 return buf; 1367 } 1368 1369 /** 1370 * pt_buffer_free_aux() - perf AUX deallocation path callback 1371 * @data: PT buffer. 
 */
static void pt_buffer_free_aux(void *data)
{
	struct pt_buffer *buf = data;

	/* tear down the ToPA tables first, then the bookkeeping structure */
	pt_buffer_fini_topa(buf);
	kfree(buf);
}

/*
 * pt_addr_filters_init() - allocate per-event address filter state
 * @event:	Event being initialised.
 *
 * Nothing is allocated when the hardware reports no address ranges. A child
 * event (one with event::parent set) starts out with a copy of its parent's
 * filters.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails.
 */
static int pt_addr_filters_init(struct perf_event *event)
{
	struct pt_filters *filters;
	/* no CPU binding: pass -1 as the node to the allocator */
	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);

	if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
		return 0;

	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
	if (!filters)
		return -ENOMEM;

	if (event->parent)
		memcpy(filters, event->parent->hw.addr_filters,
		       sizeof(*filters));

	event->hw.addr_filters = filters;

	return 0;
}

/* Free the per-event filter state and clear the pointer to it. */
static void pt_addr_filters_fini(struct perf_event *event)
{
	kfree(event->hw.addr_filters);
	event->hw.addr_filters = NULL;
}

#ifdef CONFIG_X86_64
/*
 * Clamp to a canonical address greater-than-or-equal-to the address given.
 * A non-canonical @vaddr becomes -BIT_ULL(vaddr_bits - 1), i.e. the value
 * with bit (vaddr_bits - 1) and every bit above it set.
 */
static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
{
	return __is_canonical_address(vaddr, vaddr_bits) ?
	       vaddr :
	       -BIT_ULL(vaddr_bits - 1);
}

/*
 * Clamp to a canonical address less-than-or-equal-to the address given.
 * A non-canonical @vaddr becomes BIT_ULL(vaddr_bits - 1) - 1, i.e. the value
 * with only the bits below bit (vaddr_bits - 1) set.
 */
static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
{
	return __is_canonical_address(vaddr, vaddr_bits) ?
	       vaddr :
	       BIT_ULL(vaddr_bits - 1) - 1;
}
#else
/* without CONFIG_X86_64 the address is passed through unchanged */
#define clamp_to_ge_canonical_addr(x, y) (x)
#define clamp_to_le_canonical_addr(x, y) (x)
#endif

/*
 * pt_event_addr_filters_validate() - check a list of address filters
 * @filters:	List of struct perf_addr_filter, linked through ::entry.
 *
 * Return: -EOPNOTSUPP if any filter has a zero size or the START action, or
 * if the list holds more filters than the number of address ranges reported
 * by the hardware; 0 otherwise.
 */
static int pt_event_addr_filters_validate(struct list_head *filters)
{
	struct perf_addr_filter *filter;
	int range = 0;

	list_for_each_entry(filter, filters, entry) {
		/*
		 * PT doesn't support single address triggers and
		 * 'start' filters.
		 */
		if (!filter->size ||
		    filter->action == PERF_ADDR_FILTER_ACTION_START)
			return -EOPNOTSUPP;

		if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
			return -EOPNOTSUPP;
	}

	return 0;
}

/*
 * pt_event_addr_filters_sync() - refresh the cached filter register values
 * @event:	Event whose filters changed.
 *
 * Walks the event's address filters together with event::addr_filter_ranges
 * and stores, per range, the msr_a/msr_b bounds and a config value in
 * event->hw.addr_filters, then records the number of ranges filled in.
 * Does nothing if the event has no filter state.
 */
static void pt_event_addr_filters_sync(struct perf_event *event)
{
	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
	unsigned long msr_a, msr_b;
	struct perf_addr_filter_range *fr = event->addr_filter_ranges;
	struct pt_filters *filters = event->hw.addr_filters;
	struct perf_addr_filter *filter;
	int range = 0;

	if (!filters)
		return;

	list_for_each_entry(filter, &head->list, entry) {
		/*
		 * NOTE(review): a filter with a dentry and a zero range start
		 * gets empty bounds; presumably this is a file-based filter
		 * not yet resolved to an address -- confirm.
		 */
		if (filter->path.dentry && !fr[range].start) {
			msr_a = msr_b = 0;
		} else {
			unsigned long n = fr[range].size - 1;
			unsigned long a = fr[range].start;
			unsigned long b;

			/* saturate the end of the range instead of wrapping */
			if (a > ULONG_MAX - n)
				b = ULONG_MAX;
			else
				b = a + n;
			/*
			 * Apply the offset. 64-bit addresses written to the
			 * MSRs must be canonical, but the range can encompass
			 * non-canonical addresses. Since software cannot
			 * execute at non-canonical addresses, adjusting to
			 * canonical addresses does not affect the result of the
			 * address filter.
			 */
			msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
			msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
			/* clamping left nothing of the range: use empty bounds */
			if (msr_b < msr_a)
				msr_a = msr_b = 0;
		}

		filters->filter[range].msr_a  = msr_a;
		filters->filter[range].msr_b  = msr_b;
		/*
		 * config is 1 for FILTER actions and 2 for everything else.
		 * NOTE(review): presumably the hardware's per-range config
		 * encoding -- confirm against the code that programs it.
		 */
		if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
			filters->filter[range].config = 1;
		else
			filters->filter[range].config = 2;
		range++;
	}

	filters->nr_filters = range;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 *
 * Stops tracing, closes the AUX transaction in progress with the amount of
 * data collected and, unless the event has been stopped in the meantime,
 * opens a new transaction and restarts tracing.
 */
void intel_pt_interrupt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;
	struct perf_event *event = pt->handle.event;

	/*
	 * There may be a dangling PT bit in the interrupt status register
	 * after PT has been disabled by pt_event_stop(). Make sure we don't
	 * do anything (particularly, re-enable) for this event here.
	 */
	if (!READ_ONCE(pt->handle_nmi))
		return;

	if (!event)
		return;

	pt_config_stop(event);

	buf = perf_get_aux(&pt->handle);
	if (!buf)
		return;

	pt_read_offset(buf);

	pt_handle_status(pt);

	pt_update_head(pt);

	/* hand over buf::data_size bytes and reset the counter to zero */
	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));

	/* hw.state == 0: the event was not stopped, so keep tracing */
	if (!event->hw.state) {
		int ret;

		buf = perf_aux_output_begin(&pt->handle, event);
		if (!buf) {
			event->hw.state = PERF_HES_STOPPED;
			WRITE_ONCE(pt->resume_allowed, 0);
			return;
		}

		pt_buffer_reset_offsets(buf, pt->handle.head);
		/* snapshot counters don't use PMI, so it's safe */
		ret = pt_buffer_reset_markers(buf, &pt->handle);
		if (ret) {
			/* close the transaction just opened, with no data */
			perf_aux_output_end(&pt->handle, 0);
			WRITE_ONCE(pt->resume_allowed, 0);
			return;
		}

		pt_config_buffer(buf);
		pt_config_start(event);
	}
}

/*
 * intel_pt_handle_vmx() - record a VMX on/off transition on this CPU
 * @on:	New value for pt::vmx_on.
 *
 * Does nothing when pt_pmu.vmx is set. Otherwise, with interrupts disabled,
 * stores @on in pt::vmx_on, flags an AUX transaction in progress as PARTIAL
 * and, when @on is zero and an event is active, rewrites RTIT_CTL from the
 * event's aux_config.
 */
void intel_pt_handle_vmx(int on)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct perf_event *event;
	unsigned long flags;

	/* PT plays nice with VMX, do nothing */
	if (pt_pmu.vmx)
		return;

	/*
	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
	 * sure to not try to set it while VMX is on. Disable
	 * interrupts to avoid racing with pmu callbacks;
	 * concurrent PMI should be handled fine.
	 */
	local_irq_save(flags);
	WRITE_ONCE(pt->vmx_on, on);

	/*
	 * If an AUX transaction is in progress, it will contain
	 * gap(s), so flag it PARTIAL to inform the user.
	 */
	event = pt->handle.event;
	if (event)
		perf_aux_output_flag(&pt->handle,
		                     PERF_AUX_FLAG_PARTIAL);

	/* Turn PTs back on */
	if (!on && event)
		wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);

/*
 * PMU callbacks
 */

/*
 * pt_event_start() - pmu::start callback
 * @event:	Event to start.
 * @mode:	PERF_EF_* flags; only PERF_EF_RESUME is examined here.
 *
 * With PERF_EF_RESUME, tracing is restarted only if resuming is currently
 * allowed and the status register shows the trace is neither active, in
 * error nor stopped. Otherwise a new AUX transaction is opened, the buffer
 * offsets (and, for non-snapshot buffers, markers) are reset and the
 * hardware is configured. On failure event->hw.state is set to
 * PERF_HES_STOPPED.
 */
static void pt_event_start(struct perf_event *event, int mode)
{
	struct hw_perf_event *hwc = &event->hw;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;

	if (mode & PERF_EF_RESUME) {
		if (READ_ONCE(pt->resume_allowed)) {
			u64 status;

			/*
			 * Only if the trace is not active and the error and
			 * stopped bits are clear, is it safe to start, but a
			 * PMI might have just cleared these, so resume_allowed
			 * must be checked again also.
			 */
			rdmsrl(MSR_IA32_RTIT_STATUS, status);
			if (!(status & (RTIT_STATUS_TRIGGEREN |
					RTIT_STATUS_ERROR |
					RTIT_STATUS_STOPPED)) &&
			   READ_ONCE(pt->resume_allowed))
				pt_config_start(event);
		}
		return;
	}

	buf = perf_aux_output_begin(&pt->handle, event);
	if (!buf)
		goto fail_stop;

	pt_buffer_reset_offsets(buf, pt->handle.head);
	/* markers are placed for non-snapshot buffers only */
	if (!buf->snapshot) {
		if (pt_buffer_reset_markers(buf, &pt->handle))
			goto fail_end_stop;
	}

	hwc->state = 0;

	pt_config_buffer(buf);
	pt_config(event);

	return;

fail_end_stop:
	/* close the transaction opened above without reporting any data */
	perf_aux_output_end(&pt->handle, 0);
fail_stop:
	hwc->state = PERF_HES_STOPPED;
}

/*
 * pt_event_stop() - pmu::stop callback
 * @event:	Event to stop.
 * @mode:	PERF_EF_* flags; PERF_EF_PAUSE and PERF_EF_UPDATE are examined.
 *
 * With PERF_EF_PAUSE, tracing is merely stopped, and only if pausing is
 * currently allowed. Otherwise PMI handling, pause and resume are disabled,
 * tracing is stopped and the event is marked PERF_HES_STOPPED; with
 * PERF_EF_UPDATE the collected data is additionally handed to the AUX
 * buffer and the transaction is closed.
 */
static void pt_event_stop(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	if (mode & PERF_EF_PAUSE) {
		if (READ_ONCE(pt->pause_allowed))
			pt_config_stop(event);
		return;
	}

	/*
	 * Protect against the PMI racing with disabling wrmsr,
	 * see comment in intel_pt_interrupt().
	 */
	WRITE_ONCE(pt->handle_nmi, 0);
	barrier();

	/*
	 * Prevent a resume from attempting to restart tracing, or a pause
	 * during a subsequent start. Do this after clearing handle_nmi so that
	 * pt_event_snapshot_aux() will not re-allow them.
	 */
	WRITE_ONCE(pt->pause_allowed, 0);
	WRITE_ONCE(pt->resume_allowed, 0);
	barrier();

	pt_config_stop(event);

	/* already marked stopped: nothing left to update */
	if (event->hw.state == PERF_HES_STOPPED)
		return;

	event->hw.state = PERF_HES_STOPPED;

	if (mode & PERF_EF_UPDATE) {
		struct pt_buffer *buf = perf_get_aux(&pt->handle);

		if (!buf)
			return;

		if (WARN_ON_ONCE(pt->handle.event != event))
			return;

		pt_read_offset(buf);

		pt_handle_status(pt);

		pt_update_head(pt);

		/*
		 * Snapshot buffers: handle.head takes the value that was in
		 * data_size, and data_size becomes the full buffer size.
		 */
		if (buf->snapshot)
			pt->handle.head =
				local_xchg(&buf->data_size,
					   buf->nr_pages << PAGE_SHIFT);
		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
	}
}

/*
 * pt_event_snapshot_aux() - pmu::snapshot_aux callback
 * @event:	Event being sampled.
 * @handle:	Output handle passed on to perf_output_copy_aux().
 * @size:	Number of bytes wanted.
 *
 * Stops tracing, copies @size bytes ending at the current write position
 * with perf_output_copy_aux() and restarts tracing if it was on.
 *
 * Return: the value returned by perf_output_copy_aux(), or 0 if there is no
 * AUX buffer or it is not a snapshot buffer.
 */
static long pt_event_snapshot_aux(struct perf_event *event,
				  struct perf_output_handle *handle,
				  unsigned long size)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	unsigned long from = 0, to;
	long ret;

	if (WARN_ON_ONCE(!buf))
		return 0;

	/*
	 * Sampling is only allowed on snapshot events;
	 * see pt_buffer_setup_aux().
	 */
	if (WARN_ON_ONCE(!buf->snapshot))
		return 0;

	/* Prevent pause/resume from attempting to start/stop tracing */
	WRITE_ONCE(pt->pause_allowed, 0);
	WRITE_ONCE(pt->resume_allowed, 0);
	barrier();
	/*
	 * There is no PT interrupt in this mode, so stop the trace and it will
	 * remain stopped while the buffer is copied.
	 */
	pt_config_stop(event);
	pt_read_offset(buf);
	pt_update_head(pt);

	/*
	 * Copy the range [to - size, to). When fewer than @size bytes precede
	 * @to, the start is offset by the buffer size so that it wraps around.
	 */
	to = local_read(&buf->data_size);
	if (to < size)
		from = buf->nr_pages << PAGE_SHIFT;
	from += to - size;

	ret = perf_output_copy_aux(&pt->handle, handle, from, to);

	/*
	 * Here, handle_nmi tells us if the tracing was on.
	 * If the tracing was on, restart it.
	 */
	if (READ_ONCE(pt->handle_nmi)) {
		WRITE_ONCE(pt->resume_allowed, 1);
		barrier();
		pt_config_start(event);
		barrier();
		WRITE_ONCE(pt->pause_allowed, 1);
	}

	return ret;
}

/* pmu::del callback: stop the event with PERF_EF_UPDATE; @mode is unused. */
static void pt_event_del(struct perf_event *event, int mode)
{
	pt_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pt_event_add() - pmu::add callback
 * @event:	Event to add.
 * @mode:	PERF_EF_* flags; PERF_EF_START requests an immediate start.
 *
 * Return: -EBUSY if this CPU's PT context already has an event attached to
 * its output handle, -EINVAL if PERF_EF_START was requested and the event
 * ended up stopped, 0 otherwise.
 */
static int pt_event_add(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct hw_perf_event *hwc = &event->hw;
	int ret = -EBUSY;

	if (pt->handle.event)
		goto fail;

	if (mode & PERF_EF_START) {
		pt_event_start(event, 0);
		ret = -EINVAL;
		/* pt_event_start() reports failure through hwc->state */
		if (hwc->state == PERF_HES_STOPPED)
			goto fail;
	} else {
		hwc->state = PERF_HES_STOPPED;
	}

	ret = 0;
fail:

	return ret;
}

/* pmu::read callback; there is nothing to do here. */
static void pt_event_read(struct perf_event *event)
{
}

/* event::destroy callback: undo what pt_event_init() set up. */
static void pt_event_destroy(struct perf_event *event)
{
	pt_addr_filters_fini(event);
	x86_del_exclusive(x86_lbr_exclusive_pt);
}

/*
 * pt_event_init() - pmu::event_init callback
 * @event:	Event to initialise.
 *
 * Return: -ENOENT if the event is not of this PMU's type, -EINVAL if it
 * fails pt_event_valid(), -EBUSY if the PT exclusive reference cannot be
 * taken, -ENOMEM if the address filter state cannot be allocated, 0 on
 * success.
 */
static int pt_event_init(struct perf_event *event)
{
	if (event->attr.type != pt_pmu.pmu.type)
		return -ENOENT;

	if (!pt_event_valid(event))
		return -EINVAL;

	if (x86_add_exclusive(x86_lbr_exclusive_pt))
		return -EBUSY;

	if (pt_addr_filters_init(event)) {
		/* drop the exclusive reference taken just above */
		x86_del_exclusive(x86_lbr_exclusive_pt);
		return -ENOMEM;
	}

	event->destroy = pt_event_destroy;

	return 0;
}

/* Stop the PT event attached to this CPU's output handle, if there is one. */
void cpu_emergency_stop_pt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	if (pt->handle.event)
		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
}

/* Return non-zero if @event belongs to the PT PMU. */
int is_intel_pt_event(struct perf_event *event)
{
	return event->pmu == &pt_pmu.pmu;
}

/*
 * pt_init() - probe the hardware and register the "intel_pt" PMU
 *
 * Return: -ENODEV if the CPU has no PT or no ToPA output support, -EBUSY if
 * tracing is already enabled on an online CPU, the error from
 * pt_pmu_hw_init() if that fails, otherwise the result of
 * perf_pmu_register().
 */
static __init int pt_init(void)
{
	int ret, cpu, prior_warn = 0;

	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);

	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
		return -ENODEV;

	/* count the online CPUs that already have TraceEn set */
	cpus_read_lock();
	for_each_online_cpu(cpu) {
		u64 ctl;

		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
		if (!ret && (ctl & RTIT_CTL_TRACEEN))
			prior_warn++;
	}
	cpus_read_unlock();

	if (prior_warn) {
		/*
		 * NOTE(review): the exclusive reference is taken and never
		 * dropped on this path; presumably this keeps other users of
		 * the exclusive slot away while PT is in use -- confirm.
		 */
		x86_add_exclusive(x86_lbr_exclusive_pt);
		pr_warn("PT is enabled at boot time, doing nothing\n");

		return -EBUSY;
	}

	ret = pt_pmu_hw_init();
	if (ret)
		return ret;

	if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
		pr_warn("ToPA output is not supported on this CPU\n");
		return -ENODEV;
	}

	/* plain assignment here; the remaining capabilities are OR-ed in below */
	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;

	pt_pmu.pmu.capabilities		|= PERF_PMU_CAP_EXCLUSIVE |
					   PERF_PMU_CAP_ITRACE |
					   PERF_PMU_CAP_AUX_PAUSE;
	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
	pt_pmu.pmu.event_init		 = pt_event_init;
	pt_pmu.pmu.add			 = pt_event_add;
	pt_pmu.pmu.del			 = pt_event_del;
	pt_pmu.pmu.start		 = pt_event_start;
	pt_pmu.pmu.stop			 = pt_event_stop;
	pt_pmu.pmu.snapshot_aux		 = pt_event_snapshot_aux;
	pt_pmu.pmu.read			 = pt_event_read;
	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
	pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
	pt_pmu.pmu.nr_addr_filters       =
		intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);

	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

	return ret;
}
arch_initcall(pt_init);