// SPDX-License-Identifier: GPL-2.0-only
/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/bits.h>
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "../perf_event.h"
#include "pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

/*
 * Capabilities of Intel PT hardware, such as number of address bits or
 * supported output schemes, are cached and exported to userspace as "caps"
 * attribute group of pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
 * relevant bits together with intel_pt traces.
 *
 * These are necessary for both trace decoding (payloads_lip, contains address
 * width encoded in IP-related packets), and event configuration (bitmasks with
 * permitted values for certain bit fields).
 */
#define PT_CAP(_n, _l, _r, _m)						\
	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
			    .reg = _r, .mask = _m }

static struct pt_cap_desc {
	const char	*name;
	u32		leaf;
	u8		reg;
	u32		mask;
} pt_caps[] = {
	PT_CAP(max_subleaf,		0, CPUID_EAX, 0xffffffff),
	PT_CAP(cr3_filtering,		0, CPUID_EBX, BIT(0)),
	PT_CAP(psb_cyc,			0, CPUID_EBX, BIT(1)),
	PT_CAP(ip_filtering,		0, CPUID_EBX, BIT(2)),
	PT_CAP(mtc,			0, CPUID_EBX, BIT(3)),
	PT_CAP(ptwrite,			0, CPUID_EBX, BIT(4)),
	PT_CAP(power_event_trace,	0, CPUID_EBX, BIT(5)),
	PT_CAP(event_trace,		0, CPUID_EBX, BIT(7)),
	PT_CAP(tnt_disable,		0, CPUID_EBX, BIT(8)),
	PT_CAP(topa_output,		0, CPUID_ECX, BIT(0)),
	PT_CAP(topa_multiple_entries,	0, CPUID_ECX, BIT(1)),
	PT_CAP(single_range_output,	0, CPUID_ECX, BIT(2)),
	PT_CAP(output_subsys,		0, CPUID_ECX, BIT(3)),
	PT_CAP(payloads_lip,		0, CPUID_ECX, BIT(31)),
	PT_CAP(num_address_ranges,	1, CPUID_EAX, 0x7),
	PT_CAP(mtc_periods,		1, CPUID_EAX, 0xffff0000),
	PT_CAP(cycle_thresholds,	1, CPUID_EBX, 0xffff),
	PT_CAP(psb_periods,		1, CPUID_EBX, 0xffff0000),
};

u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
{
	struct pt_cap_desc *cd = &pt_caps[capability];
	u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
	unsigned int shift = __ffs(cd->mask);

	return (c & cd->mask) >> shift;
}
EXPORT_SYMBOL_GPL(intel_pt_validate_cap);

u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
{
	return intel_pt_validate_cap(pt_pmu.caps, cap);
}
EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);

static ssize_t pt_cap_show(struct device *cdev,
			   struct device_attribute *attr,
			   char *buf)
{
	struct dev_ext_attribute *ea =
		container_of(attr, struct dev_ext_attribute, attr);
	enum pt_capabilities cap = (long)ea->var;

	return snprintf(buf, PAGE_SIZE, "%x\n", intel_pt_validate_hw_cap(cap));
}

static struct attribute_group pt_cap_group __ro_after_init = {
	.name	= "caps",
};

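/*
 * The "format" attributes below describe how bits of perf_event_attr::config
 * map onto RTIT_CTL fields; they are exported under
 * /sys/bus/event_source/devices/intel_pt/format/ so that tooling can resolve
 * named terms into config bits. For example (illustrative only), something
 * like:
 *
 *	perf record -e intel_pt/cyc,mtc_period=3/u -- <workload>
 *
 * relies on the "cyc" and "mtc_period" entries defined here.
 */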
PMU_FORMAT_ATTR(pt,		"config:0"	);
PMU_FORMAT_ATTR(cyc,		"config:1"	);
PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
PMU_FORMAT_ATTR(mtc,		"config:9"	);
PMU_FORMAT_ATTR(tsc,		"config:10"	);
PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
PMU_FORMAT_ATTR(ptw,		"config:12"	);
PMU_FORMAT_ATTR(branch,		"config:13"	);
PMU_FORMAT_ATTR(event,		"config:31"	);
PMU_FORMAT_ATTR(notnt,		"config:55"	);
PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);

static struct attribute *pt_formats_attr[] = {
	&format_attr_pt.attr,
	&format_attr_cyc.attr,
	&format_attr_pwr_evt.attr,
	&format_attr_event.attr,
	&format_attr_notnt.attr,
	&format_attr_fup_on_ptw.attr,
	&format_attr_mtc.attr,
	&format_attr_tsc.attr,
	&format_attr_noretcomp.attr,
	&format_attr_ptw.attr,
	&format_attr_branch.attr,
	&format_attr_mtc_period.attr,
	&format_attr_cyc_thresh.attr,
	&format_attr_psb_period.attr,
	NULL,
};

static struct attribute_group pt_format_group = {
	.name	= "format",
	.attrs	= pt_formats_attr,
};

static ssize_t
pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
		    char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);

	switch (pmu_attr->id) {
	case 0:
		return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
	case 1:
		return sprintf(page, "%u:%u\n",
			       pt_pmu.tsc_art_num,
			       pt_pmu.tsc_art_den);
	default:
		break;
	}

	return -EINVAL;
}

PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
	       pt_timing_attr_show);
PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
	       pt_timing_attr_show);

static struct attribute *pt_timing_attr[] = {
	&timing_attr_max_nonturbo_ratio.attr.attr,
	&timing_attr_tsc_art_ratio.attr.attr,
	NULL,
};

static struct attribute_group pt_timing_group = {
	.attrs	= pt_timing_attr,
};

static const struct attribute_group *pt_attr_groups[] = {
	&pt_cap_group,
	&pt_format_group,
	&pt_timing_group,
	NULL,
};

static int __init pt_pmu_hw_init(void)
{
	struct dev_ext_attribute *de_attrs;
	struct attribute **attrs;
	size_t size;
	u64 reg;
	int ret;
	long i;

	rdmsrq(MSR_PLATFORM_INFO, reg);
	pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;

	/*
	 * if available, read in TSC to core crystal clock ratio,
	 * otherwise, zero for numerator stands for "not enumerated"
	 * as per SDM
	 */
	if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
		u32 eax, ebx, ecx, edx;

		cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);

		pt_pmu.tsc_art_num = ebx;
		pt_pmu.tsc_art_den = eax;
	}

	/* model-specific quirks */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_BROADWELL:
	case INTEL_BROADWELL_D:
	case INTEL_BROADWELL_G:
	case INTEL_BROADWELL_X:
		/* not setting BRANCH_EN will #GP, erratum BDM106 */
		pt_pmu.branch_en_always_on = true;
		break;
	default:
		break;
	}

	if (boot_cpu_has(X86_FEATURE_VMX)) {
		/*
		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
		 * post-VMXON.
		 */
		rdmsrq(MSR_IA32_VMX_MISC, reg);
		if (reg & BIT(14))
			pt_pmu.vmx = true;
	}

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		cpuid_count(20, i,
			    &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
			    &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
	}

	ret = -ENOMEM;
	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
	attrs = kzalloc(size, GFP_KERNEL);
	if (!attrs)
		goto fail;

	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
	de_attrs = kzalloc(size, GFP_KERNEL);
	if (!de_attrs)
		goto fail;

	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
		struct dev_ext_attribute *de_attr = de_attrs + i;

		de_attr->attr.attr.name = pt_caps[i].name;

		sysfs_attr_init(&de_attr->attr.attr);

		de_attr->attr.attr.mode	= S_IRUGO;
		de_attr->attr.show	= pt_cap_show;
		de_attr->var		= (void *)i;

		attrs[i] = &de_attr->attr.attr;
	}

	pt_cap_group.attrs = attrs;

	return 0;

fail:
	kfree(attrs);

	return ret;
}

#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \
			  RTIT_CTL_CYC_THRESH	| \
			  RTIT_CTL_PSB_FREQ)

#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \
			 RTIT_CTL_MTC_RANGE)

#define RTIT_CTL_PTW	(RTIT_CTL_PTW_EN	| \
			 RTIT_CTL_FUP_ON_PTW)

/*
 * Bit 0 (TraceEn) in the attr.config is meaningless as the
 * corresponding bit in the RTIT_CTL can only be controlled
 * by the driver; therefore, repurpose it to mean: pass
 * through the bit that was previously assumed to be always
 * on for PT, thereby allowing the user to *not* set it if
 * they so wish. See also pt_event_valid() and pt_config().
 */
#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN

#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN	| \
			RTIT_CTL_TSC_EN		| \
			RTIT_CTL_DISRETC	| \
			RTIT_CTL_BRANCH_EN	| \
			RTIT_CTL_CYC_PSB	| \
			RTIT_CTL_MTC		| \
			RTIT_CTL_PWR_EVT_EN	| \
			RTIT_CTL_EVENT_EN	| \
			RTIT_CTL_NOTNT		| \
			RTIT_CTL_FUP_ON_PTW	| \
			RTIT_CTL_PTW_EN)

static bool pt_event_valid(struct perf_event *event)
{
	u64 config = event->attr.config;
	u64 allowed, requested;

	if ((config & PT_CONFIG_MASK) != config)
		return false;

	if (config & RTIT_CTL_CYC_PSB) {
		if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc))
			return false;

		allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods);
		requested = (config & RTIT_CTL_PSB_FREQ) >>
			RTIT_CTL_PSB_FREQ_OFFSET;
		if (requested && (!(allowed & BIT(requested))))
			return false;

		allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds);
		requested = (config & RTIT_CTL_CYC_THRESH) >>
			RTIT_CTL_CYC_THRESH_OFFSET;
		if (requested && (!(allowed & BIT(requested))))
			return false;
	}

	if (config & RTIT_CTL_MTC) {
		/*
		 * In the unlikely case that CPUID lists valid mtc periods,
		 * but not the mtc capability, drop out here.
		 *
		 * Spec says that setting mtc period bits while mtc bit in
		 * CPUID is 0 will #GP, so better safe than sorry.
		 */
		if (!intel_pt_validate_hw_cap(PT_CAP_mtc))
			return false;

		allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods);
		if (!allowed)
			return false;

		requested = (config & RTIT_CTL_MTC_RANGE) >>
			RTIT_CTL_MTC_RANGE_OFFSET;

		if (!(allowed & BIT(requested)))
			return false;
	}

	if (config & RTIT_CTL_PWR_EVT_EN &&
	    !intel_pt_validate_hw_cap(PT_CAP_power_event_trace))
		return false;

	if (config & RTIT_CTL_EVENT_EN &&
	    !intel_pt_validate_hw_cap(PT_CAP_event_trace))
		return false;

	if (config & RTIT_CTL_NOTNT &&
	    !intel_pt_validate_hw_cap(PT_CAP_tnt_disable))
		return false;

	if (config & RTIT_CTL_PTW) {
		if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite))
			return false;

		/* FUPonPTW without PTW doesn't make sense */
		if ((config & RTIT_CTL_FUP_ON_PTW) &&
		    !(config & RTIT_CTL_PTW_EN))
			return false;
	}

	/*
	 * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
	 * clears the assumption that BranchEn must always be enabled,
	 * as was the case with the first implementation of PT.
	 * If this bit is not set, the legacy behavior is preserved
	 * for compatibility with the older userspace.
	 *
	 * Re-using bit 0 for this purpose is fine because it is never
	 * directly set by the user; previous attempts at setting it in
	 * the attr.config resulted in -EINVAL.
	 */
	if (config & RTIT_CTL_PASSTHROUGH) {
		/*
		 * Disallow not setting BRANCH_EN where BRANCH_EN is
		 * always required.
		 */
		if (pt_pmu.branch_en_always_on &&
		    !(config & RTIT_CTL_BRANCH_EN))
			return false;
	} else {
		/*
		 * Disallow BRANCH_EN without the PASSTHROUGH.
		 */
		if (config & RTIT_CTL_BRANCH_EN)
			return false;
	}

	return true;
}

/*
 * PT configuration helpers
 * These all are cpu affine and operate on a local PT
 */

static void pt_config_start(struct perf_event *event)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	u64 ctl = event->hw.aux_config;

	if (READ_ONCE(event->hw.aux_paused))
		return;

	ctl |= RTIT_CTL_TRACEEN;
	if (READ_ONCE(pt->vmx_on))
		perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
	else
		wrmsrq(MSR_IA32_RTIT_CTL, ctl);

	WRITE_ONCE(event->hw.aux_config, ctl);
}

/* Address ranges and their corresponding msr configuration registers */
static const struct pt_address_range {
	unsigned long	msr_a;
	unsigned long	msr_b;
	unsigned int	reg_off;
} pt_address_ranges[] = {
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR0_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR0_B,
		.reg_off = RTIT_CTL_ADDR0_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR1_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR1_B,
		.reg_off = RTIT_CTL_ADDR1_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR2_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR2_B,
		.reg_off = RTIT_CTL_ADDR2_OFFSET,
	},
	{
		.msr_a	 = MSR_IA32_RTIT_ADDR3_A,
		.msr_b	 = MSR_IA32_RTIT_ADDR3_B,
		.reg_off = RTIT_CTL_ADDR3_OFFSET,
	}
};

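/*
 * Each address range above is controlled by a 4-bit ADDRn_CFG field in
 * RTIT_CTL at the corresponding reg_off: per the SDM, 0 leaves the range
 * unused, 1 uses it as a trace filter and 2 uses it as a TraceStop region.
 * pt_config_filters() below shifts the per-range config value chosen in
 * pt_event_addr_filters_sync() into that field.
 */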
static u64 pt_config_filters(struct perf_event *event)
{
	struct pt_filters *filters = event->hw.addr_filters;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	unsigned int range = 0;
	u64 rtit_ctl = 0;

	if (!filters)
		return 0;

	perf_event_addr_filters_sync(event);

	for (range = 0; range < filters->nr_filters; range++) {
		struct pt_filter *filter = &filters->filter[range];

		/*
		 * Note, if the range has zero start/end addresses due
		 * to its dynamic object not being loaded yet, we just
		 * go ahead and program zeroed range, which will simply
		 * produce no data. Note^2: if executable code at 0x0
		 * is a concern, we can set up an "invalid" configuration
		 * such as msr_b < msr_a.
		 */

		/* avoid redundant msr writes */
		if (pt->filters.filter[range].msr_a != filter->msr_a) {
			wrmsrq(pt_address_ranges[range].msr_a, filter->msr_a);
			pt->filters.filter[range].msr_a = filter->msr_a;
		}

		if (pt->filters.filter[range].msr_b != filter->msr_b) {
			wrmsrq(pt_address_ranges[range].msr_b, filter->msr_b);
			pt->filters.filter[range].msr_b = filter->msr_b;
		}

		rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
	}

	return rtit_ctl;
}

static void pt_config(struct perf_event *event)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	u64 reg;

	/* First round: clear STATUS, in particular the PSB byte counter. */
	if (!event->hw.aux_config) {
		perf_event_itrace_started(event);
		wrmsrq(MSR_IA32_RTIT_STATUS, 0);
	}

	reg = pt_config_filters(event);
	reg |= RTIT_CTL_TRACEEN;
	if (!buf->single)
		reg |= RTIT_CTL_TOPA;

	/*
	 * Previously, we had BRANCH_EN on by default, but now that PT has
	 * grown features outside of branch tracing, it is useful to allow
	 * the user to disable it. Setting bit 0 in the event's attr.config
	 * allows BRANCH_EN to pass through instead of being always on. See
	 * also the comment in pt_event_valid().
	 */
	if (event->attr.config & BIT(0)) {
		reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
	} else {
		reg |= RTIT_CTL_BRANCH_EN;
	}

	if (!event->attr.exclude_kernel)
		reg |= RTIT_CTL_OS;
	if (!event->attr.exclude_user)
		reg |= RTIT_CTL_USR;

	reg |= (event->attr.config & PT_CONFIG_MASK);

	event->hw.aux_config = reg;

	/*
	 * Allow resume before starting so as not to overwrite a value set by a
	 * PMI.
	 */
	barrier();
	WRITE_ONCE(pt->resume_allowed, 1);
	/* Configuration is complete, it is now OK to handle an NMI */
	barrier();
	WRITE_ONCE(pt->handle_nmi, 1);
	barrier();
	pt_config_start(event);
	barrier();
	/*
	 * Allow pause after starting so its pt_config_stop() doesn't race with
	 * pt_config_start().
	 */
	WRITE_ONCE(pt->pause_allowed, 1);
}

static void pt_config_stop(struct perf_event *event)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	u64 ctl = READ_ONCE(event->hw.aux_config);

	/* may be already stopped by a PMI */
	if (!(ctl & RTIT_CTL_TRACEEN))
		return;

	ctl &= ~RTIT_CTL_TRACEEN;
	if (!READ_ONCE(pt->vmx_on))
		wrmsrq(MSR_IA32_RTIT_CTL, ctl);

	WRITE_ONCE(event->hw.aux_config, ctl);

	/*
	 * A wrmsr that disables trace generation serializes other PT
	 * registers and causes all data packets to be written to memory,
	 * but a fence is required for the data to become globally visible.
	 *
	 * The below WMB, separating data store and aux_head store matches
	 * the consumer's RMB that separates aux_head load and data load.
	 */
	wmb();
}

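/*
 * ToPA buffer layout, in brief: for multi-entry ToPA the AUX buffer is
 * described by a list of page-sized ToPA tables. Each table entry points to
 * one output region built from the AUX pages; the last entry of a table is an
 * END link to the next table, and the final table links back to the first so
 * that the hardware walks the whole thing as a ring. Single-entry ToPA
 * hardware instead gets one real entry per table plus a self-referencing END
 * link (see topa_alloc()). The structures below hold the per-table metadata.
 */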

/**
 * struct topa - ToPA metadata
 * @list:	linkage to struct pt_buffer's list of tables
 * @offset:	offset of the first entry in this table in the buffer
 * @size:	total size of all entries in this table
 * @last:	index of the last initialized entry in this table
 * @z_count:	how many times the first entry repeats
 */
struct topa {
	struct list_head	list;
	u64			offset;
	size_t			size;
	int			last;
	unsigned int		z_count;
};

/*
 * Keep ToPA table-related metadata on the same page as the actual table,
 * taking up a few words from the top
 */

#define TENTS_PER_PAGE	\
	((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry))

/**
 * struct topa_page - page-sized ToPA table with metadata at the top
 * @table:	actual ToPA table entries, as understood by PT hardware
 * @topa:	metadata
 */
struct topa_page {
	struct topa_entry	table[TENTS_PER_PAGE];
	struct topa		topa;
};

static inline struct topa_page *topa_to_page(struct topa *topa)
{
	return container_of(topa, struct topa_page, topa);
}

static inline struct topa_page *topa_entry_to_page(struct topa_entry *te)
{
	return (struct topa_page *)((unsigned long)te & PAGE_MASK);
}

static inline phys_addr_t topa_pfn(struct topa *topa)
{
	return PFN_DOWN(virt_to_phys(topa_to_page(topa)));
}

/* make -1 stand for the last table entry */
#define TOPA_ENTRY(t, i)				\
	((i) == -1					\
		? &topa_to_page(t)->table[(t)->last]	\
		: &topa_to_page(t)->table[(i)])
#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)

static void pt_config_buffer(struct pt_buffer *buf)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	u64 reg, mask;
	void *base;

	if (buf->single) {
		base = buf->data_pages[0];
		mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
	} else {
		base = topa_to_page(buf->cur)->table;
		mask = (u64)buf->cur_idx;
	}

	reg = virt_to_phys(base);
	if (pt->output_base != reg) {
		pt->output_base = reg;
		wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, reg);
	}

	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
	if (pt->output_mask != reg) {
		pt->output_mask = reg;
		wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, reg);
	}
}

/**
 * topa_alloc() - allocate page-sized ToPA table
 * @cpu:	CPU on which to allocate.
 * @gfp:	Allocation flags.
 *
 * Return:	On success, return the pointer to ToPA table page.
 */
static struct topa *topa_alloc(int cpu, gfp_t gfp)
{
	int node = cpu_to_node(cpu);
	struct topa_page *tp;
	struct page *p;

	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
	if (!p)
		return NULL;

	tp = page_address(p);
	tp->topa.last = 0;

	/*
	 * In case of single-entry ToPA, always put the self-referencing END
	 * link as the 2nd entry in the table
	 */
	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
		TOPA_ENTRY(&tp->topa, 1)->end = 1;
	}

	return &tp->topa;
}

/**
 * topa_free() - free a page-sized ToPA table
 * @topa:	Table to deallocate.
 */
static void topa_free(struct topa *topa)
{
	free_page((unsigned long)topa);
}

/**
 * topa_insert_table() - insert a ToPA table into a buffer
 * @buf:	PT buffer that's being extended.
 * @topa:	New topa table to be inserted.
 *
 * If it's the first table in this buffer, set up buffer's pointers
 * accordingly; otherwise, add an END=1 link entry pointing to @topa in the
 * current "last" table and adjust the last table pointer to @topa.
 */
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
{
	struct topa *last = buf->last;

	list_add_tail(&topa->list, &buf->tables);

	if (!buf->first) {
		buf->first = buf->last = buf->cur = topa;
		return;
	}

	topa->offset = last->offset + last->size;
	buf->last = topa;

	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
		return;

	BUG_ON(last->last != TENTS_PER_PAGE - 1);

	TOPA_ENTRY(last, -1)->base = topa_pfn(topa);
	TOPA_ENTRY(last, -1)->end = 1;
}

/**
 * topa_table_full() - check if a ToPA table is filled up
 * @topa:	ToPA table.
 */
static bool topa_table_full(struct topa *topa)
{
	/* single-entry ToPA is a special case */
	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
		return !!topa->last;

	return topa->last == TENTS_PER_PAGE - 1;
}

/**
 * topa_insert_pages() - create a list of ToPA tables
 * @buf:	PT buffer being initialized.
 * @cpu:	CPU on which to allocate.
 * @gfp:	Allocation flags.
 *
 * This initializes a list of ToPA tables with entries from
 * the data_pages provided by rb_alloc_aux().
 *
 * Return: 0 on success or error code.
 */
static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp)
{
	struct topa *topa = buf->last;
	int order = 0;
	struct page *p;

	p = virt_to_page(buf->data_pages[buf->nr_pages]);
	if (PagePrivate(p))
		order = page_private(p);

	if (topa_table_full(topa)) {
		topa = topa_alloc(cpu, gfp);
		if (!topa)
			return -ENOMEM;

		topa_insert_table(buf, topa);
	}

	if (topa->z_count == topa->last - 1) {
		if (order == TOPA_ENTRY(topa, topa->last - 1)->size)
			topa->z_count++;
	}

	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
	TOPA_ENTRY(topa, -1)->size = order;
	if (!buf->snapshot &&
	    !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(topa, -1)->intr = 1;
		TOPA_ENTRY(topa, -1)->stop = 1;
	}

	topa->last++;
	topa->size += sizes(order);

	buf->nr_pages += 1ul << order;

	return 0;
}

/**
 * pt_topa_dump() - print ToPA tables and their entries
 * @buf:	PT buffer.
 */
static void pt_topa_dump(struct pt_buffer *buf)
{
	struct topa *topa;

	list_for_each_entry(topa, &buf->tables, list) {
		struct topa_page *tp = topa_to_page(topa);
		int i;

		pr_debug("# table @%p, off %llx size %zx\n", tp->table,
			 topa->offset, topa->size);
		for (i = 0; i < TENTS_PER_PAGE; i++) {
			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
				 &tp->table[i],
				 (unsigned long)tp->table[i].base << TOPA_SHIFT,
				 sizes(tp->table[i].size),
				 tp->table[i].end ? 'E' : ' ',
				 tp->table[i].intr ? 'I' : ' ',
				 tp->table[i].stop ? 'S' : ' ',
				 *(u64 *)&tp->table[i]);
			if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
			     tp->table[i].stop) ||
			    tp->table[i].end)
				break;
			if (!i && topa->z_count)
				i += topa->z_count;
		}
	}
}

/**
 * pt_buffer_advance() - advance to the next output region
 * @buf:	PT buffer.
 *
 * Advance the current pointers in the buffer to the next ToPA entry.
 */
static void pt_buffer_advance(struct pt_buffer *buf)
{
	buf->output_off = 0;
	buf->cur_idx++;

	if (buf->cur_idx == buf->cur->last) {
		if (buf->cur == buf->last) {
			buf->cur = buf->first;
			buf->wrapped = true;
		} else {
			buf->cur = list_entry(buf->cur->list.next, struct topa,
					      list);
		}
		buf->cur_idx = 0;
	}
}

/**
 * pt_update_head() - calculate current offsets and sizes
 * @pt:		Per-cpu pt context.
 *
 * Update buffer's current write pointer position and data size.
 */
static void pt_update_head(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	bool wrapped = buf->wrapped;
	u64 topa_idx, base, old;

	buf->wrapped = false;

	if (buf->single) {
		local_set(&buf->data_size, buf->output_off);
		return;
	}

	/* offset of the first region in this table from the beginning of buf */
	base = buf->cur->offset + buf->output_off;

	/* offset of the current output region within this table */
	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
		base += TOPA_ENTRY_SIZE(buf->cur, topa_idx);

	if (buf->snapshot) {
		local_set(&buf->data_size, base);
	} else {
		old = (local64_xchg(&buf->head, base) &
		       ((buf->nr_pages << PAGE_SHIFT) - 1));
		if (base < old || (base == old && wrapped))
			base += buf->nr_pages << PAGE_SHIFT;

		local_add(base - old, &buf->data_size);
	}
}

/**
 * pt_buffer_region() - obtain current output region's address
 * @buf:	PT buffer.
 */
static void *pt_buffer_region(struct pt_buffer *buf)
{
	return phys_to_virt((phys_addr_t)TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
}

/**
 * pt_buffer_region_size() - obtain current output region's size
 * @buf:	PT buffer.
 */
static size_t pt_buffer_region_size(struct pt_buffer *buf)
{
	return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx);
}

/**
 * pt_handle_status() - take care of possible status conditions
 * @pt:		Per-cpu pt context.
 */
static void pt_handle_status(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	int advance = 0;
	u64 status;

	rdmsrq(MSR_IA32_RTIT_STATUS, status);

	if (status & RTIT_STATUS_ERROR) {
		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
		pt_topa_dump(buf);
		status &= ~RTIT_STATUS_ERROR;
	}

	if (status & RTIT_STATUS_STOPPED) {
		status &= ~RTIT_STATUS_STOPPED;

		/*
		 * On systems that only do single-entry ToPA, hitting STOP
		 * means we are already losing data; need to let the decoder
		 * know.
		 */
		if (!buf->single &&
		    (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) ||
		     buf->output_off == pt_buffer_region_size(buf))) {
			perf_aux_output_flag(&pt->handle,
					     PERF_AUX_FLAG_TRUNCATED);
			advance++;
		}
	}

	/*
	 * Also, on single-entry ToPA implementations, the interrupt will come
	 * before the output reaches its output region's boundary.
	 */
	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) &&
	    !buf->snapshot &&
	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
		void *head = pt_buffer_region(buf);

		/* everything within this margin needs to be zeroed out */
		memset(head + buf->output_off, 0,
		       pt_buffer_region_size(buf) -
		       buf->output_off);
		advance++;
	}

	if (advance)
		pt_buffer_advance(buf);

	wrmsrq(MSR_IA32_RTIT_STATUS, status);
}

/**
 * pt_read_offset() - translate registers into buffer pointers
 * @buf:	PT buffer.
 *
 * Set buffer's output pointers from MSR values.
 */
static void pt_read_offset(struct pt_buffer *buf)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct topa_page *tp;

	if (!buf->single) {
		rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
		tp = phys_to_virt(pt->output_base);
		buf->cur = &tp->topa;
	}

	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
	/* offset within current output region */
	buf->output_off = pt->output_mask >> 32;
	/* index of current output region within this table */
	if (!buf->single)
		buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
}

static struct topa_entry *
pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
{
	struct topa_page *tp;
	struct topa *topa;
	unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0;

	/*
	 * Indicates a bug in the caller.
	 */
	if (WARN_ON_ONCE(pg >= buf->nr_pages))
		return NULL;

	/*
	 * First, find the ToPA table where @pg fits. With high
	 * order allocations, there shouldn't be many of these.
	 */
	list_for_each_entry(topa, &buf->tables, list) {
		if (topa->offset + topa->size > (unsigned long)pg << PAGE_SHIFT)
			goto found;
	}

	/*
	 * Hitting this means we have a problem in the ToPA
	 * allocation code.
	 */
	WARN_ON_ONCE(1);

	return NULL;

found:
	/*
	 * Indicates a problem in the ToPA allocation code.
	 */
	if (WARN_ON_ONCE(topa->last == -1))
		return NULL;

	tp = topa_to_page(topa);
	cur_pg = PFN_DOWN(topa->offset);
	if (topa->z_count) {
		z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1);
		start_idx = topa->z_count + 1;
	}

	/*
	 * Multiple entries at the beginning of the table have the same size,
	 * ideally all of them; if @pg falls there, the search is done.
	 */
	if (pg >= cur_pg && pg < cur_pg + z_pg) {
		idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0);
		return &tp->table[idx];
	}

	/*
	 * Otherwise, slow path: iterate through the remaining entries.
	 */
	for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) {
		if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg)
			return &tp->table[idx];

		cur_pg += TOPA_ENTRY_PAGES(topa, idx);
	}

	/*
	 * Means we couldn't find a matching ToPA entry in the table.
	 */
	WARN_ON_ONCE(1);

	return NULL;
}

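/*
 * A worked example for pt_topa_entry_for_page() above (hypothetical numbers):
 * if a 64-page AUX buffer happens to be backed by sixteen order-2 chunks, the
 * table's sixteen entries all have the same size, z_count ends up at 15, and
 * a page lookup reduces to one division in the "fast path" branch; the slow
 * path loop only runs for tables whose entries differ in size.
 */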

static struct topa_entry *
pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te)
{
	unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1);
	struct topa_page *tp;
	struct topa *topa;

	tp = (struct topa_page *)table;
	if (tp->table != te)
		return --te;

	topa = &tp->topa;
	if (topa == buf->first)
		topa = buf->last;
	else
		topa = list_prev_entry(topa, list);

	tp = topa_to_page(topa);

	return &tp->table[topa->last - 1];
}

/**
 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
 * @buf:	PT buffer.
 * @handle:	Current output handle.
 *
 * Place INT and STOP marks to prevent overwriting old data that the consumer
 * hasn't yet collected and waking up the consumer after a certain fraction of
 * the buffer has filled up. Only needed and sensible for non-snapshot counters.
 *
 * This obviously relies on buf::head to figure out buffer markers, so it has
 * to be called after pt_buffer_reset_offsets() and before the hardware tracing
 * is enabled.
 */
static int pt_buffer_reset_markers(struct pt_buffer *buf,
				   struct perf_output_handle *handle)
{
	unsigned long head = local64_read(&buf->head);
	unsigned long idx, npages, wakeup;

	if (buf->single)
		return 0;

	/* can't stop in the middle of an output region */
	if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
		return -EINVAL;
	}

	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
		return 0;

	/* clear STOP and INT from current entry */
	if (buf->stop_te) {
		buf->stop_te->stop = 0;
		buf->stop_te->intr = 0;
	}

	if (buf->intr_te)
		buf->intr_te->intr = 0;

	/* how many pages till the STOP marker */
	npages = handle->size >> PAGE_SHIFT;

	/* if it's on a page boundary, fill up one more page */
	if (!offset_in_page(head + handle->size + 1))
		npages++;

	idx = (head >> PAGE_SHIFT) + npages;
	idx &= buf->nr_pages - 1;

	if (idx != buf->stop_pos) {
		buf->stop_pos = idx;
		buf->stop_te = pt_topa_entry_for_page(buf, idx);
		buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te);
	}

	wakeup = handle->wakeup >> PAGE_SHIFT;

	/* in the worst case, wake up the consumer one page before hard stop */
	idx = (head >> PAGE_SHIFT) + npages - 1;
	if (idx > wakeup)
		idx = wakeup;

	idx &= buf->nr_pages - 1;
	if (idx != buf->intr_pos) {
		buf->intr_pos = idx;
		buf->intr_te = pt_topa_entry_for_page(buf, idx);
		buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te);
	}

	buf->stop_te->stop = 1;
	buf->stop_te->intr = 1;
	buf->intr_te->intr = 1;

	return 0;
}

/**
 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
 * @buf:	PT buffer.
 * @head:	Write pointer (aux_head) from AUX buffer.
 *
 * Find the ToPA table and entry corresponding to given @head and set buffer's
 * "current" pointers accordingly. This is done after we have obtained the
 * current aux_head position from a successful call to perf_aux_output_begin()
 * to make sure the hardware is writing to the right place.
 *
 * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
 * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
 * which are used to determine INT and STOP markers' locations by a subsequent
 * call to pt_buffer_reset_markers().
 */
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
	struct topa_page *cur_tp;
	struct topa_entry *te;
	int pg;

	if (buf->snapshot)
		head &= (buf->nr_pages << PAGE_SHIFT) - 1;

	if (!buf->single) {
		pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
		te = pt_topa_entry_for_page(buf, pg);

		cur_tp = topa_entry_to_page(te);
		buf->cur = &cur_tp->topa;
		buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
		buf->output_off = head & (pt_buffer_region_size(buf) - 1);
	} else {
		buf->output_off = head;
	}

	local64_set(&buf->head, head);
	local_set(&buf->data_size, 0);
}

/**
 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
 * @buf:	PT buffer.
 */
static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
	struct topa *topa, *iter;

	if (buf->single)
		return;

	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
		/*
		 * right now, this is in free_aux() path only, so
		 * no need to unlink this table from the list
		 */
		topa_free(topa);
	}
}

/**
 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
 * @buf:	PT buffer.
 * @cpu:	CPU on which to allocate.
 * @nr_pages:	No. of pages to allocate.
 * @gfp:	Allocation flags.
 *
 * Return: 0 on success or error code.
 */
static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
			       unsigned long nr_pages, gfp_t gfp)
{
	struct topa *topa;
	int err;

	topa = topa_alloc(cpu, gfp);
	if (!topa)
		return -ENOMEM;

	topa_insert_table(buf, topa);

	while (buf->nr_pages < nr_pages) {
		err = topa_insert_pages(buf, cpu, gfp);
		if (err) {
			pt_buffer_fini_topa(buf);
			return -ENOMEM;
		}
	}

	/* link last table to the first one, unless we're double buffering */
	if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first);
		TOPA_ENTRY(buf->last, -1)->end = 1;
	}

	pt_topa_dump(buf);
	return 0;
}

static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
{
	struct page *p = virt_to_page(buf->data_pages[0]);
	int ret = -ENOTSUPP, order = 0;

	/*
	 * We can use single range output mode
	 * + in snapshot mode, where we don't need interrupts;
	 * + if the hardware supports it;
	 * + if the entire buffer is one contiguous allocation.
	 */
	if (!buf->snapshot)
		goto out;

	if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
		goto out;

	if (PagePrivate(p))
		order = page_private(p);

	if (1 << order != nr_pages)
		goto out;

	/*
	 * Some processors cannot always support single range for more than
	 * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might
	 * also be affected, so for now rather than trying to keep track of
	 * which ones, just disable it for all.
	 */
	if (nr_pages > 1)
		goto out;

	buf->single = true;
	buf->nr_pages = nr_pages;
	ret = 0;
out:
	return ret;
}

/**
 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 * @event:	Performance event
 * @pages:	Array of pointers to buffer pages passed from perf core.
 * @nr_pages:	Number of pages in the buffer.
 * @snapshot:	If this is a snapshot/overwrite counter.
 *
 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
 * bookkeeping for an AUX buffer.
 *
 * Return:	Our private PT buffer structure.
 */
static void *
pt_buffer_setup_aux(struct perf_event *event, void **pages,
		    int nr_pages, bool snapshot)
{
	struct pt_buffer *buf;
	int node, ret, cpu = event->cpu;

	if (!nr_pages)
		return NULL;

	/*
	 * Only support AUX sampling in snapshot mode, where we don't
	 * generate NMIs.
	 */
	if (event->attr.aux_sample_size && !snapshot)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();
	node = cpu_to_node(cpu);

	buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->snapshot = snapshot;
	buf->data_pages = pages;
	buf->stop_pos = -1;
	buf->intr_pos = -1;

	INIT_LIST_HEAD(&buf->tables);

	ret = pt_buffer_try_single(buf, nr_pages);
	if (!ret)
		return buf;

	ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
	if (ret) {
		kfree(buf);
		return NULL;
	}

	return buf;
}

/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:	PT buffer.
 */
static void pt_buffer_free_aux(void *data)
{
	struct pt_buffer *buf = data;

	pt_buffer_fini_topa(buf);
	kfree(buf);
}

static int pt_addr_filters_init(struct perf_event *event)
{
	struct pt_filters *filters;
	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);

	if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
		return 0;

	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
	if (!filters)
		return -ENOMEM;

	if (event->parent)
		memcpy(filters, event->parent->hw.addr_filters,
		       sizeof(*filters));

	event->hw.addr_filters = filters;

	return 0;
}

static void pt_addr_filters_fini(struct perf_event *event)
{
	kfree(event->hw.addr_filters);
	event->hw.addr_filters = NULL;
}

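/*
 * Worked example for the clamping helpers below, assuming 48 bits of virtual
 * address: a non-canonical range end such as 0x0000800000000000 is clamped
 * down to 0x00007fffffffffff, and a non-canonical range start is clamped up
 * to 0xffff800000000000. Since nothing can execute at non-canonical
 * addresses, the clamping does not change what the resulting filter matches.
 */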
#ifdef CONFIG_X86_64
/* Clamp to a canonical address greater-than-or-equal-to the address given */
static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits)
{
	return __is_canonical_address(vaddr, vaddr_bits) ?
	       vaddr :
	       -BIT_ULL(vaddr_bits - 1);
}

/* Clamp to a canonical address less-than-or-equal-to the address given */
static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits)
{
	return __is_canonical_address(vaddr, vaddr_bits) ?
	       vaddr :
	       BIT_ULL(vaddr_bits - 1) - 1;
}
#else
#define clamp_to_ge_canonical_addr(x, y) (x)
#define clamp_to_le_canonical_addr(x, y) (x)
#endif

static int pt_event_addr_filters_validate(struct list_head *filters)
{
	struct perf_addr_filter *filter;
	int range = 0;

	list_for_each_entry(filter, filters, entry) {
		/*
		 * PT doesn't support single address triggers and
		 * 'start' filters.
		 */
		if (!filter->size ||
		    filter->action == PERF_ADDR_FILTER_ACTION_START)
			return -EOPNOTSUPP;

		if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
			return -EOPNOTSUPP;
	}

	return 0;
}

static void pt_event_addr_filters_sync(struct perf_event *event)
{
	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
	unsigned long msr_a, msr_b;
	struct perf_addr_filter_range *fr = event->addr_filter_ranges;
	struct pt_filters *filters = event->hw.addr_filters;
	struct perf_addr_filter *filter;
	int range = 0;

	if (!filters)
		return;

	list_for_each_entry(filter, &head->list, entry) {
		if (filter->path.dentry && !fr[range].start) {
			msr_a = msr_b = 0;
		} else {
			unsigned long n = fr[range].size - 1;
			unsigned long a = fr[range].start;
			unsigned long b;

			if (a > ULONG_MAX - n)
				b = ULONG_MAX;
			else
				b = a + n;
			/*
			 * Apply the offset. 64-bit addresses written to the
			 * MSRs must be canonical, but the range can encompass
			 * non-canonical addresses. Since software cannot
			 * execute at non-canonical addresses, adjusting to
			 * canonical addresses does not affect the result of the
			 * address filter.
			 */
			msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits);
			msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits);
			if (msr_b < msr_a)
				msr_a = msr_b = 0;
		}

		filters->filter[range].msr_a  = msr_a;
		filters->filter[range].msr_b  = msr_b;
		if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
			filters->filter[range].config = 1;
		else
			filters->filter[range].config = 2;
		range++;
	}

	filters->nr_filters = range;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 */
void intel_pt_interrupt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;
	struct perf_event *event = pt->handle.event;

	/*
	 * There may be a dangling PT bit in the interrupt status register
	 * after PT has been disabled by pt_event_stop(). Make sure we don't
	 * do anything (particularly, re-enable) for this event here.
	 */
	if (!READ_ONCE(pt->handle_nmi))
		return;

	if (!event)
		return;

	pt_config_stop(event);

	buf = perf_get_aux(&pt->handle);
	if (!buf)
		return;

	pt_read_offset(buf);

	pt_handle_status(pt);

	pt_update_head(pt);

	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));

	if (!event->hw.state) {
		int ret;

		buf = perf_aux_output_begin(&pt->handle, event);
		if (!buf) {
			event->hw.state = PERF_HES_STOPPED;
			WRITE_ONCE(pt->resume_allowed, 0);
			return;
		}

		pt_buffer_reset_offsets(buf, pt->handle.head);
		/* snapshot counters don't use PMI, so it's safe */
		ret = pt_buffer_reset_markers(buf, &pt->handle);
		if (ret) {
			perf_aux_output_end(&pt->handle, 0);
			WRITE_ONCE(pt->resume_allowed, 0);
			return;
		}

		pt_config_buffer(buf);
		pt_config_start(event);
	}
}

void intel_pt_handle_vmx(int on)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct perf_event *event;
	unsigned long flags;

	/* PT plays nice with VMX, do nothing */
	if (pt_pmu.vmx)
		return;

	/*
	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
	 * sure to not try to set it while VMX is on. Disable
	 * interrupts to avoid racing with pmu callbacks;
	 * concurrent PMI should be handled fine.
	 */
	local_irq_save(flags);
	WRITE_ONCE(pt->vmx_on, on);

	/*
	 * If an AUX transaction is in progress, it will contain
	 * gap(s), so flag it PARTIAL to inform the user.
	 */
	event = pt->handle.event;
	if (event)
		perf_aux_output_flag(&pt->handle,
				     PERF_AUX_FLAG_PARTIAL);

	/* Turn PTs back on */
	if (!on && event)
		wrmsrq(MSR_IA32_RTIT_CTL, event->hw.aux_config);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);

/*
 * PMU callbacks
 */

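/*
 * Note on pause/resume: pt_event_start() and pt_event_stop() below also
 * service the lightweight PERF_EF_RESUME/PERF_EF_PAUSE requests used for
 * aux_pause; the pause_allowed/resume_allowed flags gate those paths so that
 * a pause or resume cannot race with full (re)configuration or with the PMI
 * handler.
 */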
static void pt_event_start(struct perf_event *event, int mode)
{
	struct hw_perf_event *hwc = &event->hw;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;

	if (mode & PERF_EF_RESUME) {
		if (READ_ONCE(pt->resume_allowed)) {
			u64 status;

			/*
			 * Only if the trace is not active and the error and
			 * stopped bits are clear, is it safe to start, but a
			 * PMI might have just cleared these, so resume_allowed
			 * must be checked again also.
			 */
			rdmsrq(MSR_IA32_RTIT_STATUS, status);
			if (!(status & (RTIT_STATUS_TRIGGEREN |
					RTIT_STATUS_ERROR |
					RTIT_STATUS_STOPPED)) &&
			    READ_ONCE(pt->resume_allowed))
				pt_config_start(event);
		}
		return;
	}

	buf = perf_aux_output_begin(&pt->handle, event);
	if (!buf)
		goto fail_stop;

	pt_buffer_reset_offsets(buf, pt->handle.head);
	if (!buf->snapshot) {
		if (pt_buffer_reset_markers(buf, &pt->handle))
			goto fail_end_stop;
	}

	hwc->state = 0;

	pt_config_buffer(buf);
	pt_config(event);

	return;

fail_end_stop:
	perf_aux_output_end(&pt->handle, 0);
fail_stop:
	hwc->state = PERF_HES_STOPPED;
}

static void pt_event_stop(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	if (mode & PERF_EF_PAUSE) {
		if (READ_ONCE(pt->pause_allowed))
			pt_config_stop(event);
		return;
	}

	/*
	 * Protect against the PMI racing with disabling wrmsr,
	 * see comment in intel_pt_interrupt().
	 */
	WRITE_ONCE(pt->handle_nmi, 0);
	barrier();

	/*
	 * Prevent a resume from attempting to restart tracing, or a pause
	 * during a subsequent start. Do this after clearing handle_nmi so that
	 * pt_event_snapshot_aux() will not re-allow them.
	 */
	WRITE_ONCE(pt->pause_allowed, 0);
	WRITE_ONCE(pt->resume_allowed, 0);
	barrier();

	pt_config_stop(event);

	if (event->hw.state == PERF_HES_STOPPED)
		return;

	event->hw.state = PERF_HES_STOPPED;

	if (mode & PERF_EF_UPDATE) {
		struct pt_buffer *buf = perf_get_aux(&pt->handle);

		if (!buf)
			return;

		if (WARN_ON_ONCE(pt->handle.event != event))
			return;

		pt_read_offset(buf);

		pt_handle_status(pt);

		pt_update_head(pt);

		if (buf->snapshot)
			pt->handle.head =
				local_xchg(&buf->data_size,
					   buf->nr_pages << PAGE_SHIFT);
		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
	}
}

static long pt_event_snapshot_aux(struct perf_event *event,
				  struct perf_output_handle *handle,
				  unsigned long size)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	unsigned long from = 0, to;
	long ret;

	if (WARN_ON_ONCE(!buf))
		return 0;

	/*
	 * Sampling is only allowed on snapshot events;
	 * see pt_buffer_setup_aux().
	 */
	if (WARN_ON_ONCE(!buf->snapshot))
		return 0;

	/* Prevent pause/resume from attempting to start/stop tracing */
	WRITE_ONCE(pt->pause_allowed, 0);
	WRITE_ONCE(pt->resume_allowed, 0);
	barrier();
	/*
	 * There is no PT interrupt in this mode, so stop the trace and it will
	 * remain stopped while the buffer is copied.
	 */
	pt_config_stop(event);
	pt_read_offset(buf);
	pt_update_head(pt);

	to = local_read(&buf->data_size);
	if (to < size)
		from = buf->nr_pages << PAGE_SHIFT;
	from += to - size;

	ret = perf_output_copy_aux(&pt->handle, handle, from, to);

	/*
	 * Here, handle_nmi tells us if the tracing was on.
	 * If the tracing was on, restart it.
	 */
	if (READ_ONCE(pt->handle_nmi)) {
		WRITE_ONCE(pt->resume_allowed, 1);
		barrier();
		pt_config_start(event);
		barrier();
		WRITE_ONCE(pt->pause_allowed, 1);
	}

	return ret;
}

static void pt_event_del(struct perf_event *event, int mode)
{
	pt_event_stop(event, PERF_EF_UPDATE);
}

static int pt_event_add(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct hw_perf_event *hwc = &event->hw;
	int ret = -EBUSY;

	if (pt->handle.event)
		goto fail;

	if (mode & PERF_EF_START) {
		pt_event_start(event, 0);
		ret = -EINVAL;
		if (hwc->state == PERF_HES_STOPPED)
			goto fail;
	} else {
		hwc->state = PERF_HES_STOPPED;
	}

	ret = 0;
fail:

	return ret;
}

static void pt_event_read(struct perf_event *event)
{
}

static void pt_event_destroy(struct perf_event *event)
{
	pt_addr_filters_fini(event);
	x86_del_exclusive(x86_lbr_exclusive_pt);
}

static int pt_event_init(struct perf_event *event)
{
	if (event->attr.type != pt_pmu.pmu.type)
		return -ENOENT;

	if (!pt_event_valid(event))
		return -EINVAL;

	if (x86_add_exclusive(x86_lbr_exclusive_pt))
		return -EBUSY;

	if (pt_addr_filters_init(event)) {
		x86_del_exclusive(x86_lbr_exclusive_pt);
		return -ENOMEM;
	}

	event->destroy = pt_event_destroy;

	return 0;
}

void cpu_emergency_stop_pt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	if (pt->handle.event)
		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
}

int is_intel_pt_event(struct perf_event *event)
{
	return event->pmu == &pt_pmu.pmu;
}

static __init int pt_init(void)
{
	int ret, cpu, prior_warn = 0;

	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);

	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
		return -ENODEV;

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		u64 ctl;

		ret = rdmsrq_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
		if (!ret && (ctl & RTIT_CTL_TRACEEN))
			prior_warn++;
	}
	cpus_read_unlock();

	if (prior_warn) {
		x86_add_exclusive(x86_lbr_exclusive_pt);
		pr_warn("PT is enabled at boot time, doing nothing\n");

		return -EBUSY;
	}

	ret = pt_pmu_hw_init();
	if (ret)
		return ret;

	if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) {
		pr_warn("ToPA output is not supported on this CPU\n");
		return -ENODEV;
	}

	if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries))
		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG;
	else
		pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE;

	pt_pmu.pmu.capabilities		|= PERF_PMU_CAP_EXCLUSIVE |
					   PERF_PMU_CAP_ITRACE |
					   PERF_PMU_CAP_AUX_PAUSE;
	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
	pt_pmu.pmu.event_init		 = pt_event_init;
	pt_pmu.pmu.add			 = pt_event_add;
	pt_pmu.pmu.del			 = pt_event_del;
	pt_pmu.pmu.start		 = pt_event_start;
	pt_pmu.pmu.stop			 = pt_event_stop;
	pt_pmu.pmu.snapshot_aux		 = pt_event_snapshot_aux;
	pt_pmu.pmu.read			 = pt_event_read;
	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
	pt_pmu.pmu.addr_filters_sync	 = pt_event_addr_filters_sync;
	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
	pt_pmu.pmu.nr_addr_filters	 =
		intel_pt_validate_hw_cap(PT_CAP_num_address_ranges);

	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

	return ret;
}
arch_initcall(pt_init);