/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/static_call.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
#include <asm/uprobes.h>
#include <asm/ibt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;
static struct pmu pmu;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
	.pmu = &pmu,
};

DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);

/*
 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
 * from just a typename, as opposed to an actual function.
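 *
 * Each call is later pointed at the vendor implementation by
 * x86_pmu_static_call_update() once the active struct x86_pmu is known,
 * which avoids the overhead of an indirect call on these hot paths.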
 */
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq,  *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all,  *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable,      *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable,     *x86_pmu.disable);

DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);

DEFINE_STATIC_CALL_NULL(x86_pmu_add,  *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del,  *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);

DEFINE_STATIC_CALL_NULL(x86_pmu_set_period,   *x86_pmu.set_period);
DEFINE_STATIC_CALL_NULL(x86_pmu_update,       *x86_pmu.update);
DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);

DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events,       *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);

DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling,  *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling,   *x86_pmu.stop_scheduling);

DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);

DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs,   *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);

DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);

DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);

/*
 * This one is magic, it will get called even when PMU init fails (because
 * there is no PMU), in which case it should simply return NULL.
 */
DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	u64 delta;

	if (unlikely(!hwc->event_base))
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
	prev_raw_count = local64_read(&hwc->prev_count);
	do {
		rdpmcl(hwc->event_base_rdpmc, new_raw_count);
	} while (!local64_try_cmpxchg(&hwc->prev_count,
				      &prev_raw_count, new_raw_count));

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
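	 *
	 * Shifting both raw values left by (64 - cntval_bits) drops any
	 * bits above the counter width, so the subtraction below wraps
	 * at the hardware counter width; shifting the difference back
	 * down yields the number of events since the last update.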
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!extra_regs)
		return 0;

	for (er = extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static inline u64 get_possible_counter_mask(void)
{
	u64 cntr_mask = x86_pmu.cntr_mask64;
	int i;

	if (!is_hybrid())
		return cntr_mask;

	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
		cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;

	return cntr_mask;
}

static bool reserve_pmc_hardware(void)
{
	u64 cntr_mask = get_possible_counter_mask();
	int i, end;

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	end = i;
	for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
		release_evntsel_nmi(x86_pmu_config_addr(i));
	i = X86_PMC_IDX_MAX;

perfctr_fail:
	end = i;
	for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	u64 cntr_mask = get_possible_counter_mask();
	int i;

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
		     unsigned long *fixed_cntr_mask)
{
	u64 val, val_fail = -1, val_new = ~0;
	int i, reg, reg_fail = -1, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
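	 * (Firmware sometimes programs a counter for its own use, e.g. as
	 * an NMI watchdog, and leaves it enabled behind the kernel's back.)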
	 */
	for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (*(u64 *)fixed_cntr_mask) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
			if (fixed_counter_disabled(i, pmu))
				continue;
			if (val & (0x03ULL << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail. The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
		       reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;
	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);

	cache_op = (config >> 8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;
	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);

	val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
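	/*
	 * Some cache events also need an extra MSR (e.g. an offcore
	 * response register); stash its per-PMU encoding in config1 so
	 * that x86_pmu_extra_regs() below can validate and program it.
	 */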
	attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware()) {
				err = -EBUSY;
			} else {
				reserve_ds_buffers();
				reserve_lbr_buffers();
			}
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		release_lbr_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create event of a certain type (that no conflicting events
 * are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
	 * LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		goto out;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

out:
	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&active_events);

	/*
	 * See the comment in x86_add_exclusive().
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == event->pmu->type)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

int x86_pmu_max_precise(void)
{
	int precise = 0;

	/* Support for constant skid */
	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
		precise++;

		/* Support for IP fixup */
		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
			precise++;

		if (x86_pmu.pebs_prec_dist)
			precise++;
	}
	return precise;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = x86_pmu_max_precise();

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;

		/* There's no sense in having PEBS for non sampling events: */
		if (!is_sampling_event(event))
			return -EINVAL;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
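			 *
			 * The captured LBR records are later used to roll
			 * the PEBS sample back to the precise instruction
			 * pointer.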
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (branch_sample_call_stack(event))
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == event->pmu->type)
		event->hw.config |= x86_pmu_get_event_config(event);

	if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
		s64 left = event->attr.sample_period;
		x86_pmu.limit_period(event, &left);
		if (left > event->attr.sample_period)
			return -EINVAL;
	}

	/* sample_regs_user never supports XMM registers */
	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
		return -EINVAL;
	/*
	 * Besides the general purpose registers, XMM registers may
	 * be collected in PEBS on some platforms, e.g. Icelake
	 */
	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
			return -EINVAL;

		if (!event->attr.precise_ip)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
		if (is_counter_pair(hwc))
			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
	}
}

struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
{
	return static_call(x86_pmu_guest_get_msrs)(nr, data);
}
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	static_call(x86_pmu_disable_all)();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

int is_x86_event(struct perf_event *event)
{
	int i;

	if (!is_hybrid())
		return event->pmu == &pmu;

	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
		if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
			return true;
	}

	return false;
}

struct pmu *x86_get_pmu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

	/*
	 * All CPUs of the hybrid type have been offline.
	 * The x86_get_pmu() should not be invoked.
	 */
	if (WARN_ON_ONCE(!cpuc->pmu))
		return &pmu;

	return cpuc->pmu;
}
/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	u64	used;
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events = num;
	sched->max_weight = wmax;
	sched->max_gp = gpmax;
	sched->constraints = constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event = idx;	/* start with min weight */
	sched->state.weight = wmin;
	sched->state.unassigned = num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* this assignment didn't work out */
	/* XXX broken vs EVENT_PAIR */
	sched->state.used &= ~BIT_ULL(sched->state.counter);

	/* try the next one */
	sched->state.counter++;

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			u64 mask = BIT_ULL(idx);

			if (sched->state.used & mask)
				continue;

			sched->state.used |= mask;
			goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		u64 mask = BIT_ULL(idx);

		if (c->flags & PERF_X86_EVENT_PAIR)
			mask |= mask << 1;

		if (sched->state.used & mask)
			continue;

		if (sched->state.nr_gp++ >= sched->max_gp)
			return false;

		sched->state.used |= mask;
		goto done;
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
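 *
 * Scheduling the most constrained events (lowest weight, i.e. fewest
 * usable counters) first keeps more flexible events from occupying the
 * few counters the constrained ones can actually use.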
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
		       int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	struct perf_event *e;
	int n0, i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;
	u64 used_mask = 0;

	/*
	 * Compute the number of events already present; see x86_pmu_add(),
	 * validate_group() and x86_pmu_commit_txn(). For the former two
	 * cpuc->n_events hasn't been updated yet, while for the latter
	 * cpuc->n_txn contains the number of events added in the current
	 * transaction.
	 */
	n0 = cpuc->n_events;
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		n0 -= cpuc->n_txn;

	static_call_cond(x86_pmu_start_scheduling)(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = cpuc->event_constraint[i];

		/*
		 * Previously scheduled events should have a cached constraint,
		 * while new events should not have one.
		 */
		WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));

		/*
		 * Request constraints for new events; or for those events that
		 * have a dynamic constraint -- for those the constraint can
		 * change due to external factors (sibling state, allow_tfa).
		 */
		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
			cpuc->event_constraint[i] = c;
		}

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		u64 mask;

		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		mask = BIT_ULL(hwc->idx);
		if (is_counter_pair(hwc))
			mask |= mask << 1;

		/* not already used */
		if (used_mask & mask)
			break;

		used_mask |= mask;

		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu_max_num_counters(cpuc->pmu);

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		/*
		 * Reduce the amount of available counters to allow fitting
		 * the extra Merge events needed by large increment events.
		 */
		if (x86_pmu.flags & PMU_FL_PAIR) {
			gpmax -= cpuc->n_pair;
			WARN_ON(gpmax <= 0);
		}

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++)
			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
	} else {
		for (i = n0; i < n; i++) {
			e = cpuc->event_list[i];

			/*
			 * release events that failed scheduling
			 */
			static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);

			cpuc->event_constraint[i] = NULL;
		}
	}

	static_call_cond(x86_pmu_stop_scheduling)(cpuc);

	return unsched ? -EINVAL : 0;
}

static int add_nr_metric_event(struct cpu_hw_events *cpuc,
			       struct perf_event *event)
{
	if (is_metric_event(event)) {
		if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
			return -EINVAL;
		cpuc->n_metric++;
		cpuc->n_txn_metric++;
	}

	return 0;
}

static void del_nr_metric_event(struct cpu_hw_events *cpuc,
				struct perf_event *event)
{
	if (is_metric_event(event))
		cpuc->n_metric--;
}

static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
			 int max_count, int n)
{
	union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);

	if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
		return -EINVAL;

	if (n >= max_count + cpuc->n_metric)
		return -EINVAL;

	cpuc->event_list[n] = event;
	if (is_counter_pair(&event->hw)) {
		cpuc->n_pair++;
		cpuc->n_txn_pair++;
	}

	return 0;
}

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);

	/* current number of events already accepted */
	n = cpuc->n_events;
	if (!cpuc->n_events)
		cpuc->pebs_output = 0;

	if (!cpuc->is_fake && leader->attr.precise_ip) {
		/*
		 * For PEBS->PT, if !aux_event, the group leader (PT) went
		 * away, the group was broken down and this singleton event
		 * can't schedule any more.
		 */
		if (is_pebs_pt(leader) && !leader->aux_event)
			return -EINVAL;

		/*
		 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
		 */
		if (cpuc->pebs_output &&
		    cpuc->pebs_output != is_pebs_pt(leader) + 1)
			return -EINVAL;

		cpuc->pebs_output = is_pebs_pt(leader) + 1;
	}

	if (is_x86_event(leader)) {
		if (collect_event(cpuc, leader, max_count, n))
			return -EINVAL;
		n++;
	}

	if (!dogrp)
		return n;

	for_each_sibling_event(event, leader) {
		if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (collect_event(cpuc, event, max_count, n))
			return -EINVAL;

		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				       struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	idx = hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	static_call_cond(x86_pmu_assign)(event, idx);

	switch (hwc->idx) {
	case INTEL_PMC_IDX_FIXED_BTS:
	case INTEL_PMC_IDX_FIXED_VLBR:
		hwc->config_base = 0;
		hwc->event_base = 0;
		break;

	case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
		/* All the metric events are mapped onto the fixed counter 3. */
		idx = INTEL_PMC_IDX_FIXED_SLOTS;
		fallthrough;
	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
		hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
					INTEL_PMC_FIXED_RDPMC_BASE;
		break;

	default:
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base = x86_pmu_event_addr(hwc->idx);
		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
		break;
	}
}

/**
 * x86_perf_rdpmc_index - Return PMC counter used for event
 * @event: the perf_event to which the PMC counter was assigned
 *
 * The counter assigned to this performance event may change if interrupts
 * are enabled. This counter should thus never be used while interrupts are
 * enabled. Before this function is used to obtain the assigned counter the
 * event should be checked for validity using, for example,
 * perf_event_read_local(), within the same interrupt disabled section in
 * which this counter is planned to be used.
 *
 * Return: The index of the performance monitoring counter assigned to
 * @perf_event.
 */
int x86_perf_rdpmc_index(struct perf_event *event)
{
	lockdep_assert_irqs_disabled();

	return event->hw.event_base_rdpmc;
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;

		/*
		 * The late setup (after counters are scheduled)
		 * is required for some cases, e.g., PEBS counter
		 * snapshotting, because an accurate counter index
		 * is needed.
		 */
		static_call_cond(x86_pmu_late_setup)();

		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		/*
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			/*
			 * if cpuc->enabled = 0, then no wrmsr as
			 * per x86_pmu_enable_event()
			 */
			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	static_call(x86_pmu_enable_all)(added);
}

DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
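 * The counter is programmed with -left so that it overflows and raises a
 * PMI after 'left' more increments.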
 * To be called with the event disabled in hw:
 */
int x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (unlikely(!hwc->event_base))
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	static_call_cond(x86_pmu_limit_period)(event, &left);

	this_cpu_write(pmc_prev_left[idx], left);

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Sign extend the Merge event counter's upper 16 bits since
	 * we currently declare a 48-bit counter width
	 */
	if (is_counter_pair(hwc))
		wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);

	perf_event_update_userpage(event);

	return ret;
}

void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole.
	 *
	 * If commit fails, we'll call ->del() on all events
	 * for which ->add() was called.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto done_collect;

	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	/*
	 * Commit the collect_events() state. See x86_pmu_del() and
	 * x86_pmu_*_txn().
	 */
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	/*
	 * This is before x86_pmu_enable() will call x86_pmu_start(),
	 * so we enable LBRs before an event needs them etc..
	 */
	static_call_cond(x86_pmu_add)(event);

	ret = 0;
out:
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		static_call(x86_pmu_set_period)(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	static_call(x86_pmu_enable)(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	unsigned long *cntr_mask, *fixed_cntr_mask;
	struct event_constraint *pebs_constraints;
	struct cpu_hw_events *cpuc;
	u64 pebs, debugctl;
	int cpu, idx;

	guard(irqsave)();

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);
	cntr_mask = hybrid(cpuc->pmu, cntr_mask);
	fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
	pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);

	if (!*(u64 *)cntr_mask)
		return;

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		if (pebs_constraints) {
			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
		}
		if (x86_pmu.lbr_nr) {
			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
		}
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
		if (fixed_counter_disabled(idx, cpuc->pmu))
			continue;
		rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
}

void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (test_bit(hwc->idx, cpuc->active_mask)) {
		static_call(x86_pmu_disable)(event);
		__clear_bit(hwc->idx, cpuc->active_mask);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		static_call(x86_pmu_update)(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
	int i;

	/*
	 * If we're called during a txn, we only need to undo x86_pmu.add.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 *
	 * XXX assumes any ->del() called during a TXN will only be on
	 * an event added during that same TXN.
	 */
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto do_del;

	__set_bit(event->hw.idx, cpuc->dirty);

	/*
	 * Not a TXN, therefore cleanup properly.
	 */
	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i])
			break;
	}

	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
		return;

	/* If we have a newly added event; make sure to decrease n_added. */
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events) {
		cpuc->event_list[i-1] = cpuc->event_list[i];
		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
		cpuc->assign[i-1] = cpuc->assign[i];
	}
	cpuc->event_constraint[i-1] = NULL;
	--cpuc->n_events;
	if (intel_cap.perf_metrics)
		del_nr_metric_event(cpuc, event);

	perf_event_update_userpage(event);

do_del:

	/*
	 * This is after x86_pmu_stop(); so we disable LBRs after any
	 * event can need them etc..
	 */
	static_call_cond(x86_pmu_del)(event);
}

int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	cpuc = this_cpu_ptr(&cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		event = cpuc->events[idx];

		val = static_call(x86_pmu_update)(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;

		if (!static_call(x86_pmu_set_period)(event))
			continue;

		perf_sample_data_init(&data, 0, event->hw.last_period);

		perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);

		if (perf_event_overflow(event, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
	u64 start_clock;
	u64 finish_clock;
	int ret;

	/*
	 * All PMUs/events that share this PMI handler should make sure to
	 * increment active_events for their events.
	 */
	if (!atomic_read(&active_events))
		return NMI_DONE;

	start_clock = sched_clock();
	ret = static_call(x86_pmu_handle_irq)(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);

	return ret;
}
NOKPROBE_SYMBOL(perf_event_nmi_handler);

struct event_constraint emptyconstraint;
struct event_constraint unconstrained;

static int x86_pmu_prepare_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
		cpuc->kfree_on_online[i] = NULL;
	if (x86_pmu.cpu_prepare)
		return x86_pmu.cpu_prepare(cpu);
	return 0;
}

static int x86_pmu_dead_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dead)
		x86_pmu.cpu_dead(cpu);
	return 0;
}

static int x86_pmu_online_cpu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
	int i;

	for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
		kfree(cpuc->kfree_on_online[i]);
		cpuc->kfree_on_online[i] = NULL;
	}
	return 0;
}

static int x86_pmu_starting_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_starting)
		x86_pmu.cpu_starting(cpu);
	return 0;
}

static int x86_pmu_dying_cpu(unsigned int cpu)
{
	if (x86_pmu.cpu_dying)
		x86_pmu.cpu_dying(cpu);
	return 0;
}

static void __init pmu_check_apic(void)
{
	if (boot_cpu_has(X86_FEATURE_APIC))
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");

	/*
	 * If we have a PMU initialized but no APIC
	 * interrupts, we cannot sample hardware
	 * events (user-space has to fall back and
	 * sample via a hrtimer based software event):
	 */
	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;

}

static struct attribute_group x86_pmu_format_group __ro_after_init = {
	.name = "format",
	.attrs = NULL,
};

ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
			  char *page)
{
	struct perf_pmu_events_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_attr, attr);
	u64 config = 0;

	if (pmu_attr->id < x86_pmu.max_events)
		config = x86_pmu.event_map(pmu_attr->id);

	/* string trumps id */
	if (pmu_attr->event_str)
		return sprintf(page, "%s\n", pmu_attr->event_str);

	return x86_pmu.events_sysfs_show(page, config);
}
EXPORT_SYMBOL_GPL(events_sysfs_show);

ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
			     char *page)
{
	struct perf_pmu_events_ht_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_ht_attr, attr);

	/*
	 * Report conditional events depending on Hyper-Threading.
	 *
	 * This is overly conservative as usually the HT special
	 * handling is not needed if the other CPU thread is idle.
	 *
	 * Note this does not (and cannot) handle the case when thread
	 * siblings are invisible, for example with virtualization
	 * if they are owned by some other guest.  The user tool
	 * has to re-read when a thread sibling gets onlined later.
	 */
	return sprintf(page, "%s",
			topology_max_smt_threads() > 1 ?
			pmu_attr->event_str_ht :
			pmu_attr->event_str_noht);
}

ssize_t events_hybrid_sysfs_show(struct device *dev,
				 struct device_attribute *attr,
				 char *page)
{
	struct perf_pmu_events_hybrid_attr *pmu_attr =
		container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
	struct x86_hybrid_pmu *pmu;
	const char *str, *next_str;
	int i;

	if (hweight64(pmu_attr->pmu_type) == 1)
		return sprintf(page, "%s", pmu_attr->event_str);

	/*
	 * Hybrid PMUs may support the same event name, but with different
	 * event encoding, e.g., the mem-loads event on an Atom PMU has
	 * different event encoding from a Core PMU.
	 *
	 * The event_str includes all event encodings. Each event encoding
	 * is divided by ";". The order of the event encodings must follow
	 * the order of the hybrid PMU index.
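	 * (For example "cfg-big;cfg-atom", where the first encoding belongs
	 * to the first hybrid PMU and the second to the next one; the names
	 * here are purely illustrative.)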
	 */
	pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);

	str = pmu_attr->event_str;
	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
		if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
			continue;
		if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
			next_str = strchr(str, ';');
			if (next_str)
				return snprintf(page, next_str - str + 1, "%s", str);
			else
				return sprintf(page, "%s", str);
		}
		str = strchr(str, ';');
		str++;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);

EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
EVENT_ATTR(instructions,		INSTRUCTIONS		);
EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
EVENT_ATTR(cache-misses,		CACHE_MISSES		);
EVENT_ATTR(branch-instructions,		BRANCH_INSTRUCTIONS	);
EVENT_ATTR(branch-misses,		BRANCH_MISSES		);
EVENT_ATTR(bus-cycles,			BUS_CYCLES		);
EVENT_ATTR(stalled-cycles-frontend,	STALLED_CYCLES_FRONTEND	);
EVENT_ATTR(stalled-cycles-backend,	STALLED_CYCLES_BACKEND	);
EVENT_ATTR(ref-cycles,			REF_CPU_CYCLES		);

static struct attribute *empty_attrs;

static struct attribute *events_attr[] = {
	EVENT_PTR(CPU_CYCLES),
	EVENT_PTR(INSTRUCTIONS),
	EVENT_PTR(CACHE_REFERENCES),
	EVENT_PTR(CACHE_MISSES),
	EVENT_PTR(BRANCH_INSTRUCTIONS),
	EVENT_PTR(BRANCH_MISSES),
	EVENT_PTR(BUS_CYCLES),
	EVENT_PTR(STALLED_CYCLES_FRONTEND),
	EVENT_PTR(STALLED_CYCLES_BACKEND),
	EVENT_PTR(REF_CPU_CYCLES),
	NULL,
};

/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * out of events_attr attributes.
 */
static umode_t
is_visible(struct kobject *kobj, struct attribute *attr, int idx)
{
	struct perf_pmu_events_attr *pmu_attr;

	if (idx >= x86_pmu.max_events)
		return 0;

	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
	/* str trumps id */
	return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
}

static struct attribute_group x86_pmu_events_group __ro_after_init = {
	.name		= "events",
	.attrs		= events_attr,
	.is_visible	= is_visible,
};

ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
{
	u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
	bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
	bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
	bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
	bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
	ssize_t ret;

	/*
	 * We have whole page size to spend and just little data
	 * to write, so we can safely use sprintf.
	 */
	ret = sprintf(page, "event=0x%02llx", event);

	if (umask)
		ret += sprintf(page + ret, ",umask=0x%02llx", umask);

	if (edge)
		ret += sprintf(page + ret, ",edge");

	if (pc)
		ret += sprintf(page + ret, ",pc");

	if (any)
		ret += sprintf(page + ret, ",any");

	if (inv)
		ret += sprintf(page + ret, ",inv");

	if (cmask)
		ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);

	ret += sprintf(page + ret, "\n");

	return ret;
}

static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group;

static void x86_pmu_static_call_update(void)
{
	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
	static_call_update(x86_pmu_enable, x86_pmu.enable);
	static_call_update(x86_pmu_disable, x86_pmu.disable);

	static_call_update(x86_pmu_assign, x86_pmu.assign);

	static_call_update(x86_pmu_add, x86_pmu.add);
	static_call_update(x86_pmu_del, x86_pmu.del);
	static_call_update(x86_pmu_read, x86_pmu.read);

	static_call_update(x86_pmu_set_period, x86_pmu.set_period);
	static_call_update(x86_pmu_update, x86_pmu.update);
	static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);

	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);

	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);

	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);

	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);

	static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
	static_call_update(x86_pmu_filter, x86_pmu.filter);

	static_call_update(x86_pmu_late_setup, x86_pmu.late_setup);
}

static void _x86_pmu_read(struct perf_event *event)
{
	static_call(x86_pmu_update)(event);
}

void x86_pmu_show_pmu_cap(struct pmu *pmu)
{
	pr_info("... version:                %d\n", x86_pmu.version);
	pr_info("... bit width:              %d\n", x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n", x86_pmu_num_counters(pmu));
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n", x86_pmu_num_counters_fixed(pmu));
	pr_info("... event mask:             %016Lx\n", hybrid(pmu, intel_ctrl));
event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); 2065 } 2066 2067 static int __init init_hw_perf_events(void) 2068 { 2069 struct x86_pmu_quirk *quirk; 2070 int err; 2071 2072 pr_info("Performance Events: "); 2073 2074 switch (boot_cpu_data.x86_vendor) { 2075 case X86_VENDOR_INTEL: 2076 err = intel_pmu_init(); 2077 break; 2078 case X86_VENDOR_AMD: 2079 err = amd_pmu_init(); 2080 break; 2081 case X86_VENDOR_HYGON: 2082 err = amd_pmu_init(); 2083 x86_pmu.name = "HYGON"; 2084 break; 2085 case X86_VENDOR_ZHAOXIN: 2086 case X86_VENDOR_CENTAUR: 2087 err = zhaoxin_pmu_init(); 2088 break; 2089 default: 2090 err = -ENOTSUPP; 2091 } 2092 if (err != 0) { 2093 pr_cont("no PMU driver, software events only.\n"); 2094 err = 0; 2095 goto out_bad_pmu; 2096 } 2097 2098 pmu_check_apic(); 2099 2100 /* sanity check that the hardware exists or is emulated */ 2101 if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) 2102 goto out_bad_pmu; 2103 2104 pr_cont("%s PMU driver.\n", x86_pmu.name); 2105 2106 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 2107 2108 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 2109 quirk->func(); 2110 2111 if (!x86_pmu.intel_ctrl) 2112 x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; 2113 2114 if (!x86_pmu.config_mask) 2115 x86_pmu.config_mask = X86_RAW_EVENT_MASK; 2116 2117 perf_events_lapic_init(); 2118 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 2119 2120 unconstrained = (struct event_constraint) 2121 __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, 2122 0, x86_pmu_num_counters(NULL), 0, 0); 2123 2124 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 2125 2126 if (!x86_pmu.events_sysfs_show) 2127 x86_pmu_events_group.attrs = &empty_attrs; 2128 2129 pmu.attr_update = x86_pmu.attr_update; 2130 2131 if (!is_hybrid()) 2132 x86_pmu_show_pmu_cap(NULL); 2133 2134 if (!x86_pmu.read) 2135 x86_pmu.read = _x86_pmu_read; 2136 2137 if (!x86_pmu.guest_get_msrs) 2138 x86_pmu.guest_get_msrs = (void *)&__static_call_return0; 2139 2140 if (!x86_pmu.set_period) 2141 x86_pmu.set_period = x86_perf_event_set_period; 2142 2143 if (!x86_pmu.update) 2144 x86_pmu.update = x86_perf_event_update; 2145 2146 x86_pmu_static_call_update(); 2147 2148 /* 2149 * Install callbacks. Core will call them for each online 2150 * cpu. 2151 */ 2152 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", 2153 x86_pmu_prepare_cpu, x86_pmu_dead_cpu); 2154 if (err) 2155 return err; 2156 2157 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, 2158 "perf/x86:starting", x86_pmu_starting_cpu, 2159 x86_pmu_dying_cpu); 2160 if (err) 2161 goto out; 2162 2163 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", 2164 x86_pmu_online_cpu, NULL); 2165 if (err) 2166 goto out1; 2167 2168 if (!is_hybrid()) { 2169 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); 2170 if (err) 2171 goto out2; 2172 } else { 2173 struct x86_hybrid_pmu *hybrid_pmu; 2174 int i, j; 2175 2176 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { 2177 hybrid_pmu = &x86_pmu.hybrid_pmu[i]; 2178 2179 hybrid_pmu->pmu = pmu; 2180 hybrid_pmu->pmu.type = -1; 2181 hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; 2182 hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; 2183 2184 err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, 2185 (hybrid_pmu->pmu_type == hybrid_big) ? 
PERF_TYPE_RAW : -1); 2186 if (err) 2187 break; 2188 } 2189 2190 if (i < x86_pmu.num_hybrid_pmus) { 2191 for (j = 0; j < i; j++) 2192 perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); 2193 pr_warn("Failed to register hybrid PMUs\n"); 2194 kfree(x86_pmu.hybrid_pmu); 2195 x86_pmu.hybrid_pmu = NULL; 2196 x86_pmu.num_hybrid_pmus = 0; 2197 goto out2; 2198 } 2199 } 2200 2201 return 0; 2202 2203 out2: 2204 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); 2205 out1: 2206 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); 2207 out: 2208 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); 2209 out_bad_pmu: 2210 memset(&x86_pmu, 0, sizeof(x86_pmu)); 2211 return err; 2212 } 2213 early_initcall(init_hw_perf_events); 2214 2215 static void x86_pmu_read(struct perf_event *event) 2216 { 2217 static_call(x86_pmu_read)(event); 2218 } 2219 2220 /* 2221 * Start group events scheduling transaction 2222 * Set the flag to make pmu::enable() not perform the 2223 * schedulability test, it will be performed at commit time 2224 * 2225 * We only support PERF_PMU_TXN_ADD transactions. Save the 2226 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD 2227 * transactions. 2228 */ 2229 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) 2230 { 2231 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2232 2233 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ 2234 2235 cpuc->txn_flags = txn_flags; 2236 if (txn_flags & ~PERF_PMU_TXN_ADD) 2237 return; 2238 2239 perf_pmu_disable(pmu); 2240 __this_cpu_write(cpu_hw_events.n_txn, 0); 2241 __this_cpu_write(cpu_hw_events.n_txn_pair, 0); 2242 __this_cpu_write(cpu_hw_events.n_txn_metric, 0); 2243 } 2244 2245 /* 2246 * Stop group events scheduling transaction 2247 * Clear the flag and pmu::enable() will perform the 2248 * schedulability test. 2249 */ 2250 static void x86_pmu_cancel_txn(struct pmu *pmu) 2251 { 2252 unsigned int txn_flags; 2253 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2254 2255 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 2256 2257 txn_flags = cpuc->txn_flags; 2258 cpuc->txn_flags = 0; 2259 if (txn_flags & ~PERF_PMU_TXN_ADD) 2260 return; 2261 2262 /* 2263 * Truncate collected array by the number of events added in this 2264 * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 2265 */ 2266 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); 2267 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); 2268 __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); 2269 __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); 2270 perf_pmu_enable(pmu); 2271 } 2272 2273 /* 2274 * Commit group events scheduling transaction 2275 * Perform the group schedulability test as a whole 2276 * Return 0 if success 2277 * 2278 * Does not cancel the transaction on failure; expects the caller to do this. 
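 *
 * A rough sketch of the intended calling pattern, as seen from the
 * generic group-scheduling code (simplified, error handling omitted):
 *
 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *	... pmu->add() each group member ...
 *	if (pmu->commit_txn(pmu))
 *		pmu->cancel_txn(pmu);	// caller unwinds on failure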
2279 */ 2280 static int x86_pmu_commit_txn(struct pmu *pmu) 2281 { 2282 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2283 int assign[X86_PMC_IDX_MAX]; 2284 int n, ret; 2285 2286 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 2287 2288 if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { 2289 cpuc->txn_flags = 0; 2290 return 0; 2291 } 2292 2293 n = cpuc->n_events; 2294 2295 if (!x86_pmu_initialized()) 2296 return -EAGAIN; 2297 2298 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); 2299 if (ret) 2300 return ret; 2301 2302 /* 2303 * copy new assignment, now we know it is possible 2304 * will be used by hw_perf_enable() 2305 */ 2306 memcpy(cpuc->assign, assign, n*sizeof(int)); 2307 2308 cpuc->txn_flags = 0; 2309 perf_pmu_enable(pmu); 2310 return 0; 2311 } 2312 /* 2313 * a fake_cpuc is used to validate event groups. Due to 2314 * the extra reg logic, we need to also allocate a fake 2315 * per_core and per_cpu structure. Otherwise, group events 2316 * using extra reg may conflict without the kernel being 2317 * able to catch this when the last event gets added to 2318 * the group. 2319 */ 2320 static void free_fake_cpuc(struct cpu_hw_events *cpuc) 2321 { 2322 intel_cpuc_finish(cpuc); 2323 kfree(cpuc); 2324 } 2325 2326 static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) 2327 { 2328 struct cpu_hw_events *cpuc; 2329 int cpu; 2330 2331 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); 2332 if (!cpuc) 2333 return ERR_PTR(-ENOMEM); 2334 cpuc->is_fake = 1; 2335 2336 if (is_hybrid()) { 2337 struct x86_hybrid_pmu *h_pmu; 2338 2339 h_pmu = hybrid_pmu(event_pmu); 2340 if (cpumask_empty(&h_pmu->supported_cpus)) 2341 goto error; 2342 cpu = cpumask_first(&h_pmu->supported_cpus); 2343 } else 2344 cpu = raw_smp_processor_id(); 2345 cpuc->pmu = event_pmu; 2346 2347 if (intel_cpuc_prepare(cpuc, cpu)) 2348 goto error; 2349 2350 return cpuc; 2351 error: 2352 free_fake_cpuc(cpuc); 2353 return ERR_PTR(-ENOMEM); 2354 } 2355 2356 /* 2357 * validate that we can schedule this event 2358 */ 2359 static int validate_event(struct perf_event *event) 2360 { 2361 struct cpu_hw_events *fake_cpuc; 2362 struct event_constraint *c; 2363 int ret = 0; 2364 2365 fake_cpuc = allocate_fake_cpuc(event->pmu); 2366 if (IS_ERR(fake_cpuc)) 2367 return PTR_ERR(fake_cpuc); 2368 2369 c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); 2370 2371 if (!c || !c->weight) 2372 ret = -EINVAL; 2373 2374 if (x86_pmu.put_event_constraints) 2375 x86_pmu.put_event_constraints(fake_cpuc, event); 2376 2377 free_fake_cpuc(fake_cpuc); 2378 2379 return ret; 2380 } 2381 2382 /* 2383 * validate a single event group 2384 * 2385 * validation includes: 2386 * - check that events are compatible with each other 2387 * - events do not compete for the same counter 2388 * - number of events <= number of counters 2389 * 2390 * validation ensures the group can be loaded onto the 2391 * PMU if it was the only group available. 2392 */ 2393 static int validate_group(struct perf_event *event) 2394 { 2395 struct perf_event *leader = event->group_leader; 2396 struct cpu_hw_events *fake_cpuc; 2397 int ret = -EINVAL, n; 2398 2399 /* 2400 * Reject events from different hybrid PMUs.
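 *
 * E.g. on a hybrid part, grouping a cpu_core event with a cpu_atom
 * event can never be scheduled on a single PMU, so such groups are
 * rejected here with -EINVAL before any scheduling is simulated.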
2401 */ 2402 if (is_hybrid()) { 2403 struct perf_event *sibling; 2404 struct pmu *pmu = NULL; 2405 2406 if (is_x86_event(leader)) 2407 pmu = leader->pmu; 2408 2409 for_each_sibling_event(sibling, leader) { 2410 if (!is_x86_event(sibling)) 2411 continue; 2412 if (!pmu) 2413 pmu = sibling->pmu; 2414 else if (pmu != sibling->pmu) 2415 return ret; 2416 } 2417 } 2418 2419 fake_cpuc = allocate_fake_cpuc(event->pmu); 2420 if (IS_ERR(fake_cpuc)) 2421 return PTR_ERR(fake_cpuc); 2422 /* 2423 * the event is not yet connected with its 2424 * siblings therefore we must first collect 2425 * existing siblings, then add the new event 2426 * before we can simulate the scheduling 2427 */ 2428 n = collect_events(fake_cpuc, leader, true); 2429 if (n < 0) 2430 goto out; 2431 2432 fake_cpuc->n_events = n; 2433 n = collect_events(fake_cpuc, event, false); 2434 if (n < 0) 2435 goto out; 2436 2437 fake_cpuc->n_events = 0; 2438 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); 2439 2440 out: 2441 free_fake_cpuc(fake_cpuc); 2442 return ret; 2443 } 2444 2445 static int x86_pmu_event_init(struct perf_event *event) 2446 { 2447 struct x86_hybrid_pmu *pmu = NULL; 2448 int err; 2449 2450 if ((event->attr.type != event->pmu->type) && 2451 (event->attr.type != PERF_TYPE_HARDWARE) && 2452 (event->attr.type != PERF_TYPE_HW_CACHE)) 2453 return -ENOENT; 2454 2455 if (is_hybrid() && (event->cpu != -1)) { 2456 pmu = hybrid_pmu(event->pmu); 2457 if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) 2458 return -ENOENT; 2459 } 2460 2461 err = __x86_pmu_event_init(event); 2462 if (!err) { 2463 if (event->group_leader != event) 2464 err = validate_group(event); 2465 else 2466 err = validate_event(event); 2467 } 2468 if (err) { 2469 if (event->destroy) 2470 event->destroy(event); 2471 event->destroy = NULL; 2472 } 2473 2474 if (READ_ONCE(x86_pmu.attr_rdpmc) && 2475 !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) 2476 event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; 2477 2478 return err; 2479 } 2480 2481 void perf_clear_dirty_counters(void) 2482 { 2483 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2484 int i; 2485 2486 /* Don't need to clear the assigned counter. */ 2487 for (i = 0; i < cpuc->n_events; i++) 2488 __clear_bit(cpuc->assign[i], cpuc->dirty); 2489 2490 if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) 2491 return; 2492 2493 for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { 2494 if (i >= INTEL_PMC_IDX_FIXED) { 2495 /* Metrics and fake events don't have corresponding HW counters. */ 2496 if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) 2497 continue; 2498 2499 wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); 2500 } else { 2501 wrmsrl(x86_pmu_event_addr(i), 0); 2502 } 2503 } 2504 2505 bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); 2506 } 2507 2508 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) 2509 { 2510 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) 2511 return; 2512 2513 /* 2514 * This function relies on not being called concurrently in two 2515 * tasks in the same mm. Otherwise one task could observe 2516 * perf_rdpmc_allowed > 1 and return all the way back to 2517 * userspace with CR4.PCE clear while another task is still 2518 * doing on_each_cpu_mask() to propagate CR4.PCE. 2519 * 2520 * For now, this can't happen because all callers hold mmap_lock 2521 * for write. If this changes, we'll need a different solution. 
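 *
 * Roughly, the intended pairing looks like (sketch only):
 *
 *	mmap(event fd) -> x86_pmu_event_mapped():   0 -> 1, set CR4.PCE on
 *	                                            every CPU in mm_cpumask()
 *	munmap()       -> x86_pmu_event_unmapped(): 1 -> 0, clear CR4.PCE
 *
 * so with attr_rdpmc == 1 only mms that actually map a counter are
 * allowed to execute RDPMC.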
2522 */ 2523 mmap_assert_write_locked(mm); 2524 2525 if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) 2526 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); 2527 } 2528 2529 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) 2530 { 2531 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) 2532 return; 2533 2534 if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) 2535 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); 2536 } 2537 2538 static int x86_pmu_event_idx(struct perf_event *event) 2539 { 2540 struct hw_perf_event *hwc = &event->hw; 2541 2542 if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) 2543 return 0; 2544 2545 if (is_metric_idx(hwc->idx)) 2546 return INTEL_PMC_FIXED_RDPMC_METRICS + 1; 2547 else 2548 return hwc->event_base_rdpmc + 1; 2549 } 2550 2551 static ssize_t get_attr_rdpmc(struct device *cdev, 2552 struct device_attribute *attr, 2553 char *buf) 2554 { 2555 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); 2556 } 2557 2558 static ssize_t set_attr_rdpmc(struct device *cdev, 2559 struct device_attribute *attr, 2560 const char *buf, size_t count) 2561 { 2562 static DEFINE_MUTEX(rdpmc_mutex); 2563 unsigned long val; 2564 ssize_t ret; 2565 2566 ret = kstrtoul(buf, 0, &val); 2567 if (ret) 2568 return ret; 2569 2570 if (val > 2) 2571 return -EINVAL; 2572 2573 if (x86_pmu.attr_rdpmc_broken) 2574 return -ENOTSUPP; 2575 2576 guard(mutex)(&rdpmc_mutex); 2577 2578 if (val != x86_pmu.attr_rdpmc) { 2579 /* 2580 * Changing into or out of never available or always available, 2581 * aka perf-event-bypassing mode. This path is extremely slow, 2582 * but only root can trigger it, so it's okay. 2583 */ 2584 if (val == 0) 2585 static_branch_inc(&rdpmc_never_available_key); 2586 else if (x86_pmu.attr_rdpmc == 0) 2587 static_branch_dec(&rdpmc_never_available_key); 2588 2589 if (val == 2) 2590 static_branch_inc(&rdpmc_always_available_key); 2591 else if (x86_pmu.attr_rdpmc == 2) 2592 static_branch_dec(&rdpmc_always_available_key); 2593 2594 on_each_cpu(cr4_update_pce, NULL, 1); 2595 x86_pmu.attr_rdpmc = val; 2596 } 2597 2598 return count; 2599 } 2600 2601 static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); 2602 2603 static struct attribute *x86_pmu_attrs[] = { 2604 &dev_attr_rdpmc.attr, 2605 NULL, 2606 }; 2607 2608 static struct attribute_group x86_pmu_attr_group __ro_after_init = { 2609 .attrs = x86_pmu_attrs, 2610 }; 2611 2612 static ssize_t max_precise_show(struct device *cdev, 2613 struct device_attribute *attr, 2614 char *buf) 2615 { 2616 return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); 2617 } 2618 2619 static DEVICE_ATTR_RO(max_precise); 2620 2621 static struct attribute *x86_pmu_caps_attrs[] = { 2622 &dev_attr_max_precise.attr, 2623 NULL 2624 }; 2625 2626 static struct attribute_group x86_pmu_caps_group __ro_after_init = { 2627 .name = "caps", 2628 .attrs = x86_pmu_caps_attrs, 2629 }; 2630 2631 static const struct attribute_group *x86_pmu_attr_groups[] = { 2632 &x86_pmu_attr_group, 2633 &x86_pmu_format_group, 2634 &x86_pmu_events_group, 2635 &x86_pmu_caps_group, 2636 NULL, 2637 }; 2638 2639 static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, 2640 struct task_struct *task, bool sched_in) 2641 { 2642 static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in); 2643 } 2644 2645 void perf_check_microcode(void) 2646 { 2647 if (x86_pmu.check_microcode) 2648 x86_pmu.check_microcode(); 2649 } 2650 2651 static int x86_pmu_check_period(struct perf_event *event, u64 
value) 2652 { 2653 if (x86_pmu.check_period && x86_pmu.check_period(event, value)) 2654 return -EINVAL; 2655 2656 if (value && x86_pmu.limit_period) { 2657 s64 left = value; 2658 x86_pmu.limit_period(event, &left); 2659 if (left > value) 2660 return -EINVAL; 2661 } 2662 2663 return 0; 2664 } 2665 2666 static int x86_pmu_aux_output_match(struct perf_event *event) 2667 { 2668 if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) 2669 return 0; 2670 2671 if (x86_pmu.aux_output_match) 2672 return x86_pmu.aux_output_match(event); 2673 2674 return 0; 2675 } 2676 2677 static bool x86_pmu_filter(struct pmu *pmu, int cpu) 2678 { 2679 bool ret = false; 2680 2681 static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); 2682 2683 return ret; 2684 } 2685 2686 static struct pmu pmu = { 2687 .pmu_enable = x86_pmu_enable, 2688 .pmu_disable = x86_pmu_disable, 2689 2690 .attr_groups = x86_pmu_attr_groups, 2691 2692 .event_init = x86_pmu_event_init, 2693 2694 .event_mapped = x86_pmu_event_mapped, 2695 .event_unmapped = x86_pmu_event_unmapped, 2696 2697 .add = x86_pmu_add, 2698 .del = x86_pmu_del, 2699 .start = x86_pmu_start, 2700 .stop = x86_pmu_stop, 2701 .read = x86_pmu_read, 2702 2703 .start_txn = x86_pmu_start_txn, 2704 .cancel_txn = x86_pmu_cancel_txn, 2705 .commit_txn = x86_pmu_commit_txn, 2706 2707 .event_idx = x86_pmu_event_idx, 2708 .sched_task = x86_pmu_sched_task, 2709 .check_period = x86_pmu_check_period, 2710 2711 .aux_output_match = x86_pmu_aux_output_match, 2712 2713 .filter = x86_pmu_filter, 2714 }; 2715 2716 void arch_perf_update_userpage(struct perf_event *event, 2717 struct perf_event_mmap_page *userpg, u64 now) 2718 { 2719 struct cyc2ns_data data; 2720 u64 offset; 2721 2722 userpg->cap_user_time = 0; 2723 userpg->cap_user_time_zero = 0; 2724 userpg->cap_user_rdpmc = 2725 !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); 2726 userpg->pmc_width = x86_pmu.cntval_bits; 2727 2728 if (!using_native_sched_clock() || !sched_clock_stable()) 2729 return; 2730 2731 cyc2ns_read_begin(&data); 2732 2733 offset = data.cyc2ns_offset + __sched_clock_offset; 2734 2735 /* 2736 * Internal timekeeping for enabled/running/stopped times 2737 * is always in the local_clock domain. 2738 */ 2739 userpg->cap_user_time = 1; 2740 userpg->time_mult = data.cyc2ns_mul; 2741 userpg->time_shift = data.cyc2ns_shift; 2742 userpg->time_offset = offset - now; 2743 2744 /* 2745 * cap_user_time_zero doesn't make sense when we're using a different 2746 * time base for the records. 2747 */ 2748 if (!event->attr.use_clockid) { 2749 userpg->cap_user_time_zero = 1; 2750 userpg->time_zero = offset; 2751 } 2752 2753 cyc2ns_read_end(); 2754 } 2755 2756 /* 2757 * Determine whether the regs were taken from an irq/exception handler rather 2758 * than from perf_arch_fetch_caller_regs(). 
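 *
 * The distinction relies on EFLAGS bit 1 (X86_EFLAGS_FIXED), which the
 * hardware always sets in a real exception frame, while (at the time of
 * writing) perf_arch_fetch_caller_regs() builds a synthetic pt_regs with
 *
 *	regs->flags = 0;
 *
 * so the bit test below is enough to tell the two apart.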
2759 */ 2760 static bool perf_hw_regs(struct pt_regs *regs) 2761 { 2762 return regs->flags & X86_EFLAGS_FIXED; 2763 } 2764 2765 void 2766 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) 2767 { 2768 struct unwind_state state; 2769 unsigned long addr; 2770 2771 if (perf_guest_state()) { 2772 /* TODO: We don't support guest os callchain now */ 2773 return; 2774 } 2775 2776 if (perf_callchain_store(entry, regs->ip)) 2777 return; 2778 2779 if (perf_hw_regs(regs)) 2780 unwind_start(&state, current, regs, NULL); 2781 else 2782 unwind_start(&state, current, NULL, (void *)regs->sp); 2783 2784 for (; !unwind_done(&state); unwind_next_frame(&state)) { 2785 addr = unwind_get_return_address(&state); 2786 if (!addr || perf_callchain_store(entry, addr)) 2787 return; 2788 } 2789 } 2790 2791 static inline int 2792 valid_user_frame(const void __user *fp, unsigned long size) 2793 { 2794 return __access_ok(fp, size); 2795 } 2796 2797 static unsigned long get_segment_base(unsigned int segment) 2798 { 2799 struct desc_struct *desc; 2800 unsigned int idx = segment >> 3; 2801 2802 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { 2803 #ifdef CONFIG_MODIFY_LDT_SYSCALL 2804 struct ldt_struct *ldt; 2805 2806 /* IRQs are off, so this synchronizes with smp_store_release */ 2807 ldt = READ_ONCE(current->active_mm->context.ldt); 2808 if (!ldt || idx >= ldt->nr_entries) 2809 return 0; 2810 2811 desc = &ldt->entries[idx]; 2812 #else 2813 return 0; 2814 #endif 2815 } else { 2816 if (idx >= GDT_ENTRIES) 2817 return 0; 2818 2819 desc = raw_cpu_ptr(gdt_page.gdt) + idx; 2820 } 2821 2822 return get_desc_base(desc); 2823 } 2824 2825 #ifdef CONFIG_UPROBES 2826 /* 2827 * Heuristic-based check if uprobe is installed at the function entry. 2828 * 2829 * Under assumption of user code being compiled with frame pointers, 2830 * `push %rbp/%ebp` is a good indicator that we indeed are. 2831 * 2832 * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. 2833 * If we get this wrong, captured stack trace might have one extra bogus 2834 * entry, but the rest of stack trace will still be meaningful. 2835 */ 2836 static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2837 { 2838 struct arch_uprobe *auprobe; 2839 2840 if (!current->utask) 2841 return false; 2842 2843 auprobe = current->utask->auprobe; 2844 if (!auprobe) 2845 return false; 2846 2847 /* push %rbp/%ebp */ 2848 if (auprobe->insn[0] == 0x55) 2849 return true; 2850 2851 /* endbr64 (64-bit only) */ 2852 if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) 2853 return true; 2854 2855 return false; 2856 } 2857 2858 #else 2859 static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2860 { 2861 return false; 2862 } 2863 #endif /* CONFIG_UPROBES */ 2864 2865 #ifdef CONFIG_IA32_EMULATION 2866 2867 #include <linux/compat.h> 2868 2869 static inline int 2870 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) 2871 { 2872 /* 32-bit process in 64-bit kernel. 
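 *
 * The walk below follows 32-bit frame pointers; the layout assumed for
 * each user frame is (see struct stack_frame_ia32):
 *
 *	u32 next_frame;		// saved %ebp of the caller
 *	u32 return_address;	// return address pushed by the call
 *
 * with every user pointer rebased by the ss/cs segment bases, since
 * compat segments need not be zero-based.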
*/ 2873 unsigned long ss_base, cs_base; 2874 struct stack_frame_ia32 frame; 2875 const struct stack_frame_ia32 __user *fp; 2876 u32 ret_addr; 2877 2878 if (user_64bit_mode(regs)) 2879 return 0; 2880 2881 cs_base = get_segment_base(regs->cs); 2882 ss_base = get_segment_base(regs->ss); 2883 2884 fp = compat_ptr(ss_base + regs->bp); 2885 pagefault_disable(); 2886 2887 /* see perf_callchain_user() below for why we do this */ 2888 if (is_uprobe_at_func_entry(regs) && 2889 !get_user(ret_addr, (const u32 __user *)regs->sp)) 2890 perf_callchain_store(entry, ret_addr); 2891 2892 while (entry->nr < entry->max_stack) { 2893 if (!valid_user_frame(fp, sizeof(frame))) 2894 break; 2895 2896 if (__get_user(frame.next_frame, &fp->next_frame)) 2897 break; 2898 if (__get_user(frame.return_address, &fp->return_address)) 2899 break; 2900 2901 perf_callchain_store(entry, cs_base + frame.return_address); 2902 fp = compat_ptr(ss_base + frame.next_frame); 2903 } 2904 pagefault_enable(); 2905 return 1; 2906 } 2907 #else 2908 static inline int 2909 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) 2910 { 2911 return 0; 2912 } 2913 #endif 2914 2915 void 2916 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) 2917 { 2918 struct stack_frame frame; 2919 const struct stack_frame __user *fp; 2920 unsigned long ret_addr; 2921 2922 if (perf_guest_state()) { 2923 /* TODO: We don't support guest os callchain now */ 2924 return; 2925 } 2926 2927 /* 2928 * We don't know what to do with VM86 stacks.. ignore them for now. 2929 */ 2930 if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) 2931 return; 2932 2933 fp = (void __user *)regs->bp; 2934 2935 perf_callchain_store(entry, regs->ip); 2936 2937 if (!nmi_uaccess_okay()) 2938 return; 2939 2940 if (perf_callchain_user32(regs, entry)) 2941 return; 2942 2943 pagefault_disable(); 2944 2945 /* 2946 * If we are called from uprobe handler, and we are indeed at the very 2947 * entry to user function (which is normally a `push %rbp` instruction, 2948 * under assumption of application being compiled with frame pointers), 2949 * we should read return address from *regs->sp before proceeding 2950 * to follow frame pointers, otherwise we'll skip immediate caller 2951 * as %rbp is not yet setup. 2952 */ 2953 if (is_uprobe_at_func_entry(regs) && 2954 !get_user(ret_addr, (const unsigned long __user *)regs->sp)) 2955 perf_callchain_store(entry, ret_addr); 2956 2957 while (entry->nr < entry->max_stack) { 2958 if (!valid_user_frame(fp, sizeof(frame))) 2959 break; 2960 2961 if (__get_user(frame.next_frame, &fp->next_frame)) 2962 break; 2963 if (__get_user(frame.return_address, &fp->return_address)) 2964 break; 2965 2966 perf_callchain_store(entry, frame.return_address); 2967 fp = (void __user *)frame.next_frame; 2968 } 2969 pagefault_enable(); 2970 } 2971 2972 /* 2973 * Deal with code segment offsets for the various execution modes: 2974 * 2975 * VM86 - the good olde 16 bit days, where the linear address is 2976 * 20 bits and we use regs->ip + 0x10 * regs->cs. 2977 * 2978 * IA32 - Where we need to look at GDT/LDT segment descriptor tables 2979 * to figure out what the 32bit base address is. 2980 * 2981 * X32 - has TIF_X32 set, but is running in x86_64 2982 * 2983 * X86_64 - CS,DS,SS,ES are all zero based. 2984 */ 2985 static unsigned long code_segment_base(struct pt_regs *regs) 2986 { 2987 /* 2988 * For IA32 we look at the GDT/LDT segment base to convert the 2989 * effective IP to a linear address. 
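 *
 * Worked example (made-up selector values): a VM86 task with cs = 0x2000
 * and ip = 0x0100 yields a base of 0x10 * 0x2000 = 0x20000, so
 * perf_arch_instruction_pointer() reports 0x20100; a flat 64-bit task
 * gets a base of 0 and the IP is used unchanged.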
2990 */ 2991 2992 #ifdef CONFIG_X86_32 2993 /* 2994 * If we are in VM86 mode, add the segment offset to convert to a 2995 * linear address. 2996 */ 2997 if (regs->flags & X86_VM_MASK) 2998 return 0x10 * regs->cs; 2999 3000 if (user_mode(regs) && regs->cs != __USER_CS) 3001 return get_segment_base(regs->cs); 3002 #else 3003 if (user_mode(regs) && !user_64bit_mode(regs) && 3004 regs->cs != __USER32_CS) 3005 return get_segment_base(regs->cs); 3006 #endif 3007 return 0; 3008 } 3009 3010 unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) 3011 { 3012 return regs->ip + code_segment_base(regs); 3013 } 3014 3015 static unsigned long common_misc_flags(struct pt_regs *regs) 3016 { 3017 if (regs->flags & PERF_EFLAGS_EXACT) 3018 return PERF_RECORD_MISC_EXACT_IP; 3019 3020 return 0; 3021 } 3022 3023 static unsigned long guest_misc_flags(struct pt_regs *regs) 3024 { 3025 unsigned long guest_state = perf_guest_state(); 3026 3027 if (!(guest_state & PERF_GUEST_ACTIVE)) 3028 return 0; 3029 3030 if (guest_state & PERF_GUEST_USER) 3031 return PERF_RECORD_MISC_GUEST_USER; 3032 else 3033 return PERF_RECORD_MISC_GUEST_KERNEL; 3034 3035 } 3036 3037 static unsigned long host_misc_flags(struct pt_regs *regs) 3038 { 3039 if (user_mode(regs)) 3040 return PERF_RECORD_MISC_USER; 3041 else 3042 return PERF_RECORD_MISC_KERNEL; 3043 } 3044 3045 unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) 3046 { 3047 unsigned long flags = common_misc_flags(regs); 3048 3049 flags |= guest_misc_flags(regs); 3050 3051 return flags; 3052 } 3053 3054 unsigned long perf_arch_misc_flags(struct pt_regs *regs) 3055 { 3056 unsigned long flags = common_misc_flags(regs); 3057 3058 flags |= host_misc_flags(regs); 3059 3060 return flags; 3061 } 3062 3063 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) 3064 { 3065 /* This API doesn't currently support enumerating hybrid PMUs. */ 3066 if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || 3067 !x86_pmu_initialized()) { 3068 memset(cap, 0, sizeof(*cap)); 3069 return; 3070 } 3071 3072 /* 3073 * Note, hybrid CPU models get tracked as having hybrid PMUs even when 3074 * all E-cores are disabled via BIOS. When E-cores are disabled, the 3075 * base PMU holds the correct number of counters for P-cores. 3076 */ 3077 cap->version = x86_pmu.version; 3078 cap->num_counters_gp = x86_pmu_num_counters(NULL); 3079 cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); 3080 cap->bit_width_gp = x86_pmu.cntval_bits; 3081 cap->bit_width_fixed = x86_pmu.cntval_bits; 3082 cap->events_mask = (unsigned int)x86_pmu.events_maskl; 3083 cap->events_mask_len = x86_pmu.events_mask_len; 3084 cap->pebs_ept = x86_pmu.pebs_ept; 3085 } 3086 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); 3087 3088 u64 perf_get_hw_event_config(int hw_event) 3089 { 3090 int max = x86_pmu.max_events; 3091 3092 if (hw_event < max) 3093 return x86_pmu.event_map(array_index_nospec(hw_event, max)); 3094 3095 return 0; 3096 } 3097 EXPORT_SYMBOL_GPL(perf_get_hw_event_config); 3098
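/*
 * Example (illustrative sketch only, not part of this file's API): an
 * in-kernel user such as KVM's vPMU code can combine the two exports
 * above to size its emulated PMU and translate generic event ids:
 *
 *	struct x86_pmu_capability cap;
 *	u64 raw;
 *
 *	perf_get_x86_pmu_capability(&cap);
 *	if (cap.version)
 *		raw = perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
 *
 * perf_get_hw_event_config() returns 0 for ids that are out of range or
 * not implemented by the active PMU, matching the undefined-event check
 * used by is_visible() above.
 */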