1 /* 2 * Performance events x86 architecture code 3 * 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 6 * Copyright (C) 2009 Jaswinder Singh Rajput 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 10 * Copyright (C) 2009 Google, Inc., Stephane Eranian 11 * 12 * For licencing details see kernel-base/COPYING 13 */ 14 15 #include <linux/perf_event.h> 16 #include <linux/capability.h> 17 #include <linux/notifier.h> 18 #include <linux/hardirq.h> 19 #include <linux/kprobes.h> 20 #include <linux/export.h> 21 #include <linux/init.h> 22 #include <linux/kdebug.h> 23 #include <linux/sched/mm.h> 24 #include <linux/sched/clock.h> 25 #include <linux/uaccess.h> 26 #include <linux/slab.h> 27 #include <linux/cpu.h> 28 #include <linux/bitops.h> 29 #include <linux/device.h> 30 #include <linux/nospec.h> 31 #include <linux/static_call.h> 32 33 #include <asm/apic.h> 34 #include <asm/stacktrace.h> 35 #include <asm/nmi.h> 36 #include <asm/smp.h> 37 #include <asm/alternative.h> 38 #include <asm/mmu_context.h> 39 #include <asm/tlbflush.h> 40 #include <asm/timer.h> 41 #include <asm/desc.h> 42 #include <asm/ldt.h> 43 #include <asm/unwind.h> 44 45 #include "perf_event.h" 46 47 struct x86_pmu x86_pmu __read_mostly; 48 49 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 50 .enabled = 1, 51 }; 52 53 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); 54 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); 55 56 /* 57 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined 58 * from just a typename, as opposed to an actual function. 59 */ 60 DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); 61 DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); 62 DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); 63 DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); 64 DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); 65 66 DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); 67 DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); 68 DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); 69 70 DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); 71 DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); 72 DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); 73 74 DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); 75 DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); 76 DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); 77 78 DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); 79 DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); 80 81 DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); 82 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); 83 84 u64 __read_mostly hw_cache_event_ids 85 [PERF_COUNT_HW_CACHE_MAX] 86 [PERF_COUNT_HW_CACHE_OP_MAX] 87 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 88 u64 __read_mostly hw_cache_extra_regs 89 [PERF_COUNT_HW_CACHE_MAX] 90 [PERF_COUNT_HW_CACHE_OP_MAX] 91 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 92 93 /* 94 * Propagate event elapsed time into the generic event. 95 * Can only be executed on the CPU where the event is active. 96 * Returns the delta events processed. 
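 *
 * A worked example of the width handling done below, assuming the common
 * case of 48-bit counters (cntval_bits == 48, so shift == 16): if the
 * counter wrapped from prev == 0xffffffffffff to new == 0x5, then
 *
 *	(new << 16) - (prev << 16) == 0x60000	(mod 2^64)
 *	0x60000 >> 16 == 6
 *
 * i.e. the shifts discard whatever the hardware left in the upper bits and
 * the 64-bit subtraction handles the wrap, yielding the 6 counts that
 * actually elapsed.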
97 */ 98 u64 x86_perf_event_update(struct perf_event *event) 99 { 100 struct hw_perf_event *hwc = &event->hw; 101 int shift = 64 - x86_pmu.cntval_bits; 102 u64 prev_raw_count, new_raw_count; 103 u64 delta; 104 105 if (unlikely(!hwc->event_base)) 106 return 0; 107 108 if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) 109 return x86_pmu.update_topdown_event(event); 110 111 /* 112 * Careful: an NMI might modify the previous event value. 113 * 114 * Our tactic to handle this is to first atomically read and 115 * exchange a new raw count - then add that new-prev delta 116 * count to the generic event atomically: 117 */ 118 again: 119 prev_raw_count = local64_read(&hwc->prev_count); 120 rdpmcl(hwc->event_base_rdpmc, new_raw_count); 121 122 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 123 new_raw_count) != prev_raw_count) 124 goto again; 125 126 /* 127 * Now we have the new raw value and have updated the prev 128 * timestamp already. We can now calculate the elapsed delta 129 * (event-)time and add that to the generic event. 130 * 131 * Careful, not all hw sign-extends above the physical width 132 * of the count. 133 */ 134 delta = (new_raw_count << shift) - (prev_raw_count << shift); 135 delta >>= shift; 136 137 local64_add(delta, &event->count); 138 local64_sub(delta, &hwc->period_left); 139 140 return new_raw_count; 141 } 142 143 /* 144 * Find and validate any extra registers to set up. 145 */ 146 static int x86_pmu_extra_regs(u64 config, struct perf_event *event) 147 { 148 struct hw_perf_event_extra *reg; 149 struct extra_reg *er; 150 151 reg = &event->hw.extra_reg; 152 153 if (!x86_pmu.extra_regs) 154 return 0; 155 156 for (er = x86_pmu.extra_regs; er->msr; er++) { 157 if (er->event != (config & er->config_mask)) 158 continue; 159 if (event->attr.config1 & ~er->valid_mask) 160 return -EINVAL; 161 /* Check if the extra msrs can be safely accessed*/ 162 if (!er->extra_msr_access) 163 return -ENXIO; 164 165 reg->idx = er->idx; 166 reg->config = event->attr.config1; 167 reg->reg = er->msr; 168 break; 169 } 170 return 0; 171 } 172 173 static atomic_t active_events; 174 static atomic_t pmc_refcount; 175 static DEFINE_MUTEX(pmc_reserve_mutex); 176 177 #ifdef CONFIG_X86_LOCAL_APIC 178 179 static bool reserve_pmc_hardware(void) 180 { 181 int i; 182 183 for (i = 0; i < x86_pmu.num_counters; i++) { 184 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) 185 goto perfctr_fail; 186 } 187 188 for (i = 0; i < x86_pmu.num_counters; i++) { 189 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) 190 goto eventsel_fail; 191 } 192 193 return true; 194 195 eventsel_fail: 196 for (i--; i >= 0; i--) 197 release_evntsel_nmi(x86_pmu_config_addr(i)); 198 199 i = x86_pmu.num_counters; 200 201 perfctr_fail: 202 for (i--; i >= 0; i--) 203 release_perfctr_nmi(x86_pmu_event_addr(i)); 204 205 return false; 206 } 207 208 static void release_pmc_hardware(void) 209 { 210 int i; 211 212 for (i = 0; i < x86_pmu.num_counters; i++) { 213 release_perfctr_nmi(x86_pmu_event_addr(i)); 214 release_evntsel_nmi(x86_pmu_config_addr(i)); 215 } 216 } 217 218 #else 219 220 static bool reserve_pmc_hardware(void) { return true; } 221 static void release_pmc_hardware(void) {} 222 223 #endif 224 225 static bool check_hw_exists(void) 226 { 227 u64 val, val_fail = -1, val_new= ~0; 228 int i, reg, reg_fail = -1, ret = 0; 229 int bios_fail = 0; 230 int reg_safe = -1; 231 232 /* 233 * Check to see if the BIOS enabled any of the counters, if so 234 * complain and bail. 
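 *
 * "Enabled" here means ARCH_PERFMON_EVENTSEL_ENABLE is already set in one
 * of the EVNTSEL MSRs, or a fixed counter has a non-zero enable field in
 * MSR_ARCH_PERFMON_FIXED_CTR_CTRL, as checked below.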
235 */ 236 for (i = 0; i < x86_pmu.num_counters; i++) { 237 reg = x86_pmu_config_addr(i); 238 ret = rdmsrl_safe(reg, &val); 239 if (ret) 240 goto msr_fail; 241 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { 242 bios_fail = 1; 243 val_fail = val; 244 reg_fail = reg; 245 } else { 246 reg_safe = i; 247 } 248 } 249 250 if (x86_pmu.num_counters_fixed) { 251 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 252 ret = rdmsrl_safe(reg, &val); 253 if (ret) 254 goto msr_fail; 255 for (i = 0; i < x86_pmu.num_counters_fixed; i++) { 256 if (val & (0x03 << i*4)) { 257 bios_fail = 1; 258 val_fail = val; 259 reg_fail = reg; 260 } 261 } 262 } 263 264 /* 265 * If all the counters are enabled, the below test will always 266 * fail. The tools will also become useless in this scenario. 267 * Just fail and disable the hardware counters. 268 */ 269 270 if (reg_safe == -1) { 271 reg = reg_safe; 272 goto msr_fail; 273 } 274 275 /* 276 * Read the current value, change it and read it back to see if it 277 * matches, this is needed to detect certain hardware emulators 278 * (qemu/kvm) that don't trap on the MSR access and always return 0s. 279 */ 280 reg = x86_pmu_event_addr(reg_safe); 281 if (rdmsrl_safe(reg, &val)) 282 goto msr_fail; 283 val ^= 0xffffUL; 284 ret = wrmsrl_safe(reg, val); 285 ret |= rdmsrl_safe(reg, &val_new); 286 if (ret || val != val_new) 287 goto msr_fail; 288 289 /* 290 * We still allow the PMU driver to operate: 291 */ 292 if (bios_fail) { 293 pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); 294 pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", 295 reg_fail, val_fail); 296 } 297 298 return true; 299 300 msr_fail: 301 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { 302 pr_cont("PMU not available due to virtualization, using software events only.\n"); 303 } else { 304 pr_cont("Broken PMU hardware detected, using software events only.\n"); 305 pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", 306 reg, val_new); 307 } 308 309 return false; 310 } 311 312 static void hw_perf_event_destroy(struct perf_event *event) 313 { 314 x86_release_hardware(); 315 atomic_dec(&active_events); 316 } 317 318 void hw_perf_lbr_event_destroy(struct perf_event *event) 319 { 320 hw_perf_event_destroy(event); 321 322 /* undo the lbr/bts event accounting */ 323 x86_del_exclusive(x86_lbr_exclusive_lbr); 324 } 325 326 static inline int x86_pmu_initialized(void) 327 { 328 return x86_pmu.handle_irq != NULL; 329 } 330 331 static inline int 332 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) 333 { 334 struct perf_event_attr *attr = &event->attr; 335 unsigned int cache_type, cache_op, cache_result; 336 u64 config, val; 337 338 config = attr->config; 339 340 cache_type = (config >> 0) & 0xff; 341 if (cache_type >= PERF_COUNT_HW_CACHE_MAX) 342 return -EINVAL; 343 cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); 344 345 cache_op = (config >> 8) & 0xff; 346 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) 347 return -EINVAL; 348 cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); 349 350 cache_result = (config >> 16) & 0xff; 351 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) 352 return -EINVAL; 353 cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); 354 355 val = hw_cache_event_ids[cache_type][cache_op][cache_result]; 356 357 if (val == 0) 358 return -ENOENT; 359 360 if (val == -1) 361 return -EINVAL; 362 363 hwc->config |= val; 364 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; 365 return 
x86_pmu_extra_regs(val, event); 366 } 367 368 int x86_reserve_hardware(void) 369 { 370 int err = 0; 371 372 if (!atomic_inc_not_zero(&pmc_refcount)) { 373 mutex_lock(&pmc_reserve_mutex); 374 if (atomic_read(&pmc_refcount) == 0) { 375 if (!reserve_pmc_hardware()) 376 err = -EBUSY; 377 else 378 reserve_ds_buffers(); 379 } 380 if (!err) 381 atomic_inc(&pmc_refcount); 382 mutex_unlock(&pmc_reserve_mutex); 383 } 384 385 return err; 386 } 387 388 void x86_release_hardware(void) 389 { 390 if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { 391 release_pmc_hardware(); 392 release_ds_buffers(); 393 release_lbr_buffers(); 394 mutex_unlock(&pmc_reserve_mutex); 395 } 396 } 397 398 /* 399 * Check if we can create event of a certain type (that no conflicting events 400 * are present). 401 */ 402 int x86_add_exclusive(unsigned int what) 403 { 404 int i; 405 406 /* 407 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. 408 * LBR and BTS are still mutually exclusive. 409 */ 410 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) 411 goto out; 412 413 if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { 414 mutex_lock(&pmc_reserve_mutex); 415 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { 416 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) 417 goto fail_unlock; 418 } 419 atomic_inc(&x86_pmu.lbr_exclusive[what]); 420 mutex_unlock(&pmc_reserve_mutex); 421 } 422 423 out: 424 atomic_inc(&active_events); 425 return 0; 426 427 fail_unlock: 428 mutex_unlock(&pmc_reserve_mutex); 429 return -EBUSY; 430 } 431 432 void x86_del_exclusive(unsigned int what) 433 { 434 atomic_dec(&active_events); 435 436 /* 437 * See the comment in x86_add_exclusive(). 438 */ 439 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) 440 return; 441 442 atomic_dec(&x86_pmu.lbr_exclusive[what]); 443 } 444 445 int x86_setup_perfctr(struct perf_event *event) 446 { 447 struct perf_event_attr *attr = &event->attr; 448 struct hw_perf_event *hwc = &event->hw; 449 u64 config; 450 451 if (!is_sampling_event(event)) { 452 hwc->sample_period = x86_pmu.max_period; 453 hwc->last_period = hwc->sample_period; 454 local64_set(&hwc->period_left, hwc->sample_period); 455 } 456 457 if (attr->type == PERF_TYPE_RAW) 458 return x86_pmu_extra_regs(event->attr.config, event); 459 460 if (attr->type == PERF_TYPE_HW_CACHE) 461 return set_ext_hw_attr(hwc, event); 462 463 if (attr->config >= x86_pmu.max_events) 464 return -EINVAL; 465 466 attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); 467 468 /* 469 * The generic map: 470 */ 471 config = x86_pmu.event_map(attr->config); 472 473 if (config == 0) 474 return -ENOENT; 475 476 if (config == -1LL) 477 return -EINVAL; 478 479 hwc->config |= config; 480 481 return 0; 482 } 483 484 /* 485 * check that branch_sample_type is compatible with 486 * settings needed for precise_ip > 1 which implies 487 * using the LBR to capture ALL taken branches at the 488 * priv levels of the measurement 489 */ 490 static inline int precise_br_compat(struct perf_event *event) 491 { 492 u64 m = event->attr.branch_sample_type; 493 u64 b = 0; 494 495 /* must capture all branches */ 496 if (!(m & PERF_SAMPLE_BRANCH_ANY)) 497 return 0; 498 499 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; 500 501 if (!event->attr.exclude_user) 502 b |= PERF_SAMPLE_BRANCH_USER; 503 504 if (!event->attr.exclude_kernel) 505 b |= PERF_SAMPLE_BRANCH_KERNEL; 506 507 /* 508 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 509 */ 510 511 return m == 
b; 512 } 513 514 int x86_pmu_max_precise(void) 515 { 516 int precise = 0; 517 518 /* Support for constant skid */ 519 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 520 precise++; 521 522 /* Support for IP fixup */ 523 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) 524 precise++; 525 526 if (x86_pmu.pebs_prec_dist) 527 precise++; 528 } 529 return precise; 530 } 531 532 int x86_pmu_hw_config(struct perf_event *event) 533 { 534 if (event->attr.precise_ip) { 535 int precise = x86_pmu_max_precise(); 536 537 if (event->attr.precise_ip > precise) 538 return -EOPNOTSUPP; 539 540 /* There's no sense in having PEBS for non sampling events: */ 541 if (!is_sampling_event(event)) 542 return -EINVAL; 543 } 544 /* 545 * check that PEBS LBR correction does not conflict with 546 * whatever the user is asking with attr->branch_sample_type 547 */ 548 if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { 549 u64 *br_type = &event->attr.branch_sample_type; 550 551 if (has_branch_stack(event)) { 552 if (!precise_br_compat(event)) 553 return -EOPNOTSUPP; 554 555 /* branch_sample_type is compatible */ 556 557 } else { 558 /* 559 * user did not specify branch_sample_type 560 * 561 * For PEBS fixups, we capture all 562 * the branches at the priv level of the 563 * event. 564 */ 565 *br_type = PERF_SAMPLE_BRANCH_ANY; 566 567 if (!event->attr.exclude_user) 568 *br_type |= PERF_SAMPLE_BRANCH_USER; 569 570 if (!event->attr.exclude_kernel) 571 *br_type |= PERF_SAMPLE_BRANCH_KERNEL; 572 } 573 } 574 575 if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) 576 event->attach_state |= PERF_ATTACH_TASK_DATA; 577 578 /* 579 * Generate PMC IRQs: 580 * (keep 'enabled' bit clear for now) 581 */ 582 event->hw.config = ARCH_PERFMON_EVENTSEL_INT; 583 584 /* 585 * Count user and OS events unless requested not to 586 */ 587 if (!event->attr.exclude_user) 588 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; 589 if (!event->attr.exclude_kernel) 590 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; 591 592 if (event->attr.type == PERF_TYPE_RAW) 593 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; 594 595 if (event->attr.sample_period && x86_pmu.limit_period) { 596 if (x86_pmu.limit_period(event, event->attr.sample_period) > 597 event->attr.sample_period) 598 return -EINVAL; 599 } 600 601 /* sample_regs_user never support XMM registers */ 602 if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) 603 return -EINVAL; 604 /* 605 * Besides the general purpose registers, XMM registers may 606 * be collected in PEBS on some platforms, e.g. 
Icelake 607 */ 608 if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { 609 if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) 610 return -EINVAL; 611 612 if (!event->attr.precise_ip) 613 return -EINVAL; 614 } 615 616 return x86_setup_perfctr(event); 617 } 618 619 /* 620 * Setup the hardware configuration for a given attr_type 621 */ 622 static int __x86_pmu_event_init(struct perf_event *event) 623 { 624 int err; 625 626 if (!x86_pmu_initialized()) 627 return -ENODEV; 628 629 err = x86_reserve_hardware(); 630 if (err) 631 return err; 632 633 atomic_inc(&active_events); 634 event->destroy = hw_perf_event_destroy; 635 636 event->hw.idx = -1; 637 event->hw.last_cpu = -1; 638 event->hw.last_tag = ~0ULL; 639 640 /* mark unused */ 641 event->hw.extra_reg.idx = EXTRA_REG_NONE; 642 event->hw.branch_reg.idx = EXTRA_REG_NONE; 643 644 return x86_pmu.hw_config(event); 645 } 646 647 void x86_pmu_disable_all(void) 648 { 649 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 650 int idx; 651 652 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 653 struct hw_perf_event *hwc = &cpuc->events[idx]->hw; 654 u64 val; 655 656 if (!test_bit(idx, cpuc->active_mask)) 657 continue; 658 rdmsrl(x86_pmu_config_addr(idx), val); 659 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 660 continue; 661 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 662 wrmsrl(x86_pmu_config_addr(idx), val); 663 if (is_counter_pair(hwc)) 664 wrmsrl(x86_pmu_config_addr(idx + 1), 0); 665 } 666 } 667 668 /* 669 * There may be PMI landing after enabled=0. The PMI hitting could be before or 670 * after disable_all. 671 * 672 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. 673 * It will not be re-enabled in the NMI handler again, because enabled=0. After 674 * handling the NMI, disable_all will be called, which will not change the 675 * state either. If PMI hits after disable_all, the PMU is already disabled 676 * before entering NMI handler. The NMI handler will not change the state 677 * either. 678 * 679 * So either situation is harmless. 680 */ 681 static void x86_pmu_disable(struct pmu *pmu) 682 { 683 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 684 685 if (!x86_pmu_initialized()) 686 return; 687 688 if (!cpuc->enabled) 689 return; 690 691 cpuc->n_added = 0; 692 cpuc->enabled = 0; 693 barrier(); 694 695 static_call(x86_pmu_disable_all)(); 696 } 697 698 void x86_pmu_enable_all(int added) 699 { 700 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 701 int idx; 702 703 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 704 struct hw_perf_event *hwc = &cpuc->events[idx]->hw; 705 706 if (!test_bit(idx, cpuc->active_mask)) 707 continue; 708 709 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); 710 } 711 } 712 713 static struct pmu pmu; 714 715 static inline int is_x86_event(struct perf_event *event) 716 { 717 return event->pmu == &pmu; 718 } 719 720 struct pmu *x86_get_pmu(void) 721 { 722 return &pmu; 723 } 724 /* 725 * Event scheduler state: 726 * 727 * Assign events iterating over all events and counters, beginning 728 * with events with least weights first. Keep the current iterator 729 * state in struct sched_state. 730 */ 731 struct sched_state { 732 int weight; 733 int event; /* event index */ 734 int counter; /* counter index */ 735 int unassigned; /* number of events to be assigned left */ 736 int nr_gp; /* number of GP counters used */ 737 u64 used; 738 }; 739 740 /* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ 741 #define SCHED_STATES_MAX 2 742 743 struct perf_sched { 744 int max_weight; 745 int max_events; 746 int max_gp; 747 int saved_states; 748 struct event_constraint **constraints; 749 struct sched_state state; 750 struct sched_state saved[SCHED_STATES_MAX]; 751 }; 752 753 /* 754 * Initialize interator that runs through all events and counters. 755 */ 756 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, 757 int num, int wmin, int wmax, int gpmax) 758 { 759 int idx; 760 761 memset(sched, 0, sizeof(*sched)); 762 sched->max_events = num; 763 sched->max_weight = wmax; 764 sched->max_gp = gpmax; 765 sched->constraints = constraints; 766 767 for (idx = 0; idx < num; idx++) { 768 if (constraints[idx]->weight == wmin) 769 break; 770 } 771 772 sched->state.event = idx; /* start with min weight */ 773 sched->state.weight = wmin; 774 sched->state.unassigned = num; 775 } 776 777 static void perf_sched_save_state(struct perf_sched *sched) 778 { 779 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) 780 return; 781 782 sched->saved[sched->saved_states] = sched->state; 783 sched->saved_states++; 784 } 785 786 static bool perf_sched_restore_state(struct perf_sched *sched) 787 { 788 if (!sched->saved_states) 789 return false; 790 791 sched->saved_states--; 792 sched->state = sched->saved[sched->saved_states]; 793 794 /* this assignment didn't work out */ 795 /* XXX broken vs EVENT_PAIR */ 796 sched->state.used &= ~BIT_ULL(sched->state.counter); 797 798 /* try the next one */ 799 sched->state.counter++; 800 801 return true; 802 } 803 804 /* 805 * Select a counter for the current event to schedule. Return true on 806 * success. 807 */ 808 static bool __perf_sched_find_counter(struct perf_sched *sched) 809 { 810 struct event_constraint *c; 811 int idx; 812 813 if (!sched->state.unassigned) 814 return false; 815 816 if (sched->state.event >= sched->max_events) 817 return false; 818 819 c = sched->constraints[sched->state.event]; 820 /* Prefer fixed purpose counters */ 821 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { 822 idx = INTEL_PMC_IDX_FIXED; 823 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { 824 u64 mask = BIT_ULL(idx); 825 826 if (sched->state.used & mask) 827 continue; 828 829 sched->state.used |= mask; 830 goto done; 831 } 832 } 833 834 /* Grab the first unused counter starting with idx */ 835 idx = sched->state.counter; 836 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { 837 u64 mask = BIT_ULL(idx); 838 839 if (c->flags & PERF_X86_EVENT_PAIR) 840 mask |= mask << 1; 841 842 if (sched->state.used & mask) 843 continue; 844 845 if (sched->state.nr_gp++ >= sched->max_gp) 846 return false; 847 848 sched->state.used |= mask; 849 goto done; 850 } 851 852 return false; 853 854 done: 855 sched->state.counter = idx; 856 857 if (c->overlap) 858 perf_sched_save_state(sched); 859 860 return true; 861 } 862 863 static bool perf_sched_find_counter(struct perf_sched *sched) 864 { 865 while (!__perf_sched_find_counter(sched)) { 866 if (!perf_sched_restore_state(sched)) 867 return false; 868 } 869 870 return true; 871 } 872 873 /* 874 * Go through all unassigned events and find the next one to schedule. 875 * Take events with the least weight first. Return true on success. 
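 *
 * For example, an event constrained to counter 0 only (weight 1) is placed
 * before an event that may use any of four generic counters (weight 4), so
 * the inflexible event cannot be starved by the flexible one.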
876 */ 877 static bool perf_sched_next_event(struct perf_sched *sched) 878 { 879 struct event_constraint *c; 880 881 if (!sched->state.unassigned || !--sched->state.unassigned) 882 return false; 883 884 do { 885 /* next event */ 886 sched->state.event++; 887 if (sched->state.event >= sched->max_events) { 888 /* next weight */ 889 sched->state.event = 0; 890 sched->state.weight++; 891 if (sched->state.weight > sched->max_weight) 892 return false; 893 } 894 c = sched->constraints[sched->state.event]; 895 } while (c->weight != sched->state.weight); 896 897 sched->state.counter = 0; /* start with first counter */ 898 899 return true; 900 } 901 902 /* 903 * Assign a counter for each event. 904 */ 905 int perf_assign_events(struct event_constraint **constraints, int n, 906 int wmin, int wmax, int gpmax, int *assign) 907 { 908 struct perf_sched sched; 909 910 perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); 911 912 do { 913 if (!perf_sched_find_counter(&sched)) 914 break; /* failed */ 915 if (assign) 916 assign[sched.state.event] = sched.state.counter; 917 } while (perf_sched_next_event(&sched)); 918 919 return sched.state.unassigned; 920 } 921 EXPORT_SYMBOL_GPL(perf_assign_events); 922 923 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) 924 { 925 struct event_constraint *c; 926 struct perf_event *e; 927 int n0, i, wmin, wmax, unsched = 0; 928 struct hw_perf_event *hwc; 929 u64 used_mask = 0; 930 931 /* 932 * Compute the number of events already present; see x86_pmu_add(), 933 * validate_group() and x86_pmu_commit_txn(). For the former two 934 * cpuc->n_events hasn't been updated yet, while for the latter 935 * cpuc->n_txn contains the number of events added in the current 936 * transaction. 937 */ 938 n0 = cpuc->n_events; 939 if (cpuc->txn_flags & PERF_PMU_TXN_ADD) 940 n0 -= cpuc->n_txn; 941 942 static_call_cond(x86_pmu_start_scheduling)(cpuc); 943 944 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { 945 c = cpuc->event_constraint[i]; 946 947 /* 948 * Previously scheduled events should have a cached constraint, 949 * while new events should not have one. 950 */ 951 WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); 952 953 /* 954 * Request constraints for new events; or for those events that 955 * have a dynamic constraint -- for those the constraint can 956 * change due to external factors (sibling state, allow_tfa). 957 */ 958 if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { 959 c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); 960 cpuc->event_constraint[i] = c; 961 } 962 963 wmin = min(wmin, c->weight); 964 wmax = max(wmax, c->weight); 965 } 966 967 /* 968 * fastpath, try to reuse previous register 969 */ 970 for (i = 0; i < n; i++) { 971 u64 mask; 972 973 hwc = &cpuc->event_list[i]->hw; 974 c = cpuc->event_constraint[i]; 975 976 /* never assigned */ 977 if (hwc->idx == -1) 978 break; 979 980 /* constraint still honored */ 981 if (!test_bit(hwc->idx, c->idxmsk)) 982 break; 983 984 mask = BIT_ULL(hwc->idx); 985 if (is_counter_pair(hwc)) 986 mask |= mask << 1; 987 988 /* not already used */ 989 if (used_mask & mask) 990 break; 991 992 used_mask |= mask; 993 994 if (assign) 995 assign[i] = hwc->idx; 996 } 997 998 /* slow path */ 999 if (i != n) { 1000 int gpmax = x86_pmu.num_counters; 1001 1002 /* 1003 * Do not allow scheduling of more than half the available 1004 * generic counters. 
1005 * 1006 * This helps avoid counter starvation of sibling thread by 1007 * ensuring at most half the counters cannot be in exclusive 1008 * mode. There is no designated counters for the limits. Any 1009 * N/2 counters can be used. This helps with events with 1010 * specific counter constraints. 1011 */ 1012 if (is_ht_workaround_enabled() && !cpuc->is_fake && 1013 READ_ONCE(cpuc->excl_cntrs->exclusive_present)) 1014 gpmax /= 2; 1015 1016 /* 1017 * Reduce the amount of available counters to allow fitting 1018 * the extra Merge events needed by large increment events. 1019 */ 1020 if (x86_pmu.flags & PMU_FL_PAIR) { 1021 gpmax = x86_pmu.num_counters - cpuc->n_pair; 1022 WARN_ON(gpmax <= 0); 1023 } 1024 1025 unsched = perf_assign_events(cpuc->event_constraint, n, wmin, 1026 wmax, gpmax, assign); 1027 } 1028 1029 /* 1030 * In case of success (unsched = 0), mark events as committed, 1031 * so we do not put_constraint() in case new events are added 1032 * and fail to be scheduled 1033 * 1034 * We invoke the lower level commit callback to lock the resource 1035 * 1036 * We do not need to do all of this in case we are called to 1037 * validate an event group (assign == NULL) 1038 */ 1039 if (!unsched && assign) { 1040 for (i = 0; i < n; i++) { 1041 e = cpuc->event_list[i]; 1042 static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); 1043 } 1044 } else { 1045 for (i = n0; i < n; i++) { 1046 e = cpuc->event_list[i]; 1047 1048 /* 1049 * release events that failed scheduling 1050 */ 1051 static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); 1052 1053 cpuc->event_constraint[i] = NULL; 1054 } 1055 } 1056 1057 static_call_cond(x86_pmu_stop_scheduling)(cpuc); 1058 1059 return unsched ? -EINVAL : 0; 1060 } 1061 1062 static int add_nr_metric_event(struct cpu_hw_events *cpuc, 1063 struct perf_event *event) 1064 { 1065 if (is_metric_event(event)) { 1066 if (cpuc->n_metric == INTEL_TD_METRIC_NUM) 1067 return -EINVAL; 1068 cpuc->n_metric++; 1069 cpuc->n_txn_metric++; 1070 } 1071 1072 return 0; 1073 } 1074 1075 static void del_nr_metric_event(struct cpu_hw_events *cpuc, 1076 struct perf_event *event) 1077 { 1078 if (is_metric_event(event)) 1079 cpuc->n_metric--; 1080 } 1081 1082 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, 1083 int max_count, int n) 1084 { 1085 1086 if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) 1087 return -EINVAL; 1088 1089 if (n >= max_count + cpuc->n_metric) 1090 return -EINVAL; 1091 1092 cpuc->event_list[n] = event; 1093 if (is_counter_pair(&event->hw)) { 1094 cpuc->n_pair++; 1095 cpuc->n_txn_pair++; 1096 } 1097 1098 return 0; 1099 } 1100 1101 /* 1102 * dogrp: true if must collect siblings events (group) 1103 * returns total number of events and error code 1104 */ 1105 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) 1106 { 1107 struct perf_event *event; 1108 int n, max_count; 1109 1110 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; 1111 1112 /* current number of events already accepted */ 1113 n = cpuc->n_events; 1114 if (!cpuc->n_events) 1115 cpuc->pebs_output = 0; 1116 1117 if (!cpuc->is_fake && leader->attr.precise_ip) { 1118 /* 1119 * For PEBS->PT, if !aux_event, the group leader (PT) went 1120 * away, the group was broken down and this singleton event 1121 * can't schedule any more. 
1122 */ 1123 if (is_pebs_pt(leader) && !leader->aux_event) 1124 return -EINVAL; 1125 1126 /* 1127 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS 1128 */ 1129 if (cpuc->pebs_output && 1130 cpuc->pebs_output != is_pebs_pt(leader) + 1) 1131 return -EINVAL; 1132 1133 cpuc->pebs_output = is_pebs_pt(leader) + 1; 1134 } 1135 1136 if (is_x86_event(leader)) { 1137 if (collect_event(cpuc, leader, max_count, n)) 1138 return -EINVAL; 1139 n++; 1140 } 1141 1142 if (!dogrp) 1143 return n; 1144 1145 for_each_sibling_event(event, leader) { 1146 if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) 1147 continue; 1148 1149 if (collect_event(cpuc, event, max_count, n)) 1150 return -EINVAL; 1151 1152 n++; 1153 } 1154 return n; 1155 } 1156 1157 static inline void x86_assign_hw_event(struct perf_event *event, 1158 struct cpu_hw_events *cpuc, int i) 1159 { 1160 struct hw_perf_event *hwc = &event->hw; 1161 int idx; 1162 1163 idx = hwc->idx = cpuc->assign[i]; 1164 hwc->last_cpu = smp_processor_id(); 1165 hwc->last_tag = ++cpuc->tags[i]; 1166 1167 switch (hwc->idx) { 1168 case INTEL_PMC_IDX_FIXED_BTS: 1169 case INTEL_PMC_IDX_FIXED_VLBR: 1170 hwc->config_base = 0; 1171 hwc->event_base = 0; 1172 break; 1173 1174 case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: 1175 /* All the metric events are mapped onto the fixed counter 3. */ 1176 idx = INTEL_PMC_IDX_FIXED_SLOTS; 1177 fallthrough; 1178 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: 1179 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1180 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + 1181 (idx - INTEL_PMC_IDX_FIXED); 1182 hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1183 INTEL_PMC_FIXED_RDPMC_BASE; 1184 break; 1185 1186 default: 1187 hwc->config_base = x86_pmu_config_addr(hwc->idx); 1188 hwc->event_base = x86_pmu_event_addr(hwc->idx); 1189 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); 1190 break; 1191 } 1192 } 1193 1194 /** 1195 * x86_perf_rdpmc_index - Return PMC counter used for event 1196 * @event: the perf_event to which the PMC counter was assigned 1197 * 1198 * The counter assigned to this performance event may change if interrupts 1199 * are enabled. This counter should thus never be used while interrupts are 1200 * enabled. Before this function is used to obtain the assigned counter the 1201 * event should be checked for validity using, for example, 1202 * perf_event_read_local(), within the same interrupt disabled section in 1203 * which this counter is planned to be used. 1204 * 1205 * Return: The index of the performance monitoring counter assigned to 1206 * @perf_event. 
1207 */ 1208 int x86_perf_rdpmc_index(struct perf_event *event) 1209 { 1210 lockdep_assert_irqs_disabled(); 1211 1212 return event->hw.event_base_rdpmc; 1213 } 1214 1215 static inline int match_prev_assignment(struct hw_perf_event *hwc, 1216 struct cpu_hw_events *cpuc, 1217 int i) 1218 { 1219 return hwc->idx == cpuc->assign[i] && 1220 hwc->last_cpu == smp_processor_id() && 1221 hwc->last_tag == cpuc->tags[i]; 1222 } 1223 1224 static void x86_pmu_start(struct perf_event *event, int flags); 1225 1226 static void x86_pmu_enable(struct pmu *pmu) 1227 { 1228 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1229 struct perf_event *event; 1230 struct hw_perf_event *hwc; 1231 int i, added = cpuc->n_added; 1232 1233 if (!x86_pmu_initialized()) 1234 return; 1235 1236 if (cpuc->enabled) 1237 return; 1238 1239 if (cpuc->n_added) { 1240 int n_running = cpuc->n_events - cpuc->n_added; 1241 /* 1242 * apply assignment obtained either from 1243 * hw_perf_group_sched_in() or x86_pmu_enable() 1244 * 1245 * step1: save events moving to new counters 1246 */ 1247 for (i = 0; i < n_running; i++) { 1248 event = cpuc->event_list[i]; 1249 hwc = &event->hw; 1250 1251 /* 1252 * we can avoid reprogramming counter if: 1253 * - assigned same counter as last time 1254 * - running on same CPU as last time 1255 * - no other event has used the counter since 1256 */ 1257 if (hwc->idx == -1 || 1258 match_prev_assignment(hwc, cpuc, i)) 1259 continue; 1260 1261 /* 1262 * Ensure we don't accidentally enable a stopped 1263 * counter simply because we rescheduled. 1264 */ 1265 if (hwc->state & PERF_HES_STOPPED) 1266 hwc->state |= PERF_HES_ARCH; 1267 1268 x86_pmu_stop(event, PERF_EF_UPDATE); 1269 } 1270 1271 /* 1272 * step2: reprogram moved events into new counters 1273 */ 1274 for (i = 0; i < cpuc->n_events; i++) { 1275 event = cpuc->event_list[i]; 1276 hwc = &event->hw; 1277 1278 if (!match_prev_assignment(hwc, cpuc, i)) 1279 x86_assign_hw_event(event, cpuc, i); 1280 else if (i < n_running) 1281 continue; 1282 1283 if (hwc->state & PERF_HES_ARCH) 1284 continue; 1285 1286 x86_pmu_start(event, PERF_EF_RELOAD); 1287 } 1288 cpuc->n_added = 0; 1289 perf_events_lapic_init(); 1290 } 1291 1292 cpuc->enabled = 1; 1293 barrier(); 1294 1295 static_call(x86_pmu_enable_all)(added); 1296 } 1297 1298 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1299 1300 /* 1301 * Set the next IRQ period, based on the hwc->period_left value. 
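 *
 * (The counter is programmed with -left so that it overflows, and raises a
 *  PMI, after exactly 'left' further increments; e.g. with left == 0x1000
 *  and 48-bit counters the MSR is written with 0xfffffffff000.)
 *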
* To be called with the event disabled in hw: 1303 */ 1304 int x86_perf_event_set_period(struct perf_event *event) 1305 { 1306 struct hw_perf_event *hwc = &event->hw; 1307 s64 left = local64_read(&hwc->period_left); 1308 s64 period = hwc->sample_period; 1309 int ret = 0, idx = hwc->idx; 1310 1311 if (unlikely(!hwc->event_base)) 1312 return 0; 1313 1314 if (unlikely(is_topdown_count(event)) && 1315 x86_pmu.set_topdown_event_period) 1316 return x86_pmu.set_topdown_event_period(event); 1317 1318 /* 1319 * If we are way outside a reasonable range then just skip forward: 1320 */ 1321 if (unlikely(left <= -period)) { 1322 left = period; 1323 local64_set(&hwc->period_left, left); 1324 hwc->last_period = period; 1325 ret = 1; 1326 } 1327 1328 if (unlikely(left <= 0)) { 1329 left += period; 1330 local64_set(&hwc->period_left, left); 1331 hwc->last_period = period; 1332 ret = 1; 1333 } 1334 /* 1335 * Quirk: certain CPUs don't like it if just 1 hw_event is left: 1336 */ 1337 if (unlikely(left < 2)) 1338 left = 2; 1339 1340 if (left > x86_pmu.max_period) 1341 left = x86_pmu.max_period; 1342 1343 if (x86_pmu.limit_period) 1344 left = x86_pmu.limit_period(event, left); 1345 1346 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; 1347 1348 /* 1349 * The hw event starts counting from this event offset, 1350 * mark it to be able to extract future deltas: 1351 */ 1352 local64_set(&hwc->prev_count, (u64)-left); 1353 1354 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); 1355 1356 /* 1357 * Sign extend the Merge event counter's upper 16 bits since 1358 * we currently declare a 48-bit counter width 1359 */ 1360 if (is_counter_pair(hwc)) 1361 wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); 1362 1363 /* 1364 * Due to an erratum on certain CPUs we need 1365 * a second write to be sure the register 1366 * is updated properly 1367 */ 1368 if (x86_pmu.perfctr_second_write) { 1369 wrmsrl(hwc->event_base, 1370 (u64)(-left) & x86_pmu.cntval_mask); 1371 } 1372 1373 perf_event_update_userpage(event); 1374 1375 return ret; 1376 } 1377 1378 void x86_pmu_enable_event(struct perf_event *event) 1379 { 1380 if (__this_cpu_read(cpu_hw_events.enabled)) 1381 __x86_pmu_enable_event(&event->hw, 1382 ARCH_PERFMON_EVENTSEL_ENABLE); 1383 } 1384 1385 /* 1386 * Add a single event to the PMU. 1387 * 1388 * The event is added to the group of enabled events 1389 * but only if it can be scheduled with existing events. 1390 */ 1391 static int x86_pmu_add(struct perf_event *event, int flags) 1392 { 1393 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1394 struct hw_perf_event *hwc; 1395 int assign[X86_PMC_IDX_MAX]; 1396 int n, n0, ret; 1397 1398 hwc = &event->hw; 1399 1400 n0 = cpuc->n_events; 1401 ret = n = collect_events(cpuc, event, false); 1402 if (ret < 0) 1403 goto out; 1404 1405 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 1406 if (!(flags & PERF_EF_START)) 1407 hwc->state |= PERF_HES_ARCH; 1408 1409 /* 1410 * If group events scheduling transaction was started, 1411 * skip the schedulability test here, it will be performed 1412 * at commit time (->commit_txn) as a whole. 1413 * 1414 * If commit fails, we'll call ->del() on all events 1415 * for which ->add() was called.
1416 */ 1417 if (cpuc->txn_flags & PERF_PMU_TXN_ADD) 1418 goto done_collect; 1419 1420 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); 1421 if (ret) 1422 goto out; 1423 /* 1424 * copy new assignment, now we know it is possible 1425 * will be used by hw_perf_enable() 1426 */ 1427 memcpy(cpuc->assign, assign, n*sizeof(int)); 1428 1429 done_collect: 1430 /* 1431 * Commit the collect_events() state. See x86_pmu_del() and 1432 * x86_pmu_*_txn(). 1433 */ 1434 cpuc->n_events = n; 1435 cpuc->n_added += n - n0; 1436 cpuc->n_txn += n - n0; 1437 1438 /* 1439 * This is before x86_pmu_enable() will call x86_pmu_start(), 1440 * so we enable LBRs before an event needs them etc.. 1441 */ 1442 static_call_cond(x86_pmu_add)(event); 1443 1444 ret = 0; 1445 out: 1446 return ret; 1447 } 1448 1449 static void x86_pmu_start(struct perf_event *event, int flags) 1450 { 1451 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1452 int idx = event->hw.idx; 1453 1454 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) 1455 return; 1456 1457 if (WARN_ON_ONCE(idx == -1)) 1458 return; 1459 1460 if (flags & PERF_EF_RELOAD) { 1461 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); 1462 x86_perf_event_set_period(event); 1463 } 1464 1465 event->hw.state = 0; 1466 1467 cpuc->events[idx] = event; 1468 __set_bit(idx, cpuc->active_mask); 1469 __set_bit(idx, cpuc->running); 1470 static_call(x86_pmu_enable)(event); 1471 perf_event_update_userpage(event); 1472 } 1473 1474 void perf_event_print_debug(void) 1475 { 1476 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1477 u64 pebs, debugctl; 1478 struct cpu_hw_events *cpuc; 1479 unsigned long flags; 1480 int cpu, idx; 1481 1482 if (!x86_pmu.num_counters) 1483 return; 1484 1485 local_irq_save(flags); 1486 1487 cpu = smp_processor_id(); 1488 cpuc = &per_cpu(cpu_hw_events, cpu); 1489 1490 if (x86_pmu.version >= 2) { 1491 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1492 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1493 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1494 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1495 1496 pr_info("\n"); 1497 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1498 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1499 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1500 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1501 if (x86_pmu.pebs_constraints) { 1502 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); 1503 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); 1504 } 1505 if (x86_pmu.lbr_nr) { 1506 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 1507 pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); 1508 } 1509 } 1510 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1511 1512 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1513 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); 1514 rdmsrl(x86_pmu_event_addr(idx), pmc_count); 1515 1516 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1517 1518 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1519 cpu, idx, pmc_ctrl); 1520 pr_info("CPU#%d: gen-PMC%d count: %016llx\n", 1521 cpu, idx, pmc_count); 1522 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1523 cpu, idx, prev_left); 1524 } 1525 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1526 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1527 1528 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1529 cpu, idx, pmc_count); 1530 } 1531 local_irq_restore(flags); 1532 } 1533 1534 void x86_pmu_stop(struct perf_event *event, int flags) 1535 { 1536 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 
1537 struct hw_perf_event *hwc = &event->hw; 1538 1539 if (test_bit(hwc->idx, cpuc->active_mask)) { 1540 static_call(x86_pmu_disable)(event); 1541 __clear_bit(hwc->idx, cpuc->active_mask); 1542 cpuc->events[hwc->idx] = NULL; 1543 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 1544 hwc->state |= PERF_HES_STOPPED; 1545 } 1546 1547 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { 1548 /* 1549 * Drain the remaining delta count out of a event 1550 * that we are disabling: 1551 */ 1552 x86_perf_event_update(event); 1553 hwc->state |= PERF_HES_UPTODATE; 1554 } 1555 } 1556 1557 static void x86_pmu_del(struct perf_event *event, int flags) 1558 { 1559 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1560 int i; 1561 1562 /* 1563 * If we're called during a txn, we only need to undo x86_pmu.add. 1564 * The events never got scheduled and ->cancel_txn will truncate 1565 * the event_list. 1566 * 1567 * XXX assumes any ->del() called during a TXN will only be on 1568 * an event added during that same TXN. 1569 */ 1570 if (cpuc->txn_flags & PERF_PMU_TXN_ADD) 1571 goto do_del; 1572 1573 /* 1574 * Not a TXN, therefore cleanup properly. 1575 */ 1576 x86_pmu_stop(event, PERF_EF_UPDATE); 1577 1578 for (i = 0; i < cpuc->n_events; i++) { 1579 if (event == cpuc->event_list[i]) 1580 break; 1581 } 1582 1583 if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ 1584 return; 1585 1586 /* If we have a newly added event; make sure to decrease n_added. */ 1587 if (i >= cpuc->n_events - cpuc->n_added) 1588 --cpuc->n_added; 1589 1590 static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); 1591 1592 /* Delete the array entry. */ 1593 while (++i < cpuc->n_events) { 1594 cpuc->event_list[i-1] = cpuc->event_list[i]; 1595 cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; 1596 } 1597 cpuc->event_constraint[i-1] = NULL; 1598 --cpuc->n_events; 1599 if (x86_pmu.intel_cap.perf_metrics) 1600 del_nr_metric_event(cpuc, event); 1601 1602 perf_event_update_userpage(event); 1603 1604 do_del: 1605 1606 /* 1607 * This is after x86_pmu_stop(); so we disable LBRs after any 1608 * event can need them etc.. 1609 */ 1610 static_call_cond(x86_pmu_del)(event); 1611 } 1612 1613 int x86_pmu_handle_irq(struct pt_regs *regs) 1614 { 1615 struct perf_sample_data data; 1616 struct cpu_hw_events *cpuc; 1617 struct perf_event *event; 1618 int idx, handled = 0; 1619 u64 val; 1620 1621 cpuc = this_cpu_ptr(&cpu_hw_events); 1622 1623 /* 1624 * Some chipsets need to unmask the LVTPC in a particular spot 1625 * inside the nmi handler. As a result, the unmasking was pushed 1626 * into all the nmi handlers. 1627 * 1628 * This generic handler doesn't seem to have any issues where the 1629 * unmasking occurs so it was left at the top. 
1630 */ 1631 apic_write(APIC_LVTPC, APIC_DM_NMI); 1632 1633 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1634 if (!test_bit(idx, cpuc->active_mask)) 1635 continue; 1636 1637 event = cpuc->events[idx]; 1638 1639 val = x86_perf_event_update(event); 1640 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1641 continue; 1642 1643 /* 1644 * event overflow 1645 */ 1646 handled++; 1647 perf_sample_data_init(&data, 0, event->hw.last_period); 1648 1649 if (!x86_perf_event_set_period(event)) 1650 continue; 1651 1652 if (perf_event_overflow(event, &data, regs)) 1653 x86_pmu_stop(event, 0); 1654 } 1655 1656 if (handled) 1657 inc_irq_stat(apic_perf_irqs); 1658 1659 return handled; 1660 } 1661 1662 void perf_events_lapic_init(void) 1663 { 1664 if (!x86_pmu.apic || !x86_pmu_initialized()) 1665 return; 1666 1667 /* 1668 * Always use NMI for PMU 1669 */ 1670 apic_write(APIC_LVTPC, APIC_DM_NMI); 1671 } 1672 1673 static int 1674 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) 1675 { 1676 u64 start_clock; 1677 u64 finish_clock; 1678 int ret; 1679 1680 /* 1681 * All PMUs/events that share this PMI handler should make sure to 1682 * increment active_events for their events. 1683 */ 1684 if (!atomic_read(&active_events)) 1685 return NMI_DONE; 1686 1687 start_clock = sched_clock(); 1688 ret = static_call(x86_pmu_handle_irq)(regs); 1689 finish_clock = sched_clock(); 1690 1691 perf_sample_event_took(finish_clock - start_clock); 1692 1693 return ret; 1694 } 1695 NOKPROBE_SYMBOL(perf_event_nmi_handler); 1696 1697 struct event_constraint emptyconstraint; 1698 struct event_constraint unconstrained; 1699 1700 static int x86_pmu_prepare_cpu(unsigned int cpu) 1701 { 1702 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1703 int i; 1704 1705 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) 1706 cpuc->kfree_on_online[i] = NULL; 1707 if (x86_pmu.cpu_prepare) 1708 return x86_pmu.cpu_prepare(cpu); 1709 return 0; 1710 } 1711 1712 static int x86_pmu_dead_cpu(unsigned int cpu) 1713 { 1714 if (x86_pmu.cpu_dead) 1715 x86_pmu.cpu_dead(cpu); 1716 return 0; 1717 } 1718 1719 static int x86_pmu_online_cpu(unsigned int cpu) 1720 { 1721 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1722 int i; 1723 1724 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { 1725 kfree(cpuc->kfree_on_online[i]); 1726 cpuc->kfree_on_online[i] = NULL; 1727 } 1728 return 0; 1729 } 1730 1731 static int x86_pmu_starting_cpu(unsigned int cpu) 1732 { 1733 if (x86_pmu.cpu_starting) 1734 x86_pmu.cpu_starting(cpu); 1735 return 0; 1736 } 1737 1738 static int x86_pmu_dying_cpu(unsigned int cpu) 1739 { 1740 if (x86_pmu.cpu_dying) 1741 x86_pmu.cpu_dying(cpu); 1742 return 0; 1743 } 1744 1745 static void __init pmu_check_apic(void) 1746 { 1747 if (boot_cpu_has(X86_FEATURE_APIC)) 1748 return; 1749 1750 x86_pmu.apic = 0; 1751 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); 1752 pr_info("no hardware sampling interrupt available.\n"); 1753 1754 /* 1755 * If we have a PMU initialized but no APIC 1756 * interrupts, we cannot sample hardware 1757 * events (user-space has to fall back and 1758 * sample via a hrtimer based software event): 1759 */ 1760 pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; 1761 1762 } 1763 1764 static struct attribute_group x86_pmu_format_group __ro_after_init = { 1765 .name = "format", 1766 .attrs = NULL, 1767 }; 1768 1769 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) 1770 { 1771 struct perf_pmu_events_attr *pmu_attr = 1772 container_of(attr, struct 
perf_pmu_events_attr, attr); 1773 u64 config = 0; 1774 1775 if (pmu_attr->id < x86_pmu.max_events) 1776 config = x86_pmu.event_map(pmu_attr->id); 1777 1778 /* string trumps id */ 1779 if (pmu_attr->event_str) 1780 return sprintf(page, "%s", pmu_attr->event_str); 1781 1782 return x86_pmu.events_sysfs_show(page, config); 1783 } 1784 EXPORT_SYMBOL_GPL(events_sysfs_show); 1785 1786 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, 1787 char *page) 1788 { 1789 struct perf_pmu_events_ht_attr *pmu_attr = 1790 container_of(attr, struct perf_pmu_events_ht_attr, attr); 1791 1792 /* 1793 * Report conditional events depending on Hyper-Threading. 1794 * 1795 * This is overly conservative as usually the HT special 1796 * handling is not needed if the other CPU thread is idle. 1797 * 1798 * Note this does not (and cannot) handle the case when thread 1799 * siblings are invisible, for example with virtualization 1800 * if they are owned by some other guest. The user tool 1801 * has to re-read when a thread sibling gets onlined later. 1802 */ 1803 return sprintf(page, "%s", 1804 topology_max_smt_threads() > 1 ? 1805 pmu_attr->event_str_ht : 1806 pmu_attr->event_str_noht); 1807 } 1808 1809 EVENT_ATTR(cpu-cycles, CPU_CYCLES ); 1810 EVENT_ATTR(instructions, INSTRUCTIONS ); 1811 EVENT_ATTR(cache-references, CACHE_REFERENCES ); 1812 EVENT_ATTR(cache-misses, CACHE_MISSES ); 1813 EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); 1814 EVENT_ATTR(branch-misses, BRANCH_MISSES ); 1815 EVENT_ATTR(bus-cycles, BUS_CYCLES ); 1816 EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); 1817 EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); 1818 EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); 1819 1820 static struct attribute *empty_attrs; 1821 1822 static struct attribute *events_attr[] = { 1823 EVENT_PTR(CPU_CYCLES), 1824 EVENT_PTR(INSTRUCTIONS), 1825 EVENT_PTR(CACHE_REFERENCES), 1826 EVENT_PTR(CACHE_MISSES), 1827 EVENT_PTR(BRANCH_INSTRUCTIONS), 1828 EVENT_PTR(BRANCH_MISSES), 1829 EVENT_PTR(BUS_CYCLES), 1830 EVENT_PTR(STALLED_CYCLES_FRONTEND), 1831 EVENT_PTR(STALLED_CYCLES_BACKEND), 1832 EVENT_PTR(REF_CPU_CYCLES), 1833 NULL, 1834 }; 1835 1836 /* 1837 * Remove all undefined events (x86_pmu.event_map(id) == 0) 1838 * out of events_attr attributes. 1839 */ 1840 static umode_t 1841 is_visible(struct kobject *kobj, struct attribute *attr, int idx) 1842 { 1843 struct perf_pmu_events_attr *pmu_attr; 1844 1845 if (idx >= x86_pmu.max_events) 1846 return 0; 1847 1848 pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); 1849 /* str trumps id */ 1850 return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; 1851 } 1852 1853 static struct attribute_group x86_pmu_events_group __ro_after_init = { 1854 .name = "events", 1855 .attrs = events_attr, 1856 .is_visible = is_visible, 1857 }; 1858 1859 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) 1860 { 1861 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 1862 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; 1863 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); 1864 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); 1865 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); 1866 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); 1867 ssize_t ret; 1868 1869 /* 1870 * We have whole page size to spend and just little data 1871 * to write, so we can safely use sprintf. 
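 *
 * The result is a perf-tool style event specification, e.g.
 * "event=0x3c,umask=0x01,inv,cmask=0x01" for a config with those
 * fields set.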
1872 */ 1873 ret = sprintf(page, "event=0x%02llx", event); 1874 1875 if (umask) 1876 ret += sprintf(page + ret, ",umask=0x%02llx", umask); 1877 1878 if (edge) 1879 ret += sprintf(page + ret, ",edge"); 1880 1881 if (pc) 1882 ret += sprintf(page + ret, ",pc"); 1883 1884 if (any) 1885 ret += sprintf(page + ret, ",any"); 1886 1887 if (inv) 1888 ret += sprintf(page + ret, ",inv"); 1889 1890 if (cmask) 1891 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); 1892 1893 ret += sprintf(page + ret, "\n"); 1894 1895 return ret; 1896 } 1897 1898 static struct attribute_group x86_pmu_attr_group; 1899 static struct attribute_group x86_pmu_caps_group; 1900 1901 static void x86_pmu_static_call_update(void) 1902 { 1903 static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); 1904 static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); 1905 static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); 1906 static_call_update(x86_pmu_enable, x86_pmu.enable); 1907 static_call_update(x86_pmu_disable, x86_pmu.disable); 1908 1909 static_call_update(x86_pmu_add, x86_pmu.add); 1910 static_call_update(x86_pmu_del, x86_pmu.del); 1911 static_call_update(x86_pmu_read, x86_pmu.read); 1912 1913 static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); 1914 static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); 1915 static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); 1916 1917 static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); 1918 static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); 1919 static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); 1920 1921 static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); 1922 static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); 1923 1924 static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); 1925 static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); 1926 } 1927 1928 static void _x86_pmu_read(struct perf_event *event) 1929 { 1930 x86_perf_event_update(event); 1931 } 1932 1933 static int __init init_hw_perf_events(void) 1934 { 1935 struct x86_pmu_quirk *quirk; 1936 int err; 1937 1938 pr_info("Performance Events: "); 1939 1940 switch (boot_cpu_data.x86_vendor) { 1941 case X86_VENDOR_INTEL: 1942 err = intel_pmu_init(); 1943 break; 1944 case X86_VENDOR_AMD: 1945 err = amd_pmu_init(); 1946 break; 1947 case X86_VENDOR_HYGON: 1948 err = amd_pmu_init(); 1949 x86_pmu.name = "HYGON"; 1950 break; 1951 case X86_VENDOR_ZHAOXIN: 1952 case X86_VENDOR_CENTAUR: 1953 err = zhaoxin_pmu_init(); 1954 break; 1955 default: 1956 err = -ENOTSUPP; 1957 } 1958 if (err != 0) { 1959 pr_cont("no PMU driver, software events only.\n"); 1960 return 0; 1961 } 1962 1963 pmu_check_apic(); 1964 1965 /* sanity check that the hardware exists or is emulated */ 1966 if (!check_hw_exists()) 1967 return 0; 1968 1969 pr_cont("%s PMU driver.\n", x86_pmu.name); 1970 1971 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1972 1973 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1974 quirk->func(); 1975 1976 if (!x86_pmu.intel_ctrl) 1977 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1978 1979 perf_events_lapic_init(); 1980 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1981 1982 unconstrained = (struct event_constraint) 1983 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1984 0, x86_pmu.num_counters, 0, 0); 1985 1986 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1987 1988 if 
(!x86_pmu.events_sysfs_show) 1989 x86_pmu_events_group.attrs = &empty_attrs; 1990 1991 pmu.attr_update = x86_pmu.attr_update; 1992 1993 pr_info("... version: %d\n", x86_pmu.version); 1994 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 1995 pr_info("... generic registers: %d\n", x86_pmu.num_counters); 1996 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); 1997 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1998 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1999 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 2000 2001 if (!x86_pmu.read) 2002 x86_pmu.read = _x86_pmu_read; 2003 2004 x86_pmu_static_call_update(); 2005 2006 /* 2007 * Install callbacks. Core will call them for each online 2008 * cpu. 2009 */ 2010 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", 2011 x86_pmu_prepare_cpu, x86_pmu_dead_cpu); 2012 if (err) 2013 return err; 2014 2015 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, 2016 "perf/x86:starting", x86_pmu_starting_cpu, 2017 x86_pmu_dying_cpu); 2018 if (err) 2019 goto out; 2020 2021 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", 2022 x86_pmu_online_cpu, NULL); 2023 if (err) 2024 goto out1; 2025 2026 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); 2027 if (err) 2028 goto out2; 2029 2030 return 0; 2031 2032 out2: 2033 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); 2034 out1: 2035 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); 2036 out: 2037 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); 2038 return err; 2039 } 2040 early_initcall(init_hw_perf_events); 2041 2042 static void x86_pmu_read(struct perf_event *event) 2043 { 2044 static_call(x86_pmu_read)(event); 2045 } 2046 2047 /* 2048 * Start group events scheduling transaction 2049 * Set the flag to make pmu::enable() not perform the 2050 * schedulability test, it will be performed at commit time 2051 * 2052 * We only support PERF_PMU_TXN_ADD transactions. Save the 2053 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD 2054 * transactions. 2055 */ 2056 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) 2057 { 2058 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2059 2060 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ 2061 2062 cpuc->txn_flags = txn_flags; 2063 if (txn_flags & ~PERF_PMU_TXN_ADD) 2064 return; 2065 2066 perf_pmu_disable(pmu); 2067 __this_cpu_write(cpu_hw_events.n_txn, 0); 2068 __this_cpu_write(cpu_hw_events.n_txn_pair, 0); 2069 __this_cpu_write(cpu_hw_events.n_txn_metric, 0); 2070 } 2071 2072 /* 2073 * Stop group events scheduling transaction 2074 * Clear the flag and pmu::enable() will perform the 2075 * schedulability test. 2076 */ 2077 static void x86_pmu_cancel_txn(struct pmu *pmu) 2078 { 2079 unsigned int txn_flags; 2080 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2081 2082 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ 2083 2084 txn_flags = cpuc->txn_flags; 2085 cpuc->txn_flags = 0; 2086 if (txn_flags & ~PERF_PMU_TXN_ADD) 2087 return; 2088 2089 /* 2090 * Truncate collected array by the number of events added in this 2091 * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 
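 *
 * (n_txn, n_txn_pair and n_txn_metric were zeroed in x86_pmu_start_txn()
 *  and incremented as events were collected, so the subtractions below undo
 *  exactly what this transaction added.)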
/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
		cpuc->txn_flags = 0;
		return 0;
	}

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * Copy the new assignment now that we know scheduling is possible;
	 * it will be used by hw_perf_enable().
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->txn_flags = 0;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	intel_cpuc_finish(cpuc);
	kfree(cpuc);
}

static struct cpu_hw_events *allocate_fake_cpuc(void)
{
	struct cpu_hw_events *cpuc;
	int cpu = raw_smp_processor_id();

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);
	cpuc->is_fake = 1;

	if (intel_cpuc_prepare(cpuc, cpu))
		goto error;

	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);

	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	fake_cpuc = allocate_fake_cpuc();
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings, therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = 0;
	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}
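/*
 * Illustration: on a PMU with four generic counters, a group of five
 * generic events can never be scheduled in its entirety, so the simulated
 * scheduling pass above fails and creation of the last group member is
 * rejected with -EINVAL.
 */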
static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

	return err;
}

static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	/*
	 * This function relies on not being called concurrently in two
	 * tasks in the same mm. Otherwise one task could observe
	 * perf_rdpmc_allowed > 1 and return all the way back to
	 * userspace with CR4.PCE clear while another task is still
	 * doing on_each_cpu_mask() to propagate CR4.PCE.
	 *
	 * For now, this can't happen because all callers hold mmap_lock
	 * for write. If this changes, we'll need a different solution.
	 */
	mmap_assert_write_locked(mm);

	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return;

	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

static int x86_pmu_event_idx(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
		return 0;

	if (is_metric_idx(hwc->idx))
		return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
	else
		return hwc->event_base_rdpmc + 1;
}
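/*
 * Userspace self-monitoring sketch (see the perf_event_mmap_page
 * documentation in include/uapi/linux/perf_event.h for the canonical
 * sequence): after mmap()ing the event, read mmap_page->index; if
 * cap_user_rdpmc is set and index is non-zero, RDPMC with (index - 1)
 * yields the raw counter value to add to mmap_page->offset, all inside a
 * seqlock-style retry loop on mmap_page->lock. The "rdpmc" sysfs
 * attribute below (0 = never, 1 = only with a mapped event, 2 = always)
 * controls whether CR4.PCE permits that instruction in userspace.
 */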
static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	if (val != x86_pmu.attr_rdpmc) {
		/*
		 * Changing into or out of never available or always available,
		 * aka perf-event-bypassing mode. This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 0)
			static_branch_inc(&rdpmc_never_available_key);
		else if (x86_pmu.attr_rdpmc == 0)
			static_branch_dec(&rdpmc_never_available_key);

		if (val == 2)
			static_branch_inc(&rdpmc_always_available_key);
		else if (x86_pmu.attr_rdpmc == 2)
			static_branch_dec(&rdpmc_always_available_key);

		on_each_cpu(cr4_update_pce, NULL, 1);
		x86_pmu.attr_rdpmc = val;
	}

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group __ro_after_init = {
	.attrs = x86_pmu_attrs,
};

static ssize_t max_precise_show(struct device *cdev,
				struct device_attribute *attr,
				char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
}

static DEVICE_ATTR_RO(max_precise);

static struct attribute *x86_pmu_caps_attrs[] = {
	&dev_attr_max_precise.attr,
	NULL
};

static struct attribute_group x86_pmu_caps_group __ro_after_init = {
	.name = "caps",
	.attrs = x86_pmu_caps_attrs,
};

static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	&x86_pmu_caps_group,
	NULL,
};

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
}

static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
				  struct perf_event_context *next)
{
	static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
}

void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}
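/*
 * Sanity-check a requested sample period: the vendor ->check_period()
 * hook can veto it outright, and if ->limit_period() would raise the
 * period above what was requested, the request is rejected rather than
 * silently inflated.
 */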
static int x86_pmu_check_period(struct perf_event *event, u64 value)
{
	if (x86_pmu.check_period && x86_pmu.check_period(event, value))
		return -EINVAL;

	if (value && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, value) > value)
			return -EINVAL;
	}

	return 0;
}

static int x86_pmu_aux_output_match(struct perf_event *event)
{
	if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
		return 0;

	if (x86_pmu.aux_output_match)
		return x86_pmu.aux_output_match(event);

	return 0;
}

static struct pmu pmu = {
	.pmu_enable = x86_pmu_enable,
	.pmu_disable = x86_pmu_disable,

	.attr_groups = x86_pmu_attr_groups,

	.event_init = x86_pmu_event_init,

	.event_mapped = x86_pmu_event_mapped,
	.event_unmapped = x86_pmu_event_unmapped,

	.add = x86_pmu_add,
	.del = x86_pmu_del,
	.start = x86_pmu_start,
	.stop = x86_pmu_stop,
	.read = x86_pmu_read,

	.start_txn = x86_pmu_start_txn,
	.cancel_txn = x86_pmu_cancel_txn,
	.commit_txn = x86_pmu_commit_txn,

	.event_idx = x86_pmu_event_idx,
	.sched_task = x86_pmu_sched_task,
	.swap_task_ctx = x86_pmu_swap_task_ctx,
	.check_period = x86_pmu_check_period,

	.aux_output_match = x86_pmu_aux_output_match,
};

void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data data;
	u64 offset;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!using_native_sched_clock() || !sched_clock_stable())
		return;

	cyc2ns_read_begin(&data);

	offset = data.cyc2ns_offset + __sched_clock_offset;

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data.cyc2ns_mul;
	userpg->time_shift = data.cyc2ns_shift;
	userpg->time_offset = offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (!event->attr.use_clockid) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = offset;
	}

	cyc2ns_read_end();
}
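/*
 * With the capabilities above set, userspace can convert a raw TSC value
 * into sched_clock()/perf time without entering the kernel, roughly:
 *
 *	time = time_zero + (tsc * time_mult) >> time_shift
 *
 * (see the perf_event_mmap_page documentation for the exact split that
 * avoids 64-bit overflow).
 */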
/*
 * Determine whether the regs were taken from an irq/exception handler rather
 * than from perf_arch_fetch_caller_regs().
 */
static bool perf_hw_regs(struct pt_regs *regs)
{
	return regs->flags & X86_EFLAGS_FIXED;
}

void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct unwind_state state;
	unsigned long addr;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	if (perf_callchain_store(entry, regs->ip))
		return;

	if (perf_hw_regs(regs))
		unwind_start(&state, current, regs, NULL);
	else
		unwind_start(&state, current, NULL, (void *)regs->sp);

	for (; !unwind_done(&state); unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr || perf_callchain_store(entry, addr))
			return;
	}
}

static inline int
valid_user_frame(const void __user *fp, unsigned long size)
{
	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
}

static unsigned long get_segment_base(unsigned int segment)
{
	struct desc_struct *desc;
	unsigned int idx = segment >> 3;

	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/* IRQs are off, so this synchronizes with smp_store_release */
		ldt = READ_ONCE(current->active_mm->context.ldt);
		if (!ldt || idx >= ldt->nr_entries)
			return 0;

		desc = &ldt->entries[idx];
#else
		return 0;
#endif
	} else {
		if (idx >= GDT_ENTRIES)
			return 0;

		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
	}

	return get_desc_base(desc);
}

#ifdef CONFIG_IA32_EMULATION

#include <linux/compat.h>

static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	/* 32-bit process in 64-bit kernel. */
	unsigned long ss_base, cs_base;
	struct stack_frame_ia32 frame;
	const struct stack_frame_ia32 __user *fp;

	if (user_64bit_mode(regs))
		return 0;

	cs_base = get_segment_base(regs->cs);
	ss_base = get_segment_base(regs->ss);

	fp = compat_ptr(ss_base + regs->bp);
	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		if (__get_user(frame.next_frame, &fp->next_frame))
			break;
		if (__get_user(frame.return_address, &fp->return_address))
			break;

		perf_callchain_store(entry, cs_base + frame.return_address);
		fp = compat_ptr(ss_base + frame.next_frame);
	}
	pagefault_enable();
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
{
	return 0;
}
#endif
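/*
 * Note that the user-space walkers here follow the classic frame-pointer
 * chain (saved frame pointer -> return address); binaries built without
 * frame pointers (e.g. with -fomit-frame-pointer) will typically yield
 * truncated callchains.
 */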
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const struct stack_frame __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	/*
	 * We don't know what to do with VM86 stacks.. ignore them for now.
	 */
	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
		return;

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (!nmi_uaccess_okay())
		return;

	if (perf_callchain_user32(regs, entry))
		return;

	pagefault_disable();
	while (entry->nr < entry->max_stack) {
		if (!valid_user_frame(fp, sizeof(frame)))
			break;

		if (__get_user(frame.next_frame, &fp->next_frame))
			break;
		if (__get_user(frame.return_address, &fp->return_address))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = (void __user *)frame.next_frame;
	}
	pagefault_enable();
}

/*
 * Deal with code segment offsets for the various execution modes:
 *
 *   VM86 - the good olde 16 bit days, where the linear address is
 *          20 bits and we use regs->ip + 0x10 * regs->cs.
 *
 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
 *          to figure out what the 32bit base address is.
 *
 *    X32 - has TIF_X32 set, but is running in x86_64
 *
 * X86_64 - CS,DS,SS,ES are all zero based.
 */
static unsigned long code_segment_base(struct pt_regs *regs)
{
	/*
	 * For IA32 we look at the GDT/LDT segment base to convert the
	 * effective IP to a linear address.
	 */

#ifdef CONFIG_X86_32
	/*
	 * If we are in VM86 mode, add the segment offset to convert to a
	 * linear address.
	 */
	if (regs->flags & X86_VM_MASK)
		return 0x10 * regs->cs;

	if (user_mode(regs) && regs->cs != __USER_CS)
		return get_segment_base(regs->cs);
#else
	if (user_mode(regs) && !user_64bit_mode(regs) &&
	    regs->cs != __USER32_CS)
		return get_segment_base(regs->cs);
#endif
	return 0;
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		return perf_guest_cbs->get_guest_ip();

	return regs->ip + code_segment_base(regs);
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}

void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
	cap->version = x86_pmu.version;
	cap->num_counters_gp = x86_pmu.num_counters;
	cap->num_counters_fixed = x86_pmu.num_counters_fixed;
	cap->bit_width_gp = x86_pmu.cntval_bits;
	cap->bit_width_fixed = x86_pmu.cntval_bits;
	cap->events_mask = (unsigned int)x86_pmu.events_maskl;
	cap->events_mask_len = x86_pmu.events_mask_len;
}
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
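/*
 * The capability structure filled in above is how other in-kernel users
 * (KVM's virtual PMU, for instance) discover how many counters and how
 * much counter width the host PMU offers; hence the GPL export.
 */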