1 /*
2 * Performance events x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14
15 #include <linux/perf_event.h>
16 #include <linux/capability.h>
17 #include <linux/notifier.h>
18 #include <linux/hardirq.h>
19 #include <linux/kprobes.h>
20 #include <linux/export.h>
21 #include <linux/init.h>
22 #include <linux/kdebug.h>
23 #include <linux/sched/mm.h>
24 #include <linux/sched/clock.h>
25 #include <linux/uaccess.h>
26 #include <linux/slab.h>
27 #include <linux/cpu.h>
28 #include <linux/bitops.h>
29 #include <linux/device.h>
30 #include <linux/nospec.h>
31 #include <linux/static_call.h>
32
33 #include <asm/apic.h>
34 #include <asm/stacktrace.h>
35 #include <asm/nmi.h>
36 #include <asm/smp.h>
37 #include <asm/alternative.h>
38 #include <asm/mmu_context.h>
39 #include <asm/tlbflush.h>
40 #include <asm/timer.h>
41 #include <asm/desc.h>
42 #include <asm/ldt.h>
43 #include <asm/unwind.h>
44 #include <asm/uprobes.h>
45 #include <asm/ibt.h>
46
47 #include "perf_event.h"
48
49 struct x86_pmu x86_pmu __read_mostly;
50 static struct pmu pmu;
51
52 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
53 .enabled = 1,
54 .pmu = &pmu,
55 };
56
57 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
58 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
59 DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
60
61 /*
62 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
63 * from just a typename, as opposed to an actual function.
64 */
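/*
 * The real targets are filled in by x86_pmu_static_call_update() once
 * the vendor-specific struct x86_pmu has been selected.
 */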
65 DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
66 DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
67 DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
68 DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
69 DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
70
71 DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
72
73 DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
74 DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
75 DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
76
77 DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
78 DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
79 DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);
80
81 DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
82 DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
83 DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
84
85 DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
86 DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
87 DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
88
89 DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
90
91 DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
92 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
93
94 DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
95
96 DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);
97
98 /*
99 * This one is magic, it will get called even when PMU init fails (because
100 * there is no PMU), in which case it should simply return NULL.
101 */
102 DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
103
104 u64 __read_mostly hw_cache_event_ids
105 [PERF_COUNT_HW_CACHE_MAX]
106 [PERF_COUNT_HW_CACHE_OP_MAX]
107 [PERF_COUNT_HW_CACHE_RESULT_MAX];
108 u64 __read_mostly hw_cache_extra_regs
109 [PERF_COUNT_HW_CACHE_MAX]
110 [PERF_COUNT_HW_CACHE_OP_MAX]
111 [PERF_COUNT_HW_CACHE_RESULT_MAX];
112
113 /*
114 * Propagate event elapsed time into the generic event.
115 * Can only be executed on the CPU where the event is active.
116 * Returns the delta events processed.
117 */
118 u64 x86_perf_event_update(struct perf_event *event)
119 {
120 struct hw_perf_event *hwc = &event->hw;
121 int shift = 64 - x86_pmu.cntval_bits;
122 u64 prev_raw_count, new_raw_count;
123 u64 delta;
124
125 if (unlikely(!hwc->event_base))
126 return 0;
127
128 /*
129 * Careful: an NMI might modify the previous event value.
130 *
131 * Our tactic to handle this is to first atomically read and
132 * exchange a new raw count - then add that new-prev delta
133 * count to the generic event atomically:
134 */
135 prev_raw_count = local64_read(&hwc->prev_count);
136 do {
137 rdpmcl(hwc->event_base_rdpmc, new_raw_count);
138 } while (!local64_try_cmpxchg(&hwc->prev_count,
139 &prev_raw_count, new_raw_count));
140
141 /*
142 * Now we have the new raw value and have updated the prev
143 * timestamp already. We can now calculate the elapsed delta
144 * (event-)time and add that to the generic event.
145 *
146 * Careful, not all hw sign-extends above the physical width
147 * of the count.
148 */
149 delta = (new_raw_count << shift) - (prev_raw_count << shift);
150 delta >>= shift;
151
152 local64_add(delta, &event->count);
153 local64_sub(delta, &hwc->period_left);
154
155 return new_raw_count;
156 }
157
158 /*
159 * Find and validate any extra registers to set up.
160 */
161 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
162 {
163 struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
164 struct hw_perf_event_extra *reg;
165 struct extra_reg *er;
166
167 reg = &event->hw.extra_reg;
168
169 if (!extra_regs)
170 return 0;
171
172 for (er = extra_regs; er->msr; er++) {
173 if (er->event != (config & er->config_mask))
174 continue;
175 if (event->attr.config1 & ~er->valid_mask)
176 return -EINVAL;
177 /* Check if the extra msrs can be safely accessed */
178 if (!er->extra_msr_access)
179 return -ENXIO;
180
181 reg->idx = er->idx;
182 reg->config = event->attr.config1;
183 reg->reg = er->msr;
184 break;
185 }
186 return 0;
187 }
188
189 static atomic_t active_events;
190 static atomic_t pmc_refcount;
191 static DEFINE_MUTEX(pmc_reserve_mutex);
192
193 #ifdef CONFIG_X86_LOCAL_APIC
194
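/*
 * Union of the counter masks of all (hybrid) PMUs, so that the
 * reservation below covers every counter any of them may use.
 */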
195 static inline u64 get_possible_counter_mask(void)
196 {
197 u64 cntr_mask = x86_pmu.cntr_mask64;
198 int i;
199
200 if (!is_hybrid())
201 return cntr_mask;
202
203 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
204 cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;
205
206 return cntr_mask;
207 }
208
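/*
 * Reserve all counter and event-select MSRs via the perfctr NMI
 * reservation helpers; on failure, undo the partial reservation
 * and return false.
 */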
209 static bool reserve_pmc_hardware(void)
210 {
211 u64 cntr_mask = get_possible_counter_mask();
212 int i, end;
213
214 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
215 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
216 goto perfctr_fail;
217 }
218
219 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
220 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
221 goto eventsel_fail;
222 }
223
224 return true;
225
226 eventsel_fail:
227 end = i;
228 for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
229 release_evntsel_nmi(x86_pmu_config_addr(i));
230 i = X86_PMC_IDX_MAX;
231
232 perfctr_fail:
233 end = i;
234 for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
235 release_perfctr_nmi(x86_pmu_event_addr(i));
236
237 return false;
238 }
239
240 static void release_pmc_hardware(void)
241 {
242 u64 cntr_mask = get_possible_counter_mask();
243 int i;
244
245 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
246 release_perfctr_nmi(x86_pmu_event_addr(i));
247 release_evntsel_nmi(x86_pmu_config_addr(i));
248 }
249 }
250
251 #else
252
253 static bool reserve_pmc_hardware(void) { return true; }
254 static void release_pmc_hardware(void) {}
255
256 #endif
257
258 bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
259 unsigned long *fixed_cntr_mask)
260 {
261 u64 val, val_fail = -1, val_new= ~0;
262 int i, reg, reg_fail = -1, ret = 0;
263 int bios_fail = 0;
264 int reg_safe = -1;
265
266 /*
267 * Check to see if the BIOS enabled any of the counters, if so
268 * complain and bail.
269 */
270 for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
271 reg = x86_pmu_config_addr(i);
272 ret = rdmsrl_safe(reg, &val);
273 if (ret)
274 goto msr_fail;
275 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
276 bios_fail = 1;
277 val_fail = val;
278 reg_fail = reg;
279 } else {
280 reg_safe = i;
281 }
282 }
283
284 if (*(u64 *)fixed_cntr_mask) {
285 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
286 ret = rdmsrl_safe(reg, &val);
287 if (ret)
288 goto msr_fail;
289 for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
290 if (fixed_counter_disabled(i, pmu))
291 continue;
292 if (val & (0x03ULL << i*4)) {
293 bios_fail = 1;
294 val_fail = val;
295 reg_fail = reg;
296 }
297 }
298 }
299
300 /*
301 * If all the counters are enabled, the below test will always
302 * fail. The tools will also become useless in this scenario.
303 * Just fail and disable the hardware counters.
304 */
305
306 if (reg_safe == -1) {
307 reg = reg_safe;
308 goto msr_fail;
309 }
310
311 /*
312 * Read the current value, change it and read it back to see if it
313 * matches, this is needed to detect certain hardware emulators
314 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
315 */
316 reg = x86_pmu_event_addr(reg_safe);
317 if (rdmsrl_safe(reg, &val))
318 goto msr_fail;
319 val ^= 0xffffUL;
320 ret = wrmsrl_safe(reg, val);
321 ret |= rdmsrl_safe(reg, &val_new);
322 if (ret || val != val_new)
323 goto msr_fail;
324
325 /*
326 * We still allow the PMU driver to operate:
327 */
328 if (bios_fail) {
329 pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
330 pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
331 reg_fail, val_fail);
332 }
333
334 return true;
335
336 msr_fail:
337 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
338 pr_cont("PMU not available due to virtualization, using software events only.\n");
339 } else {
340 pr_cont("Broken PMU hardware detected, using software events only.\n");
341 pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
342 reg, val_new);
343 }
344
345 return false;
346 }
347
348 static void hw_perf_event_destroy(struct perf_event *event)
349 {
350 x86_release_hardware();
351 atomic_dec(&active_events);
352 }
353
354 void hw_perf_lbr_event_destroy(struct perf_event *event)
355 {
356 hw_perf_event_destroy(event);
357
358 /* undo the lbr/bts event accounting */
359 x86_del_exclusive(x86_lbr_exclusive_lbr);
360 }
361
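/*
 * x86_pmu.handle_irq is only set once the PMU has been successfully
 * initialized, so it doubles as the "PMU is initialized" flag.
 */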
362 static inline int x86_pmu_initialized(void)
363 {
364 return x86_pmu.handle_irq != NULL;
365 }
366
367 static inline int
368 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
369 {
370 struct perf_event_attr *attr = &event->attr;
371 unsigned int cache_type, cache_op, cache_result;
372 u64 config, val;
373
374 config = attr->config;
375
376 cache_type = (config >> 0) & 0xff;
377 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
378 return -EINVAL;
379 cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
380
381 cache_op = (config >> 8) & 0xff;
382 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
383 return -EINVAL;
384 cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
385
386 cache_result = (config >> 16) & 0xff;
387 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
388 return -EINVAL;
389 cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
390
391 val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
392 if (val == 0)
393 return -ENOENT;
394
395 if (val == -1)
396 return -EINVAL;
397
398 hwc->config |= val;
399 attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
400 return x86_pmu_extra_regs(val, event);
401 }
402
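/*
 * Refcounted reservation of the PMU hardware: the first user reserves
 * the counter MSRs plus the DS and LBR buffers, later users only bump
 * pmc_refcount.
 */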
403 int x86_reserve_hardware(void)
404 {
405 int err = 0;
406
407 if (!atomic_inc_not_zero(&pmc_refcount)) {
408 mutex_lock(&pmc_reserve_mutex);
409 if (atomic_read(&pmc_refcount) == 0) {
410 if (!reserve_pmc_hardware()) {
411 err = -EBUSY;
412 } else {
413 reserve_ds_buffers();
414 reserve_lbr_buffers();
415 }
416 }
417 if (!err)
418 atomic_inc(&pmc_refcount);
419 mutex_unlock(&pmc_reserve_mutex);
420 }
421
422 return err;
423 }
424
425 void x86_release_hardware(void)
426 {
427 if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
428 release_pmc_hardware();
429 release_ds_buffers();
430 release_lbr_buffers();
431 mutex_unlock(&pmc_reserve_mutex);
432 }
433 }
434
435 /*
436 * Check if we can create event of a certain type (that no conflicting events
437 * are present).
438 */
439 int x86_add_exclusive(unsigned int what)
440 {
441 int i;
442
443 /*
444 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
445 * LBR and BTS are still mutually exclusive.
446 */
447 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
448 goto out;
449
450 if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
451 mutex_lock(&pmc_reserve_mutex);
452 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
453 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
454 goto fail_unlock;
455 }
456 atomic_inc(&x86_pmu.lbr_exclusive[what]);
457 mutex_unlock(&pmc_reserve_mutex);
458 }
459
460 out:
461 atomic_inc(&active_events);
462 return 0;
463
464 fail_unlock:
465 mutex_unlock(&pmc_reserve_mutex);
466 return -EBUSY;
467 }
468
469 void x86_del_exclusive(unsigned int what)
470 {
471 atomic_dec(&active_events);
472
473 /*
474 * See the comment in x86_add_exclusive().
475 */
476 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
477 return;
478
479 atomic_dec(&x86_pmu.lbr_exclusive[what]);
480 }
481
482 int x86_setup_perfctr(struct perf_event *event)
483 {
484 struct perf_event_attr *attr = &event->attr;
485 struct hw_perf_event *hwc = &event->hw;
486 u64 config;
487
488 if (!is_sampling_event(event)) {
489 hwc->sample_period = x86_pmu.max_period;
490 hwc->last_period = hwc->sample_period;
491 local64_set(&hwc->period_left, hwc->sample_period);
492 }
493
494 if (attr->type == event->pmu->type)
495 return x86_pmu_extra_regs(event->attr.config, event);
496
497 if (attr->type == PERF_TYPE_HW_CACHE)
498 return set_ext_hw_attr(hwc, event);
499
500 if (attr->config >= x86_pmu.max_events)
501 return -EINVAL;
502
503 attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
504
505 /*
506 * The generic map:
507 */
508 config = x86_pmu.event_map(attr->config);
509
510 if (config == 0)
511 return -ENOENT;
512
513 if (config == -1LL)
514 return -EINVAL;
515
516 hwc->config |= config;
517
518 return 0;
519 }
520
521 /*
522 * check that branch_sample_type is compatible with
523 * settings needed for precise_ip > 1 which implies
524 * using the LBR to capture ALL taken branches at the
525 * priv levels of the measurement
526 */
527 static inline int precise_br_compat(struct perf_event *event)
528 {
529 u64 m = event->attr.branch_sample_type;
530 u64 b = 0;
531
532 /* must capture all branches */
533 if (!(m & PERF_SAMPLE_BRANCH_ANY))
534 return 0;
535
536 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
537
538 if (!event->attr.exclude_user)
539 b |= PERF_SAMPLE_BRANCH_USER;
540
541 if (!event->attr.exclude_kernel)
542 b |= PERF_SAMPLE_BRANCH_KERNEL;
543
544 /*
545 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
546 */
547
548 return m == b;
549 }
550
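/*
 * Highest supported precise_ip level: 1 = PEBS, 2 = PEBS plus IP fixup
 * (LBR or PEBS format >= 2), 3 = precise distribution support.
 */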
551 int x86_pmu_max_precise(void)
552 {
553 int precise = 0;
554
555 /* Support for constant skid */
556 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
557 precise++;
558
559 /* Support for IP fixup */
560 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
561 precise++;
562
563 if (x86_pmu.pebs_prec_dist)
564 precise++;
565 }
566 return precise;
567 }
568
569 int x86_pmu_hw_config(struct perf_event *event)
570 {
571 if (event->attr.precise_ip) {
572 int precise = x86_pmu_max_precise();
573
574 if (event->attr.precise_ip > precise)
575 return -EOPNOTSUPP;
576
577 /* There's no sense in having PEBS for non-sampling events: */
578 if (!is_sampling_event(event))
579 return -EINVAL;
580 }
581 /*
582 * check that PEBS LBR correction does not conflict with
583 * whatever the user is asking with attr->branch_sample_type
584 */
585 if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
586 u64 *br_type = &event->attr.branch_sample_type;
587
588 if (has_branch_stack(event)) {
589 if (!precise_br_compat(event))
590 return -EOPNOTSUPP;
591
592 /* branch_sample_type is compatible */
593
594 } else {
595 /*
596 * user did not specify branch_sample_type
597 *
598 * For PEBS fixups, we capture all
599 * the branches at the priv level of the
600 * event.
601 */
602 *br_type = PERF_SAMPLE_BRANCH_ANY;
603
604 if (!event->attr.exclude_user)
605 *br_type |= PERF_SAMPLE_BRANCH_USER;
606
607 if (!event->attr.exclude_kernel)
608 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
609 }
610 }
611
612 if (branch_sample_call_stack(event))
613 event->attach_state |= PERF_ATTACH_TASK_DATA;
614
615 /*
616 * Generate PMC IRQs:
617 * (keep 'enabled' bit clear for now)
618 */
619 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
620
621 /*
622 * Count user and OS events unless requested not to
623 */
624 if (!event->attr.exclude_user)
625 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
626 if (!event->attr.exclude_kernel)
627 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
628
629 if (event->attr.type == event->pmu->type)
630 event->hw.config |= x86_pmu_get_event_config(event);
631
632 if (!event->attr.freq && x86_pmu.limit_period) {
633 s64 left = event->attr.sample_period;
634 x86_pmu.limit_period(event, &left);
635 if (left > event->attr.sample_period)
636 return -EINVAL;
637 }
638
639 /* sample_regs_user never supports XMM registers */
640 if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
641 return -EINVAL;
642 /*
643 * Besides the general purpose registers, XMM registers may
644 * be collected in PEBS on some platforms, e.g. Icelake
645 */
646 if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
647 if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
648 return -EINVAL;
649
650 if (!event->attr.precise_ip)
651 return -EINVAL;
652 }
653
654 return x86_setup_perfctr(event);
655 }
656
657 /*
658 * Setup the hardware configuration for a given attr_type
659 */
660 static int __x86_pmu_event_init(struct perf_event *event)
661 {
662 int err;
663
664 if (!x86_pmu_initialized())
665 return -ENODEV;
666
667 err = x86_reserve_hardware();
668 if (err)
669 return err;
670
671 atomic_inc(&active_events);
672 event->destroy = hw_perf_event_destroy;
673
674 event->hw.idx = -1;
675 event->hw.last_cpu = -1;
676 event->hw.last_tag = ~0ULL;
677
678 /* mark unused */
679 event->hw.extra_reg.idx = EXTRA_REG_NONE;
680 event->hw.branch_reg.idx = EXTRA_REG_NONE;
681
682 return x86_pmu.hw_config(event);
683 }
684
685 void x86_pmu_disable_all(void)
686 {
687 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
688 int idx;
689
690 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
691 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
692 u64 val;
693
694 if (!test_bit(idx, cpuc->active_mask))
695 continue;
696 rdmsrl(x86_pmu_config_addr(idx), val);
697 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
698 continue;
699 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
700 wrmsrl(x86_pmu_config_addr(idx), val);
701 if (is_counter_pair(hwc))
702 wrmsrl(x86_pmu_config_addr(idx + 1), 0);
703 }
704 }
705
706 struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
707 {
708 return static_call(x86_pmu_guest_get_msrs)(nr, data);
709 }
710 EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
711
712 /*
713 * There may be PMI landing after enabled=0. The PMI hitting could be before or
714 * after disable_all.
715 *
716 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
717 * It will not be re-enabled in the NMI handler again, because enabled=0. After
718 * handling the NMI, disable_all will be called, which will not change the
719 * state either. If PMI hits after disable_all, the PMU is already disabled
720 * before entering NMI handler. The NMI handler will not change the state
721 * either.
722 *
723 * So either situation is harmless.
724 */
725 static void x86_pmu_disable(struct pmu *pmu)
726 {
727 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
728
729 if (!x86_pmu_initialized())
730 return;
731
732 if (!cpuc->enabled)
733 return;
734
735 cpuc->n_added = 0;
736 cpuc->enabled = 0;
737 barrier();
738
739 static_call(x86_pmu_disable_all)();
740 }
741
742 void x86_pmu_enable_all(int added)
743 {
744 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
745 int idx;
746
747 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
748 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
749
750 if (!test_bit(idx, cpuc->active_mask))
751 continue;
752
753 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
754 }
755 }
756
757 static inline int is_x86_event(struct perf_event *event)
758 {
759 int i;
760
761 if (!is_hybrid())
762 return event->pmu == &pmu;
763
764 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
765 if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
766 return true;
767 }
768
769 return false;
770 }
771
772 struct pmu *x86_get_pmu(unsigned int cpu)
773 {
774 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
775
776 /*
777 * All CPUs of the hybrid type have gone offline;
778 * x86_get_pmu() should not be invoked in that case.
779 */
780 if (WARN_ON_ONCE(!cpuc->pmu))
781 return &pmu;
782
783 return cpuc->pmu;
784 }
785 /*
786 * Event scheduler state:
787 *
788 * Assign events iterating over all events and counters, beginning
789 * with events with least weights first. Keep the current iterator
790 * state in struct sched_state.
791 */
792 struct sched_state {
793 int weight;
794 int event; /* event index */
795 int counter; /* counter index */
796 int unassigned; /* number of events to be assigned left */
797 int nr_gp; /* number of GP counters used */
798 u64 used;
799 };
800
801 /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
802 #define SCHED_STATES_MAX 2
803
804 struct perf_sched {
805 int max_weight;
806 int max_events;
807 int max_gp;
808 int saved_states;
809 struct event_constraint **constraints;
810 struct sched_state state;
811 struct sched_state saved[SCHED_STATES_MAX];
812 };
813
814 /*
815 * Initialize iterator that runs through all events and counters.
816 */
817 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
818 int num, int wmin, int wmax, int gpmax)
819 {
820 int idx;
821
822 memset(sched, 0, sizeof(*sched));
823 sched->max_events = num;
824 sched->max_weight = wmax;
825 sched->max_gp = gpmax;
826 sched->constraints = constraints;
827
828 for (idx = 0; idx < num; idx++) {
829 if (constraints[idx]->weight == wmin)
830 break;
831 }
832
833 sched->state.event = idx; /* start with min weight */
834 sched->state.weight = wmin;
835 sched->state.unassigned = num;
836 }
837
838 static void perf_sched_save_state(struct perf_sched *sched)
839 {
840 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
841 return;
842
843 sched->saved[sched->saved_states] = sched->state;
844 sched->saved_states++;
845 }
846
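/*
 * Backtrack to the most recently saved state (only saved for events
 * with an overlapping constraint) and retry with the next counter.
 */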
847 static bool perf_sched_restore_state(struct perf_sched *sched)
848 {
849 if (!sched->saved_states)
850 return false;
851
852 sched->saved_states--;
853 sched->state = sched->saved[sched->saved_states];
854
855 /* this assignment didn't work out */
856 /* XXX broken vs EVENT_PAIR */
857 sched->state.used &= ~BIT_ULL(sched->state.counter);
858
859 /* try the next one */
860 sched->state.counter++;
861
862 return true;
863 }
864
865 /*
866 * Select a counter for the current event to schedule. Return true on
867 * success.
868 */
869 static bool __perf_sched_find_counter(struct perf_sched *sched)
870 {
871 struct event_constraint *c;
872 int idx;
873
874 if (!sched->state.unassigned)
875 return false;
876
877 if (sched->state.event >= sched->max_events)
878 return false;
879
880 c = sched->constraints[sched->state.event];
881 /* Prefer fixed purpose counters */
882 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
883 idx = INTEL_PMC_IDX_FIXED;
884 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
885 u64 mask = BIT_ULL(idx);
886
887 if (sched->state.used & mask)
888 continue;
889
890 sched->state.used |= mask;
891 goto done;
892 }
893 }
894
895 /* Grab the first unused counter starting with idx */
896 idx = sched->state.counter;
897 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
898 u64 mask = BIT_ULL(idx);
899
900 if (c->flags & PERF_X86_EVENT_PAIR)
901 mask |= mask << 1;
902
903 if (sched->state.used & mask)
904 continue;
905
906 if (sched->state.nr_gp++ >= sched->max_gp)
907 return false;
908
909 sched->state.used |= mask;
910 goto done;
911 }
912
913 return false;
914
915 done:
916 sched->state.counter = idx;
917
918 if (c->overlap)
919 perf_sched_save_state(sched);
920
921 return true;
922 }
923
924 static bool perf_sched_find_counter(struct perf_sched *sched)
925 {
926 while (!__perf_sched_find_counter(sched)) {
927 if (!perf_sched_restore_state(sched))
928 return false;
929 }
930
931 return true;
932 }
933
934 /*
935 * Go through all unassigned events and find the next one to schedule.
936 * Take events with the least weight first. Return true on success.
937 */
938 static bool perf_sched_next_event(struct perf_sched *sched)
939 {
940 struct event_constraint *c;
941
942 if (!sched->state.unassigned || !--sched->state.unassigned)
943 return false;
944
945 do {
946 /* next event */
947 sched->state.event++;
948 if (sched->state.event >= sched->max_events) {
949 /* next weight */
950 sched->state.event = 0;
951 sched->state.weight++;
952 if (sched->state.weight > sched->max_weight)
953 return false;
954 }
955 c = sched->constraints[sched->state.event];
956 } while (c->weight != sched->state.weight);
957
958 sched->state.counter = 0; /* start with first counter */
959
960 return true;
961 }
962
963 /*
964 * Assign a counter for each event.
965 */
966 int perf_assign_events(struct event_constraint **constraints, int n,
967 int wmin, int wmax, int gpmax, int *assign)
968 {
969 struct perf_sched sched;
970
971 perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
972
973 do {
974 if (!perf_sched_find_counter(&sched))
975 break; /* failed */
976 if (assign)
977 assign[sched.state.event] = sched.state.counter;
978 } while (perf_sched_next_event(&sched));
979
980 return sched.state.unassigned;
981 }
982 EXPORT_SYMBOL_GPL(perf_assign_events);
983
984 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
985 {
986 struct event_constraint *c;
987 struct perf_event *e;
988 int n0, i, wmin, wmax, unsched = 0;
989 struct hw_perf_event *hwc;
990 u64 used_mask = 0;
991
992 /*
993 * Compute the number of events already present; see x86_pmu_add(),
994 * validate_group() and x86_pmu_commit_txn(). For the former two
995 * cpuc->n_events hasn't been updated yet, while for the latter
996 * cpuc->n_txn contains the number of events added in the current
997 * transaction.
998 */
999 n0 = cpuc->n_events;
1000 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1001 n0 -= cpuc->n_txn;
1002
1003 static_call_cond(x86_pmu_start_scheduling)(cpuc);
1004
1005 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
1006 c = cpuc->event_constraint[i];
1007
1008 /*
1009 * Previously scheduled events should have a cached constraint,
1010 * while new events should not have one.
1011 */
1012 WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
1013
1014 /*
1015 * Request constraints for new events; or for those events that
1016 * have a dynamic constraint -- for those the constraint can
1017 * change due to external factors (sibling state, allow_tfa).
1018 */
1019 if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
1020 c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
1021 cpuc->event_constraint[i] = c;
1022 }
1023
1024 wmin = min(wmin, c->weight);
1025 wmax = max(wmax, c->weight);
1026 }
1027
1028 /*
1029 * fastpath, try to reuse previous register
1030 */
1031 for (i = 0; i < n; i++) {
1032 u64 mask;
1033
1034 hwc = &cpuc->event_list[i]->hw;
1035 c = cpuc->event_constraint[i];
1036
1037 /* never assigned */
1038 if (hwc->idx == -1)
1039 break;
1040
1041 /* constraint still honored */
1042 if (!test_bit(hwc->idx, c->idxmsk))
1043 break;
1044
1045 mask = BIT_ULL(hwc->idx);
1046 if (is_counter_pair(hwc))
1047 mask |= mask << 1;
1048
1049 /* not already used */
1050 if (used_mask & mask)
1051 break;
1052
1053 used_mask |= mask;
1054
1055 if (assign)
1056 assign[i] = hwc->idx;
1057 }
1058
1059 /* slow path */
1060 if (i != n) {
1061 int gpmax = x86_pmu_max_num_counters(cpuc->pmu);
1062
1063 /*
1064 * Do not allow scheduling of more than half the available
1065 * generic counters.
1066 *
1067 * This helps avoid counter starvation of sibling thread by
1068 * ensuring at most half the counters cannot be in exclusive
1069 * mode. There are no designated counters for the limits. Any
1070 * N/2 counters can be used. This helps with events with
1071 * specific counter constraints.
1072 */
1073 if (is_ht_workaround_enabled() && !cpuc->is_fake &&
1074 READ_ONCE(cpuc->excl_cntrs->exclusive_present))
1075 gpmax /= 2;
1076
1077 /*
1078 * Reduce the amount of available counters to allow fitting
1079 * the extra Merge events needed by large increment events.
1080 */
1081 if (x86_pmu.flags & PMU_FL_PAIR) {
1082 gpmax -= cpuc->n_pair;
1083 WARN_ON(gpmax <= 0);
1084 }
1085
1086 unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
1087 wmax, gpmax, assign);
1088 }
1089
1090 /*
1091 * In case of success (unsched = 0), mark events as committed,
1092 * so we do not put_constraint() in case new events are added
1093 * and fail to be scheduled
1094 *
1095 * We invoke the lower level commit callback to lock the resource
1096 *
1097 * We do not need to do all of this in case we are called to
1098 * validate an event group (assign == NULL)
1099 */
1100 if (!unsched && assign) {
1101 for (i = 0; i < n; i++)
1102 static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
1103 } else {
1104 for (i = n0; i < n; i++) {
1105 e = cpuc->event_list[i];
1106
1107 /*
1108 * release events that failed scheduling
1109 */
1110 static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
1111
1112 cpuc->event_constraint[i] = NULL;
1113 }
1114 }
1115
1116 static_call_cond(x86_pmu_stop_scheduling)(cpuc);
1117
1118 return unsched ? -EINVAL : 0;
1119 }
1120
1121 static int add_nr_metric_event(struct cpu_hw_events *cpuc,
1122 struct perf_event *event)
1123 {
1124 if (is_metric_event(event)) {
1125 if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
1126 return -EINVAL;
1127 cpuc->n_metric++;
1128 cpuc->n_txn_metric++;
1129 }
1130
1131 return 0;
1132 }
1133
1134 static void del_nr_metric_event(struct cpu_hw_events *cpuc,
1135 struct perf_event *event)
1136 {
1137 if (is_metric_event(event))
1138 cpuc->n_metric--;
1139 }
1140
1141 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
1142 int max_count, int n)
1143 {
1144 union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
1145
1146 if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
1147 return -EINVAL;
1148
1149 if (n >= max_count + cpuc->n_metric)
1150 return -EINVAL;
1151
1152 cpuc->event_list[n] = event;
1153 if (is_counter_pair(&event->hw)) {
1154 cpuc->n_pair++;
1155 cpuc->n_txn_pair++;
1156 }
1157
1158 return 0;
1159 }
1160
1161 /*
1162 * dogrp: true if we must collect sibling events (group)
1163 * returns total number of events and error code
1164 */
1165 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1166 {
1167 struct perf_event *event;
1168 int n, max_count;
1169
1170 max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);
1171
1172 /* current number of events already accepted */
1173 n = cpuc->n_events;
1174 if (!cpuc->n_events)
1175 cpuc->pebs_output = 0;
1176
1177 if (!cpuc->is_fake && leader->attr.precise_ip) {
1178 /*
1179 * For PEBS->PT, if !aux_event, the group leader (PT) went
1180 * away, the group was broken down and this singleton event
1181 * can't schedule any more.
1182 */
1183 if (is_pebs_pt(leader) && !leader->aux_event)
1184 return -EINVAL;
1185
1186 /*
1187 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
1188 */
1189 if (cpuc->pebs_output &&
1190 cpuc->pebs_output != is_pebs_pt(leader) + 1)
1191 return -EINVAL;
1192
1193 cpuc->pebs_output = is_pebs_pt(leader) + 1;
1194 }
1195
1196 if (is_x86_event(leader)) {
1197 if (collect_event(cpuc, leader, max_count, n))
1198 return -EINVAL;
1199 n++;
1200 }
1201
1202 if (!dogrp)
1203 return n;
1204
1205 for_each_sibling_event(event, leader) {
1206 if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
1207 continue;
1208
1209 if (collect_event(cpuc, event, max_count, n))
1210 return -EINVAL;
1211
1212 n++;
1213 }
1214 return n;
1215 }
1216
1217 static inline void x86_assign_hw_event(struct perf_event *event,
1218 struct cpu_hw_events *cpuc, int i)
1219 {
1220 struct hw_perf_event *hwc = &event->hw;
1221 int idx;
1222
1223 idx = hwc->idx = cpuc->assign[i];
1224 hwc->last_cpu = smp_processor_id();
1225 hwc->last_tag = ++cpuc->tags[i];
1226
1227 static_call_cond(x86_pmu_assign)(event, idx);
1228
1229 switch (hwc->idx) {
1230 case INTEL_PMC_IDX_FIXED_BTS:
1231 case INTEL_PMC_IDX_FIXED_VLBR:
1232 hwc->config_base = 0;
1233 hwc->event_base = 0;
1234 break;
1235
1236 case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
1237 /* All the metric events are mapped onto the fixed counter 3. */
1238 idx = INTEL_PMC_IDX_FIXED_SLOTS;
1239 fallthrough;
1240 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
1241 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1242 hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
1243 hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
1244 INTEL_PMC_FIXED_RDPMC_BASE;
1245 break;
1246
1247 default:
1248 hwc->config_base = x86_pmu_config_addr(hwc->idx);
1249 hwc->event_base = x86_pmu_event_addr(hwc->idx);
1250 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
1251 break;
1252 }
1253 }
1254
1255 /**
1256 * x86_perf_rdpmc_index - Return PMC counter used for event
1257 * @event: the perf_event to which the PMC counter was assigned
1258 *
1259 * The counter assigned to this performance event may change if interrupts
1260 * are enabled. This counter should thus never be used while interrupts are
1261 * enabled. Before this function is used to obtain the assigned counter the
1262 * event should be checked for validity using, for example,
1263 * perf_event_read_local(), within the same interrupt disabled section in
1264 * which this counter is planned to be used.
1265 *
1266 * Return: The index of the performance monitoring counter assigned to
1267 * @perf_event.
1268 */
1269 int x86_perf_rdpmc_index(struct perf_event *event)
1270 {
1271 lockdep_assert_irqs_disabled();
1272
1273 return event->hw.event_base_rdpmc;
1274 }
1275
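/*
 * An event may keep its previous counter only if it stayed on the same
 * CPU and no other event has used that counter in the meantime (the
 * per-slot tag would differ).
 */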
1276 static inline int match_prev_assignment(struct hw_perf_event *hwc,
1277 struct cpu_hw_events *cpuc,
1278 int i)
1279 {
1280 return hwc->idx == cpuc->assign[i] &&
1281 hwc->last_cpu == smp_processor_id() &&
1282 hwc->last_tag == cpuc->tags[i];
1283 }
1284
1285 static void x86_pmu_start(struct perf_event *event, int flags);
1286
1287 static void x86_pmu_enable(struct pmu *pmu)
1288 {
1289 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1290 struct perf_event *event;
1291 struct hw_perf_event *hwc;
1292 int i, added = cpuc->n_added;
1293
1294 if (!x86_pmu_initialized())
1295 return;
1296
1297 if (cpuc->enabled)
1298 return;
1299
1300 if (cpuc->n_added) {
1301 int n_running = cpuc->n_events - cpuc->n_added;
1302
1303 /*
1304 * The late setup (after counters are scheduled)
1305 * is required for some cases, e.g., PEBS counters
1306 * snapshotting, because an accurate counter index
1307 * is needed.
1308 */
1309 static_call_cond(x86_pmu_late_setup)();
1310
1311 /*
1312 * apply assignment obtained either from
1313 * hw_perf_group_sched_in() or x86_pmu_enable()
1314 *
1315 * step1: save events moving to new counters
1316 */
1317 for (i = 0; i < n_running; i++) {
1318 event = cpuc->event_list[i];
1319 hwc = &event->hw;
1320
1321 /*
1322 * we can avoid reprogramming counter if:
1323 * - assigned same counter as last time
1324 * - running on same CPU as last time
1325 * - no other event has used the counter since
1326 */
1327 if (hwc->idx == -1 ||
1328 match_prev_assignment(hwc, cpuc, i))
1329 continue;
1330
1331 /*
1332 * Ensure we don't accidentally enable a stopped
1333 * counter simply because we rescheduled.
1334 */
1335 if (hwc->state & PERF_HES_STOPPED)
1336 hwc->state |= PERF_HES_ARCH;
1337
1338 x86_pmu_stop(event, PERF_EF_UPDATE);
1339 }
1340
1341 /*
1342 * step2: reprogram moved events into new counters
1343 */
1344 for (i = 0; i < cpuc->n_events; i++) {
1345 event = cpuc->event_list[i];
1346 hwc = &event->hw;
1347
1348 if (!match_prev_assignment(hwc, cpuc, i))
1349 x86_assign_hw_event(event, cpuc, i);
1350 else if (i < n_running)
1351 continue;
1352
1353 if (hwc->state & PERF_HES_ARCH)
1354 continue;
1355
1356 /*
1357 * if cpuc->enabled = 0, then no wrmsr as
1358 * per x86_pmu_enable_event()
1359 */
1360 x86_pmu_start(event, PERF_EF_RELOAD);
1361 }
1362 cpuc->n_added = 0;
1363 perf_events_lapic_init();
1364 }
1365
1366 cpuc->enabled = 1;
1367 barrier();
1368
1369 static_call(x86_pmu_enable_all)(added);
1370 }
1371
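/* Last period programmed into each counter; reported by perf_event_print_debug(). */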
1372 DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1373
1374 /*
1375 * Set the next IRQ period, based on the hwc->period_left value.
1376 * To be called with the event disabled in hw:
1377 */
1378 int x86_perf_event_set_period(struct perf_event *event)
1379 {
1380 struct hw_perf_event *hwc = &event->hw;
1381 s64 left = local64_read(&hwc->period_left);
1382 s64 period = hwc->sample_period;
1383 int ret = 0, idx = hwc->idx;
1384
1385 if (unlikely(!hwc->event_base))
1386 return 0;
1387
1388 /*
1389 * If we are way outside a reasonable range then just skip forward:
1390 */
1391 if (unlikely(left <= -period)) {
1392 left = period;
1393 local64_set(&hwc->period_left, left);
1394 hwc->last_period = period;
1395 ret = 1;
1396 }
1397
1398 if (unlikely(left <= 0)) {
1399 left += period;
1400 local64_set(&hwc->period_left, left);
1401 hwc->last_period = period;
1402 ret = 1;
1403 }
1404 /*
1405 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1406 */
1407 if (unlikely(left < 2))
1408 left = 2;
1409
1410 if (left > x86_pmu.max_period)
1411 left = x86_pmu.max_period;
1412
1413 static_call_cond(x86_pmu_limit_period)(event, &left);
1414
1415 this_cpu_write(pmc_prev_left[idx], left);
1416
1417 /*
1418 * The hw event starts counting from this event offset,
1419 * mark it to be able to extra future deltas:
1420 */
1421 local64_set(&hwc->prev_count, (u64)-left);
1422
1423 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1424
1425 /*
1426 * Sign extend the Merge event counter's upper 16 bits since
1427 * we currently declare a 48-bit counter width
1428 */
1429 if (is_counter_pair(hwc))
1430 wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
1431
1432 perf_event_update_userpage(event);
1433
1434 return ret;
1435 }
1436
1437 void x86_pmu_enable_event(struct perf_event *event)
1438 {
1439 if (__this_cpu_read(cpu_hw_events.enabled))
1440 __x86_pmu_enable_event(&event->hw,
1441 ARCH_PERFMON_EVENTSEL_ENABLE);
1442 }
1443
1444 /*
1445 * Add a single event to the PMU.
1446 *
1447 * The event is added to the group of enabled events
1448 * but only if it can be scheduled with existing events.
1449 */
1450 static int x86_pmu_add(struct perf_event *event, int flags)
1451 {
1452 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1453 struct hw_perf_event *hwc;
1454 int assign[X86_PMC_IDX_MAX];
1455 int n, n0, ret;
1456
1457 hwc = &event->hw;
1458
1459 n0 = cpuc->n_events;
1460 ret = n = collect_events(cpuc, event, false);
1461 if (ret < 0)
1462 goto out;
1463
1464 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1465 if (!(flags & PERF_EF_START))
1466 hwc->state |= PERF_HES_ARCH;
1467
1468 /*
1469 * If group events scheduling transaction was started,
1470 * skip the schedulability test here, it will be performed
1471 * at commit time (->commit_txn) as a whole.
1472 *
1473 * If commit fails, we'll call ->del() on all events
1474 * for which ->add() was called.
1475 */
1476 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1477 goto done_collect;
1478
1479 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
1480 if (ret)
1481 goto out;
1482 /*
1483 * copy new assignment, now we know it is possible
1484 * will be used by hw_perf_enable()
1485 */
1486 memcpy(cpuc->assign, assign, n*sizeof(int));
1487
1488 done_collect:
1489 /*
1490 * Commit the collect_events() state. See x86_pmu_del() and
1491 * x86_pmu_*_txn().
1492 */
1493 cpuc->n_events = n;
1494 cpuc->n_added += n - n0;
1495 cpuc->n_txn += n - n0;
1496
1497 /*
1498 * This is before x86_pmu_enable() will call x86_pmu_start(),
1499 * so we enable LBRs before an event needs them etc..
1500 */
1501 static_call_cond(x86_pmu_add)(event);
1502
1503 ret = 0;
1504 out:
1505 return ret;
1506 }
1507
1508 static void x86_pmu_start(struct perf_event *event, int flags)
1509 {
1510 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1511 int idx = event->hw.idx;
1512
1513 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1514 return;
1515
1516 if (WARN_ON_ONCE(idx == -1))
1517 return;
1518
1519 if (flags & PERF_EF_RELOAD) {
1520 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1521 static_call(x86_pmu_set_period)(event);
1522 }
1523
1524 event->hw.state = 0;
1525
1526 cpuc->events[idx] = event;
1527 __set_bit(idx, cpuc->active_mask);
1528 static_call(x86_pmu_enable)(event);
1529 perf_event_update_userpage(event);
1530 }
1531
1532 void perf_event_print_debug(void)
1533 {
1534 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1535 unsigned long *cntr_mask, *fixed_cntr_mask;
1536 struct event_constraint *pebs_constraints;
1537 struct cpu_hw_events *cpuc;
1538 u64 pebs, debugctl;
1539 int cpu, idx;
1540
1541 guard(irqsave)();
1542
1543 cpu = smp_processor_id();
1544 cpuc = &per_cpu(cpu_hw_events, cpu);
1545 cntr_mask = hybrid(cpuc->pmu, cntr_mask);
1546 fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
1547 pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
1548
1549 if (!*(u64 *)cntr_mask)
1550 return;
1551
1552 if (x86_pmu.version >= 2) {
1553 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1554 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1555 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1556 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1557
1558 pr_info("\n");
1559 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1560 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1561 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1562 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1563 if (pebs_constraints) {
1564 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1565 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1566 }
1567 if (x86_pmu.lbr_nr) {
1568 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1569 pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
1570 }
1571 }
1572 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1573
1574 for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
1575 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1576 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1577
1578 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1579
1580 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1581 cpu, idx, pmc_ctrl);
1582 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1583 cpu, idx, pmc_count);
1584 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1585 cpu, idx, prev_left);
1586 }
1587 for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
1588 if (fixed_counter_disabled(idx, cpuc->pmu))
1589 continue;
1590 rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count);
1591
1592 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1593 cpu, idx, pmc_count);
1594 }
1595 }
1596
1597 void x86_pmu_stop(struct perf_event *event, int flags)
1598 {
1599 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1600 struct hw_perf_event *hwc = &event->hw;
1601
1602 if (test_bit(hwc->idx, cpuc->active_mask)) {
1603 static_call(x86_pmu_disable)(event);
1604 __clear_bit(hwc->idx, cpuc->active_mask);
1605 cpuc->events[hwc->idx] = NULL;
1606 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1607 hwc->state |= PERF_HES_STOPPED;
1608 }
1609
1610 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1611 /*
1612 * Drain the remaining delta count out of an event
1613 * that we are disabling:
1614 */
1615 static_call(x86_pmu_update)(event);
1616 hwc->state |= PERF_HES_UPTODATE;
1617 }
1618 }
1619
1620 static void x86_pmu_del(struct perf_event *event, int flags)
1621 {
1622 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1623 union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
1624 int i;
1625
1626 /*
1627 * If we're called during a txn, we only need to undo x86_pmu.add.
1628 * The events never got scheduled and ->cancel_txn will truncate
1629 * the event_list.
1630 *
1631 * XXX assumes any ->del() called during a TXN will only be on
1632 * an event added during that same TXN.
1633 */
1634 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1635 goto do_del;
1636
1637 __set_bit(event->hw.idx, cpuc->dirty);
1638
1639 /*
1640 * Not a TXN, therefore cleanup properly.
1641 */
1642 x86_pmu_stop(event, PERF_EF_UPDATE);
1643
1644 for (i = 0; i < cpuc->n_events; i++) {
1645 if (event == cpuc->event_list[i])
1646 break;
1647 }
1648
1649 if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1650 return;
1651
1652 /* If we have a newly added event; make sure to decrease n_added. */
1653 if (i >= cpuc->n_events - cpuc->n_added)
1654 --cpuc->n_added;
1655
1656 static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
1657
1658 /* Delete the array entry. */
1659 while (++i < cpuc->n_events) {
1660 cpuc->event_list[i-1] = cpuc->event_list[i];
1661 cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1662 cpuc->assign[i-1] = cpuc->assign[i];
1663 }
1664 cpuc->event_constraint[i-1] = NULL;
1665 --cpuc->n_events;
1666 if (intel_cap.perf_metrics)
1667 del_nr_metric_event(cpuc, event);
1668
1669 perf_event_update_userpage(event);
1670
1671 do_del:
1672
1673 /*
1674 * This is after x86_pmu_stop(); so we disable LBRs after any
1675 * event can need them etc..
1676 */
1677 static_call_cond(x86_pmu_del)(event);
1678 }
1679
1680 int x86_pmu_handle_irq(struct pt_regs *regs)
1681 {
1682 struct perf_sample_data data;
1683 struct cpu_hw_events *cpuc;
1684 struct perf_event *event;
1685 int idx, handled = 0;
1686 u64 val;
1687
1688 cpuc = this_cpu_ptr(&cpu_hw_events);
1689
1690 /*
1691 * Some chipsets need to unmask the LVTPC in a particular spot
1692 * inside the nmi handler. As a result, the unmasking was pushed
1693 * into all the nmi handlers.
1694 *
1695 * This generic handler doesn't seem to have any issues where the
1696 * unmasking occurs so it was left at the top.
1697 */
1698 apic_write(APIC_LVTPC, APIC_DM_NMI);
1699
1700 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
1701 if (!test_bit(idx, cpuc->active_mask))
1702 continue;
1703
1704 event = cpuc->events[idx];
1705
1706 val = static_call(x86_pmu_update)(event);
1707 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1708 continue;
1709
1710 /*
1711 * event overflow
1712 */
1713 handled++;
1714
1715 if (!static_call(x86_pmu_set_period)(event))
1716 continue;
1717
1718 perf_sample_data_init(&data, 0, event->hw.last_period);
1719
1720 perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
1721
1722 if (perf_event_overflow(event, &data, regs))
1723 x86_pmu_stop(event, 0);
1724 }
1725
1726 if (handled)
1727 inc_irq_stat(apic_perf_irqs);
1728
1729 return handled;
1730 }
1731
1732 void perf_events_lapic_init(void)
1733 {
1734 if (!x86_pmu.apic || !x86_pmu_initialized())
1735 return;
1736
1737 /*
1738 * Always use NMI for PMU
1739 */
1740 apic_write(APIC_LVTPC, APIC_DM_NMI);
1741 }
1742
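/*
 * NMI entry point shared by all events on this PMI; it also measures
 * how long the handler took and feeds that into perf_sample_event_took().
 */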
1743 static int
1744 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1745 {
1746 u64 start_clock;
1747 u64 finish_clock;
1748 int ret;
1749
1750 /*
1751 * All PMUs/events that share this PMI handler should make sure to
1752 * increment active_events for their events.
1753 */
1754 if (!atomic_read(&active_events))
1755 return NMI_DONE;
1756
1757 start_clock = sched_clock();
1758 ret = static_call(x86_pmu_handle_irq)(regs);
1759 finish_clock = sched_clock();
1760
1761 perf_sample_event_took(finish_clock - start_clock);
1762
1763 return ret;
1764 }
1765 NOKPROBE_SYMBOL(perf_event_nmi_handler);
1766
1767 struct event_constraint emptyconstraint;
1768 struct event_constraint unconstrained;
1769
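/*
 * CPU hotplug callbacks. kfree_on_online[] entries are cleared at
 * prepare time and freed once the CPU is fully online.
 */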
1770 static int x86_pmu_prepare_cpu(unsigned int cpu)
1771 {
1772 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1773 int i;
1774
1775 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1776 cpuc->kfree_on_online[i] = NULL;
1777 if (x86_pmu.cpu_prepare)
1778 return x86_pmu.cpu_prepare(cpu);
1779 return 0;
1780 }
1781
1782 static int x86_pmu_dead_cpu(unsigned int cpu)
1783 {
1784 if (x86_pmu.cpu_dead)
1785 x86_pmu.cpu_dead(cpu);
1786 return 0;
1787 }
1788
1789 static int x86_pmu_online_cpu(unsigned int cpu)
1790 {
1791 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1792 int i;
1793
1794 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1795 kfree(cpuc->kfree_on_online[i]);
1796 cpuc->kfree_on_online[i] = NULL;
1797 }
1798 return 0;
1799 }
1800
1801 static int x86_pmu_starting_cpu(unsigned int cpu)
1802 {
1803 if (x86_pmu.cpu_starting)
1804 x86_pmu.cpu_starting(cpu);
1805 return 0;
1806 }
1807
1808 static int x86_pmu_dying_cpu(unsigned int cpu)
1809 {
1810 if (x86_pmu.cpu_dying)
1811 x86_pmu.cpu_dying(cpu);
1812 return 0;
1813 }
1814
1815 static void __init pmu_check_apic(void)
1816 {
1817 if (boot_cpu_has(X86_FEATURE_APIC))
1818 return;
1819
1820 x86_pmu.apic = 0;
1821 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1822 pr_info("no hardware sampling interrupt available.\n");
1823
1824 /*
1825 * If we have a PMU initialized but no APIC
1826 * interrupts, we cannot sample hardware
1827 * events (user-space has to fall back and
1828 * sample via a hrtimer based software event):
1829 */
1830 pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1831
1832 }
1833
1834 static struct attribute_group x86_pmu_format_group __ro_after_init = {
1835 .name = "format",
1836 .attrs = NULL,
1837 };
1838
1839 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
1840 {
1841 struct perf_pmu_events_attr *pmu_attr =
1842 container_of(attr, struct perf_pmu_events_attr, attr);
1843 u64 config = 0;
1844
1845 if (pmu_attr->id < x86_pmu.max_events)
1846 config = x86_pmu.event_map(pmu_attr->id);
1847
1848 /* string trumps id */
1849 if (pmu_attr->event_str)
1850 return sprintf(page, "%s\n", pmu_attr->event_str);
1851
1852 return x86_pmu.events_sysfs_show(page, config);
1853 }
1854 EXPORT_SYMBOL_GPL(events_sysfs_show);
1855
1856 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
1857 char *page)
1858 {
1859 struct perf_pmu_events_ht_attr *pmu_attr =
1860 container_of(attr, struct perf_pmu_events_ht_attr, attr);
1861
1862 /*
1863 * Report conditional events depending on Hyper-Threading.
1864 *
1865 * This is overly conservative as usually the HT special
1866 * handling is not needed if the other CPU thread is idle.
1867 *
1868 * Note this does not (and cannot) handle the case when thread
1869 * siblings are invisible, for example with virtualization
1870 * if they are owned by some other guest. The user tool
1871 * has to re-read when a thread sibling gets onlined later.
1872 */
1873 return sprintf(page, "%s",
1874 topology_max_smt_threads() > 1 ?
1875 pmu_attr->event_str_ht :
1876 pmu_attr->event_str_noht);
1877 }
1878
1879 ssize_t events_hybrid_sysfs_show(struct device *dev,
1880 struct device_attribute *attr,
1881 char *page)
1882 {
1883 struct perf_pmu_events_hybrid_attr *pmu_attr =
1884 container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
1885 struct x86_hybrid_pmu *pmu;
1886 const char *str, *next_str;
1887 int i;
1888
1889 if (hweight64(pmu_attr->pmu_type) == 1)
1890 return sprintf(page, "%s", pmu_attr->event_str);
1891
1892 /*
1893 * Hybrid PMUs may support the same event name, but with different
1894 * event encoding, e.g., the mem-loads event on an Atom PMU has
1895 * different event encoding from a Core PMU.
1896 *
1897 * The event_str includes all event encodings. Each event encoding
1898 * is divided by ";". The order of the event encodings must follow
1899 * the order of the hybrid PMU index.
1900 */
1901 pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
1902
1903 str = pmu_attr->event_str;
1904 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
1905 if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
1906 continue;
1907 if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
1908 next_str = strchr(str, ';');
1909 if (next_str)
1910 return snprintf(page, next_str - str + 1, "%s", str);
1911 else
1912 return sprintf(page, "%s", str);
1913 }
1914 str = strchr(str, ';');
1915 str++;
1916 }
1917
1918 return 0;
1919 }
1920 EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
1921
1922 EVENT_ATTR(cpu-cycles, CPU_CYCLES );
1923 EVENT_ATTR(instructions, INSTRUCTIONS );
1924 EVENT_ATTR(cache-references, CACHE_REFERENCES );
1925 EVENT_ATTR(cache-misses, CACHE_MISSES );
1926 EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
1927 EVENT_ATTR(branch-misses, BRANCH_MISSES );
1928 EVENT_ATTR(bus-cycles, BUS_CYCLES );
1929 EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
1930 EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
1931 EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
1932
1933 static struct attribute *empty_attrs;
1934
1935 static struct attribute *events_attr[] = {
1936 EVENT_PTR(CPU_CYCLES),
1937 EVENT_PTR(INSTRUCTIONS),
1938 EVENT_PTR(CACHE_REFERENCES),
1939 EVENT_PTR(CACHE_MISSES),
1940 EVENT_PTR(BRANCH_INSTRUCTIONS),
1941 EVENT_PTR(BRANCH_MISSES),
1942 EVENT_PTR(BUS_CYCLES),
1943 EVENT_PTR(STALLED_CYCLES_FRONTEND),
1944 EVENT_PTR(STALLED_CYCLES_BACKEND),
1945 EVENT_PTR(REF_CPU_CYCLES),
1946 NULL,
1947 };
1948
1949 /*
1950 * Filter all undefined events (x86_pmu.event_map(id) == 0)
1951 * out of the events_attr attributes.
1952 */
1953 static umode_t
1954 is_visible(struct kobject *kobj, struct attribute *attr, int idx)
1955 {
1956 struct perf_pmu_events_attr *pmu_attr;
1957
1958 if (idx >= x86_pmu.max_events)
1959 return 0;
1960
1961 pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
1962 /* str trumps id */
1963 return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
1964 }
1965
1966 static struct attribute_group x86_pmu_events_group __ro_after_init = {
1967 .name = "events",
1968 .attrs = events_attr,
1969 .is_visible = is_visible,
1970 };
1971
1972 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1973 {
1974 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1975 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1976 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1977 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1978 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
1979 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
1980 ssize_t ret;
1981
1982 /*
1983 * We have a whole page to spend and only a little data
1984 * to write, so we can safely use sprintf.
1985 */
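/*
 * Illustrative example (hypothetical values): event 0x3c with umask 0x01
 * and the edge bit set is rendered as "event=0x3c,umask=0x01,edge\n".
 */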
1986 ret = sprintf(page, "event=0x%02llx", event);
1987
1988 if (umask)
1989 ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1990
1991 if (edge)
1992 ret += sprintf(page + ret, ",edge");
1993
1994 if (pc)
1995 ret += sprintf(page + ret, ",pc");
1996
1997 if (any)
1998 ret += sprintf(page + ret, ",any");
1999
2000 if (inv)
2001 ret += sprintf(page + ret, ",inv");
2002
2003 if (cmask)
2004 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
2005
2006 ret += sprintf(page + ret, "\n");
2007
2008 return ret;
2009 }
2010
2011 static struct attribute_group x86_pmu_attr_group;
2012 static struct attribute_group x86_pmu_caps_group;
2013
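/*
 * Point the x86_pmu_* static calls at whatever callbacks the vendor
 * driver filled into x86_pmu; a NULL member simply leaves the
 * corresponding static call as a do-nothing NULL call.
 */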
2014 static void x86_pmu_static_call_update(void)
2015 {
2016 static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
2017 static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
2018 static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
2019 static_call_update(x86_pmu_enable, x86_pmu.enable);
2020 static_call_update(x86_pmu_disable, x86_pmu.disable);
2021
2022 static_call_update(x86_pmu_assign, x86_pmu.assign);
2023
2024 static_call_update(x86_pmu_add, x86_pmu.add);
2025 static_call_update(x86_pmu_del, x86_pmu.del);
2026 static_call_update(x86_pmu_read, x86_pmu.read);
2027
2028 static_call_update(x86_pmu_set_period, x86_pmu.set_period);
2029 static_call_update(x86_pmu_update, x86_pmu.update);
2030 static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
2031
2032 static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
2033 static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
2034 static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
2035
2036 static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
2037 static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
2038 static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
2039
2040 static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
2041
2042 static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
2043 static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
2044
2045 static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
2046 static_call_update(x86_pmu_filter, x86_pmu.filter);
2047
2048 static_call_update(x86_pmu_late_setup, x86_pmu.late_setup);
2049 }
2050
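/*
 * Default x86_pmu.read() used when the vendor driver does not provide
 * one: just fold the current hardware count via the update static call.
 */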
2051 static void _x86_pmu_read(struct perf_event *event)
2052 {
2053 static_call(x86_pmu_update)(event);
2054 }
2055
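/*
 * Dump the basic PMU capabilities to the kernel log at boot; illustrative
 * output only (values and alignment differ per model):
 *
 *	... version:               5
 *	... bit width:             48
 *	... generic registers:     8
 *	... value mask:            0000ffffffffffff
 *	... max period:            00007fffffffffff
 *	... fixed-purpose events:  4
 *	... event mask:            0001000f000000ff
 */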
2056 void x86_pmu_show_pmu_cap(struct pmu *pmu)
2057 {
2058 pr_info("... version: %d\n", x86_pmu.version);
2059 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
2060 pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
2061 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
2062 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2063 pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
2064 pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
2065 }
2066
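/*
 * Early-initcall entry point: select the vendor PMU driver, apply model
 * quirks, fill in generic fallbacks, wire up the static calls, register
 * the CPU hotplug callbacks and finally register the "cpu" PMU (or one
 * PMU per core type on hybrid systems).
 */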
2067 static int __init init_hw_perf_events(void)
2068 {
2069 struct x86_pmu_quirk *quirk;
2070 int err;
2071
2072 pr_info("Performance Events: ");
2073
2074 switch (boot_cpu_data.x86_vendor) {
2075 case X86_VENDOR_INTEL:
2076 err = intel_pmu_init();
2077 break;
2078 case X86_VENDOR_AMD:
2079 err = amd_pmu_init();
2080 break;
2081 case X86_VENDOR_HYGON:
2082 err = amd_pmu_init();
2083 x86_pmu.name = "HYGON";
2084 break;
2085 case X86_VENDOR_ZHAOXIN:
2086 case X86_VENDOR_CENTAUR:
2087 err = zhaoxin_pmu_init();
2088 break;
2089 default:
2090 err = -ENOTSUPP;
2091 }
2092 if (err != 0) {
2093 pr_cont("no PMU driver, software events only.\n");
2094 err = 0;
2095 goto out_bad_pmu;
2096 }
2097
2098 pmu_check_apic();
2099
2100 /* sanity check that the hardware exists or is emulated */
2101 if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask))
2102 goto out_bad_pmu;
2103
2104 pr_cont("%s PMU driver.\n", x86_pmu.name);
2105
2106 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
2107
2108 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
2109 quirk->func();
2110
2111 if (!x86_pmu.intel_ctrl)
2112 x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
2113
2114 if (!x86_pmu.config_mask)
2115 x86_pmu.config_mask = X86_RAW_EVENT_MASK;
2116
2117 perf_events_lapic_init();
2118 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
2119
2120 unconstrained = (struct event_constraint)
2121 __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64,
2122 0, x86_pmu_num_counters(NULL), 0, 0);
2123
2124 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
2125
2126 if (!x86_pmu.events_sysfs_show)
2127 x86_pmu_events_group.attrs = &empty_attrs;
2128
2129 pmu.attr_update = x86_pmu.attr_update;
2130
2131 if (!is_hybrid())
2132 x86_pmu_show_pmu_cap(NULL);
2133
2134 if (!x86_pmu.read)
2135 x86_pmu.read = _x86_pmu_read;
2136
2137 if (!x86_pmu.guest_get_msrs)
2138 x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
2139
2140 if (!x86_pmu.set_period)
2141 x86_pmu.set_period = x86_perf_event_set_period;
2142
2143 if (!x86_pmu.update)
2144 x86_pmu.update = x86_perf_event_update;
2145
2146 x86_pmu_static_call_update();
2147
2148 /*
2149 * Install callbacks. Core will call them for each online
2150 * cpu.
2151 */
2152 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
2153 x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
2154 if (err)
2155 return err;
2156
2157 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
2158 "perf/x86:starting", x86_pmu_starting_cpu,
2159 x86_pmu_dying_cpu);
2160 if (err)
2161 goto out;
2162
2163 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
2164 x86_pmu_online_cpu, NULL);
2165 if (err)
2166 goto out1;
2167
2168 if (!is_hybrid()) {
2169 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
2170 if (err)
2171 goto out2;
2172 } else {
2173 struct x86_hybrid_pmu *hybrid_pmu;
2174 int i, j;
2175
2176 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
2177 hybrid_pmu = &x86_pmu.hybrid_pmu[i];
2178
2179 hybrid_pmu->pmu = pmu;
2180 hybrid_pmu->pmu.type = -1;
2181 hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
2182 hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
2183
2184 err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
2185 (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
2186 if (err)
2187 break;
2188 }
2189
2190 if (i < x86_pmu.num_hybrid_pmus) {
2191 for (j = 0; j < i; j++)
2192 perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
2193 pr_warn("Failed to register hybrid PMUs\n");
2194 kfree(x86_pmu.hybrid_pmu);
2195 x86_pmu.hybrid_pmu = NULL;
2196 x86_pmu.num_hybrid_pmus = 0;
2197 goto out2;
2198 }
2199 }
2200
2201 return 0;
2202
2203 out2:
2204 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
2205 out1:
2206 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
2207 out:
2208 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
2209 out_bad_pmu:
2210 memset(&x86_pmu, 0, sizeof(x86_pmu));
2211 return err;
2212 }
2213 early_initcall(init_hw_perf_events);
2214
2215 static void x86_pmu_read(struct perf_event *event)
2216 {
2217 static_call(x86_pmu_read)(event);
2218 }
2219
2220 /*
2221 * Start group events scheduling transaction
2222 * Set the flag so that pmu::enable() does not perform the
2223 * schedulability test; it will be performed at commit time.
2224 *
2225 * We only support PERF_PMU_TXN_ADD transactions. Save the
2226 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
2227 * transactions.
2228 */
2229 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
2230 {
2231 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2232
2233 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
2234
2235 cpuc->txn_flags = txn_flags;
2236 if (txn_flags & ~PERF_PMU_TXN_ADD)
2237 return;
2238
2239 perf_pmu_disable(pmu);
2240 __this_cpu_write(cpu_hw_events.n_txn, 0);
2241 __this_cpu_write(cpu_hw_events.n_txn_pair, 0);
2242 __this_cpu_write(cpu_hw_events.n_txn_metric, 0);
2243 }
2244
2245 /*
2246 * Stop group events scheduling transaction
2247 * Clear the flag and pmu::enable() will perform the
2248 * schedulability test.
2249 */
2250 static void x86_pmu_cancel_txn(struct pmu *pmu)
2251 {
2252 unsigned int txn_flags;
2253 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2254
2255 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2256
2257 txn_flags = cpuc->txn_flags;
2258 cpuc->txn_flags = 0;
2259 if (txn_flags & ~PERF_PMU_TXN_ADD)
2260 return;
2261
2262 /*
2263 * Truncate collected array by the number of events added in this
2264 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
2265 */
2266 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
2267 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
2268 __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
2269 __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
2270 perf_pmu_enable(pmu);
2271 }
2272
2273 /*
2274 * Commit group events scheduling transaction
2275 * Perform the group schedulability test as a whole
2276 * Return 0 on success
2277 *
2278 * Does not cancel the transaction on failure; expects the caller to do this.
2279 */
2280 static int x86_pmu_commit_txn(struct pmu *pmu)
2281 {
2282 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2283 int assign[X86_PMC_IDX_MAX];
2284 int n, ret;
2285
2286 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2287
2288 if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
2289 cpuc->txn_flags = 0;
2290 return 0;
2291 }
2292
2293 n = cpuc->n_events;
2294
2295 if (!x86_pmu_initialized())
2296 return -EAGAIN;
2297
2298 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
2299 if (ret)
2300 return ret;
2301
2302 /*
2303 * Copy the new assignment now that we know it is possible;
2304 * it will be used by hw_perf_enable().
2305 */
2306 memcpy(cpuc->assign, assign, n*sizeof(int));
2307
2308 cpuc->txn_flags = 0;
2309 perf_pmu_enable(pmu);
2310 return 0;
2311 }
2312 /*
2313 * a fake_cpuc is used to validate event groups. Due to
2314 * the extra reg logic, we need to also allocate a fake
2315 * per_core and per_cpu structure. Otherwise, group events
2316 * using extra reg may conflict without the kernel being
2317 * able to catch this when the last event gets added to
2318 * the group.
2319 */
2320 static void free_fake_cpuc(struct cpu_hw_events *cpuc)
2321 {
2322 intel_cpuc_finish(cpuc);
2323 kfree(cpuc);
2324 }
2325
2326 static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
2327 {
2328 struct cpu_hw_events *cpuc;
2329 int cpu;
2330
2331 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
2332 if (!cpuc)
2333 return ERR_PTR(-ENOMEM);
2334 cpuc->is_fake = 1;
2335
2336 if (is_hybrid()) {
2337 struct x86_hybrid_pmu *h_pmu;
2338
2339 h_pmu = hybrid_pmu(event_pmu);
2340 if (cpumask_empty(&h_pmu->supported_cpus))
2341 goto error;
2342 cpu = cpumask_first(&h_pmu->supported_cpus);
2343 } else
2344 cpu = raw_smp_processor_id();
2345 cpuc->pmu = event_pmu;
2346
2347 if (intel_cpuc_prepare(cpuc, cpu))
2348 goto error;
2349
2350 return cpuc;
2351 error:
2352 free_fake_cpuc(cpuc);
2353 return ERR_PTR(-ENOMEM);
2354 }
2355
2356 /*
2357 * validate that we can schedule this event
2358 */
2359 static int validate_event(struct perf_event *event)
2360 {
2361 struct cpu_hw_events *fake_cpuc;
2362 struct event_constraint *c;
2363 int ret = 0;
2364
2365 fake_cpuc = allocate_fake_cpuc(event->pmu);
2366 if (IS_ERR(fake_cpuc))
2367 return PTR_ERR(fake_cpuc);
2368
2369 c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
2370
2371 if (!c || !c->weight)
2372 ret = -EINVAL;
2373
2374 if (x86_pmu.put_event_constraints)
2375 x86_pmu.put_event_constraints(fake_cpuc, event);
2376
2377 free_fake_cpuc(fake_cpuc);
2378
2379 return ret;
2380 }
2381
2382 /*
2383 * validate a single event group
2384 *
2385 * validation includes:
2386 * - check events are compatible with each other
2387 * - events do not compete for the same counter
2388 * - number of events <= number of counters
2389 *
2390 * validation ensures the group can be loaded onto the
2391 * PMU if it was the only group available.
2392 */
2393 static int validate_group(struct perf_event *event)
2394 {
2395 struct perf_event *leader = event->group_leader;
2396 struct cpu_hw_events *fake_cpuc;
2397 int ret = -EINVAL, n;
2398
2399 /*
2400 * Reject events from different hybrid PMUs.
2401 */
2402 if (is_hybrid()) {
2403 struct perf_event *sibling;
2404 struct pmu *pmu = NULL;
2405
2406 if (is_x86_event(leader))
2407 pmu = leader->pmu;
2408
2409 for_each_sibling_event(sibling, leader) {
2410 if (!is_x86_event(sibling))
2411 continue;
2412 if (!pmu)
2413 pmu = sibling->pmu;
2414 else if (pmu != sibling->pmu)
2415 return ret;
2416 }
2417 }
2418
2419 fake_cpuc = allocate_fake_cpuc(event->pmu);
2420 if (IS_ERR(fake_cpuc))
2421 return PTR_ERR(fake_cpuc);
2422 /*
2423 * The event is not yet connected with its
2424 * siblings, therefore we must first collect the
2425 * existing siblings, then add the new event
2426 * before we can simulate the scheduling.
2427 */
2428 n = collect_events(fake_cpuc, leader, true);
2429 if (n < 0)
2430 goto out;
2431
2432 fake_cpuc->n_events = n;
2433 n = collect_events(fake_cpuc, event, false);
2434 if (n < 0)
2435 goto out;
2436
2437 fake_cpuc->n_events = 0;
2438 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
2439
2440 out:
2441 free_fake_cpuc(fake_cpuc);
2442 return ret;
2443 }
2444
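/*
 * pmu::event_init() entry point: reject events that belong to a different
 * PMU type (or, on hybrid systems, to a CPU the PMU does not cover), then
 * validate the event, or its whole group, against a fake cpu_hw_events by
 * simulating the counter scheduling.
 */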
2445 static int x86_pmu_event_init(struct perf_event *event)
2446 {
2447 struct x86_hybrid_pmu *pmu = NULL;
2448 int err;
2449
2450 if ((event->attr.type != event->pmu->type) &&
2451 (event->attr.type != PERF_TYPE_HARDWARE) &&
2452 (event->attr.type != PERF_TYPE_HW_CACHE))
2453 return -ENOENT;
2454
2455 if (is_hybrid() && (event->cpu != -1)) {
2456 pmu = hybrid_pmu(event->pmu);
2457 if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
2458 return -ENOENT;
2459 }
2460
2461 err = __x86_pmu_event_init(event);
2462 if (!err) {
2463 if (event->group_leader != event)
2464 err = validate_group(event);
2465 else
2466 err = validate_event(event);
2467 }
2468 if (err) {
2469 if (event->destroy)
2470 event->destroy(event);
2471 event->destroy = NULL;
2472 }
2473
2474 if (READ_ONCE(x86_pmu.attr_rdpmc) &&
2475 !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
2476 event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
2477
2478 return err;
2479 }
2480
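/*
 * Zero counters that were used earlier but are no longer assigned to an
 * event, so that a task granted RDPMC access cannot observe stale values
 * left behind by other contexts.
 */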
2481 void perf_clear_dirty_counters(void)
2482 {
2483 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2484 int i;
2485
2486 /* Don't need to clear the assigned counter. */
2487 for (i = 0; i < cpuc->n_events; i++)
2488 __clear_bit(cpuc->assign[i], cpuc->dirty);
2489
2490 if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
2491 return;
2492
2493 for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
2494 if (i >= INTEL_PMC_IDX_FIXED) {
2495 /* Metrics and fake events don't have corresponding HW counters. */
2496 if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
2497 continue;
2498
2499 wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
2500 } else {
2501 wrmsrl(x86_pmu_event_addr(i), 0);
2502 }
2503 }
2504
2505 bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
2506 }
2507
2508 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
2509 {
2510 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
2511 return;
2512
2513 /*
2514 * This function relies on not being called concurrently in two
2515 * tasks in the same mm. Otherwise one task could observe
2516 * perf_rdpmc_allowed > 1 and return all the way back to
2517 * userspace with CR4.PCE clear while another task is still
2518 * doing on_each_cpu_mask() to propagate CR4.PCE.
2519 *
2520 * For now, this can't happen because all callers hold mmap_lock
2521 * for write. If this changes, we'll need a different solution.
2522 */
2523 mmap_assert_write_locked(mm);
2524
2525 if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
2526 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2527 }
2528
2529 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
2530 {
2531 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
2532 return;
2533
2534 if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
2535 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2536 }
2537
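/*
 * Counter index reported in the mmap()ed perf control page for userspace
 * RDPMC: 0 means "not available", otherwise userspace is expected to use
 * (index - 1) as the RDPMC counter number, as documented for
 * perf_event_mmap_page in include/uapi/linux/perf_event.h.
 */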
2538 static int x86_pmu_event_idx(struct perf_event *event)
2539 {
2540 struct hw_perf_event *hwc = &event->hw;
2541
2542 if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
2543 return 0;
2544
2545 if (is_metric_idx(hwc->idx))
2546 return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
2547 else
2548 return hwc->event_base_rdpmc + 1;
2549 }
2550
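/*
 * The "rdpmc" sysfs attribute (e.g. /sys/bus/event_source/devices/cpu/rdpmc,
 * or cpu_core/cpu_atom on hybrid systems) controls userspace RDPMC:
 *
 *   0 - never allowed
 *   1 - allowed for tasks with an active, mmap()ed perf event (default)
 *   2 - always allowed, even without a perf event
 *
 * Example (root only):
 *
 *	# echo 2 > /sys/bus/event_source/devices/cpu/rdpmc
 */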
2551 static ssize_t get_attr_rdpmc(struct device *cdev,
2552 struct device_attribute *attr,
2553 char *buf)
2554 {
2555 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
2556 }
2557
2558 static ssize_t set_attr_rdpmc(struct device *cdev,
2559 struct device_attribute *attr,
2560 const char *buf, size_t count)
2561 {
2562 static DEFINE_MUTEX(rdpmc_mutex);
2563 unsigned long val;
2564 ssize_t ret;
2565
2566 ret = kstrtoul(buf, 0, &val);
2567 if (ret)
2568 return ret;
2569
2570 if (val > 2)
2571 return -EINVAL;
2572
2573 if (x86_pmu.attr_rdpmc_broken)
2574 return -ENOTSUPP;
2575
2576 guard(mutex)(&rdpmc_mutex);
2577
2578 if (val != x86_pmu.attr_rdpmc) {
2579 /*
2580 * Changing into or out of never available or always available,
2581 * aka perf-event-bypassing mode. This path is extremely slow,
2582 * but only root can trigger it, so it's okay.
2583 */
2584 if (val == 0)
2585 static_branch_inc(&rdpmc_never_available_key);
2586 else if (x86_pmu.attr_rdpmc == 0)
2587 static_branch_dec(&rdpmc_never_available_key);
2588
2589 if (val == 2)
2590 static_branch_inc(&rdpmc_always_available_key);
2591 else if (x86_pmu.attr_rdpmc == 2)
2592 static_branch_dec(&rdpmc_always_available_key);
2593
2594 on_each_cpu(cr4_update_pce, NULL, 1);
2595 x86_pmu.attr_rdpmc = val;
2596 }
2597
2598 return count;
2599 }
2600
2601 static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
2602
2603 static struct attribute *x86_pmu_attrs[] = {
2604 &dev_attr_rdpmc.attr,
2605 NULL,
2606 };
2607
2608 static struct attribute_group x86_pmu_attr_group __ro_after_init = {
2609 .attrs = x86_pmu_attrs,
2610 };
2611
2612 static ssize_t max_precise_show(struct device *cdev,
2613 struct device_attribute *attr,
2614 char *buf)
2615 {
2616 return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
2617 }
2618
2619 static DEVICE_ATTR_RO(max_precise);
2620
2621 static struct attribute *x86_pmu_caps_attrs[] = {
2622 &dev_attr_max_precise.attr,
2623 NULL
2624 };
2625
2626 static struct attribute_group x86_pmu_caps_group __ro_after_init = {
2627 .name = "caps",
2628 .attrs = x86_pmu_caps_attrs,
2629 };
2630
2631 static const struct attribute_group *x86_pmu_attr_groups[] = {
2632 &x86_pmu_attr_group,
2633 &x86_pmu_format_group,
2634 &x86_pmu_events_group,
2635 &x86_pmu_caps_group,
2636 NULL,
2637 };
2638
2639 static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
2640 struct task_struct *task, bool sched_in)
2641 {
2642 static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in);
2643 }
2644
2645 void perf_check_microcode(void)
2646 {
2647 if (x86_pmu.check_microcode)
2648 x86_pmu.check_microcode();
2649 }
2650
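/*
 * Validate a user supplied sample period: let the vendor driver veto it,
 * and make sure that ->limit_period() could never end up enlarging the
 * period behind the core code's back.
 */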
2651 static int x86_pmu_check_period(struct perf_event *event, u64 value)
2652 {
2653 if (x86_pmu.check_period && x86_pmu.check_period(event, value))
2654 return -EINVAL;
2655
2656 if (value && x86_pmu.limit_period) {
2657 s64 left = value;
2658 x86_pmu.limit_period(event, &left);
2659 if (left > value)
2660 return -EINVAL;
2661 }
2662
2663 return 0;
2664 }
2665
2666 static int x86_pmu_aux_output_match(struct perf_event *event)
2667 {
2668 if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
2669 return 0;
2670
2671 if (x86_pmu.aux_output_match)
2672 return x86_pmu.aux_output_match(event);
2673
2674 return 0;
2675 }
2676
2677 static bool x86_pmu_filter(struct pmu *pmu, int cpu)
2678 {
2679 bool ret = false;
2680
2681 static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
2682
2683 return ret;
2684 }
2685
2686 static struct pmu pmu = {
2687 .pmu_enable = x86_pmu_enable,
2688 .pmu_disable = x86_pmu_disable,
2689
2690 .attr_groups = x86_pmu_attr_groups,
2691
2692 .event_init = x86_pmu_event_init,
2693
2694 .event_mapped = x86_pmu_event_mapped,
2695 .event_unmapped = x86_pmu_event_unmapped,
2696
2697 .add = x86_pmu_add,
2698 .del = x86_pmu_del,
2699 .start = x86_pmu_start,
2700 .stop = x86_pmu_stop,
2701 .read = x86_pmu_read,
2702
2703 .start_txn = x86_pmu_start_txn,
2704 .cancel_txn = x86_pmu_cancel_txn,
2705 .commit_txn = x86_pmu_commit_txn,
2706
2707 .event_idx = x86_pmu_event_idx,
2708 .sched_task = x86_pmu_sched_task,
2709 .check_period = x86_pmu_check_period,
2710
2711 .aux_output_match = x86_pmu_aux_output_match,
2712
2713 .filter = x86_pmu_filter,
2714 };
2715
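/*
 * The fields filled in below let userspace convert a raw TSC value into the
 * perf time domain. A rough, overflow-ignoring sketch of the consumer side,
 * following the perf_event_mmap_page documentation in
 * include/uapi/linux/perf_event.h:
 *
 *	if (pc->cap_user_time_zero)
 *		ns = pc->time_zero + ((rdtsc() * pc->time_mult) >> pc->time_shift);
 *
 * Real consumers split the multiplication to avoid 64-bit overflow and
 * re-read under the pc->lock seqcount.
 */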
2716 void arch_perf_update_userpage(struct perf_event *event,
2717 struct perf_event_mmap_page *userpg, u64 now)
2718 {
2719 struct cyc2ns_data data;
2720 u64 offset;
2721
2722 userpg->cap_user_time = 0;
2723 userpg->cap_user_time_zero = 0;
2724 userpg->cap_user_rdpmc =
2725 !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
2726 userpg->pmc_width = x86_pmu.cntval_bits;
2727
2728 if (!using_native_sched_clock() || !sched_clock_stable())
2729 return;
2730
2731 cyc2ns_read_begin(&data);
2732
2733 offset = data.cyc2ns_offset + __sched_clock_offset;
2734
2735 /*
2736 * Internal timekeeping for enabled/running/stopped times
2737 * is always in the local_clock domain.
2738 */
2739 userpg->cap_user_time = 1;
2740 userpg->time_mult = data.cyc2ns_mul;
2741 userpg->time_shift = data.cyc2ns_shift;
2742 userpg->time_offset = offset - now;
2743
2744 /*
2745 * cap_user_time_zero doesn't make sense when we're using a different
2746 * time base for the records.
2747 */
2748 if (!event->attr.use_clockid) {
2749 userpg->cap_user_time_zero = 1;
2750 userpg->time_zero = offset;
2751 }
2752
2753 cyc2ns_read_end();
2754 }
2755
2756 /*
2757 * Determine whether the regs were taken from an irq/exception handler rather
2758 * than from perf_arch_fetch_caller_regs().
2759 */
2760 static bool perf_hw_regs(struct pt_regs *regs)
2761 {
2762 return regs->flags & X86_EFLAGS_FIXED;
2763 }
2764
2765 void
2766 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2767 {
2768 struct unwind_state state;
2769 unsigned long addr;
2770
2771 if (perf_guest_state()) {
2772 /* TODO: We don't support guest OS callchains yet */
2773 return;
2774 }
2775
2776 if (perf_callchain_store(entry, regs->ip))
2777 return;
2778
2779 if (perf_hw_regs(regs))
2780 unwind_start(&state, current, regs, NULL);
2781 else
2782 unwind_start(&state, current, NULL, (void *)regs->sp);
2783
2784 for (; !unwind_done(&state); unwind_next_frame(&state)) {
2785 addr = unwind_get_return_address(&state);
2786 if (!addr || perf_callchain_store(entry, addr))
2787 return;
2788 }
2789 }
2790
2791 static inline int
2792 valid_user_frame(const void __user *fp, unsigned long size)
2793 {
2794 return __access_ok(fp, size);
2795 }
2796
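/*
 * Resolve a segment selector to its base address, consulting the LDT or
 * the GDT depending on the selector's table-indicator bit. Returns 0 for
 * anything that cannot (or need not) be resolved.
 */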
2797 static unsigned long get_segment_base(unsigned int segment)
2798 {
2799 struct desc_struct *desc;
2800 unsigned int idx = segment >> 3;
2801
2802 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2803 #ifdef CONFIG_MODIFY_LDT_SYSCALL
2804 struct ldt_struct *ldt;
2805
2806 /* IRQs are off, so this synchronizes with smp_store_release */
2807 ldt = READ_ONCE(current->active_mm->context.ldt);
2808 if (!ldt || idx >= ldt->nr_entries)
2809 return 0;
2810
2811 desc = &ldt->entries[idx];
2812 #else
2813 return 0;
2814 #endif
2815 } else {
2816 if (idx >= GDT_ENTRIES)
2817 return 0;
2818
2819 desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2820 }
2821
2822 return get_desc_base(desc);
2823 }
2824
2825 #ifdef CONFIG_UPROBES
2826 /*
2827 * Heuristic check whether a uprobe is installed at a function entry.
2828 *
2829 * Under the assumption that user code is compiled with frame pointers,
2830 * `push %rbp/%ebp` is a good indicator that we are indeed at the entry.
2831 *
2832 * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
2833 * If we get this wrong, the captured stack trace might have one extra bogus
2834 * entry, but the rest of the stack trace will still be meaningful.
2835 */
2836 static bool is_uprobe_at_func_entry(struct pt_regs *regs)
2837 {
2838 struct arch_uprobe *auprobe;
2839
2840 if (!current->utask)
2841 return false;
2842
2843 auprobe = current->utask->auprobe;
2844 if (!auprobe)
2845 return false;
2846
2847 /* push %rbp/%ebp */
2848 if (auprobe->insn[0] == 0x55)
2849 return true;
2850
2851 /* endbr64 (64-bit only) */
2852 if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
2853 return true;
2854
2855 return false;
2856 }
2857
2858 #else
2859 static bool is_uprobe_at_func_entry(struct pt_regs *regs)
2860 {
2861 return false;
2862 }
2863 #endif /* CONFIG_UPROBES */
2864
2865 #ifdef CONFIG_IA32_EMULATION
2866
2867 #include <linux/compat.h>
2868
2869 static inline int
2870 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2871 {
2872 /* 32-bit process in 64-bit kernel. */
2873 unsigned long ss_base, cs_base;
2874 struct stack_frame_ia32 frame;
2875 const struct stack_frame_ia32 __user *fp;
2876 u32 ret_addr;
2877
2878 if (user_64bit_mode(regs))
2879 return 0;
2880
2881 cs_base = get_segment_base(regs->cs);
2882 ss_base = get_segment_base(regs->ss);
2883
2884 fp = compat_ptr(ss_base + regs->bp);
2885 pagefault_disable();
2886
2887 /* see perf_callchain_user() below for why we do this */
2888 if (is_uprobe_at_func_entry(regs) &&
2889 !get_user(ret_addr, (const u32 __user *)regs->sp))
2890 perf_callchain_store(entry, ret_addr);
2891
2892 while (entry->nr < entry->max_stack) {
2893 if (!valid_user_frame(fp, sizeof(frame)))
2894 break;
2895
2896 if (__get_user(frame.next_frame, &fp->next_frame))
2897 break;
2898 if (__get_user(frame.return_address, &fp->return_address))
2899 break;
2900
2901 perf_callchain_store(entry, cs_base + frame.return_address);
2902 fp = compat_ptr(ss_base + frame.next_frame);
2903 }
2904 pagefault_enable();
2905 return 1;
2906 }
2907 #else
2908 static inline int
2909 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2910 {
2911 return 0;
2912 }
2913 #endif
2914
2915 void
2916 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2917 {
2918 struct stack_frame frame;
2919 const struct stack_frame __user *fp;
2920 unsigned long ret_addr;
2921
2922 if (perf_guest_state()) {
2923 /* TODO: We don't support guest OS callchains yet */
2924 return;
2925 }
2926
2927 /*
2928 * We don't know what to do with VM86 stacks... ignore them for now.
2929 */
2930 if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2931 return;
2932
2933 fp = (void __user *)regs->bp;
2934
2935 perf_callchain_store(entry, regs->ip);
2936
2937 if (!nmi_uaccess_okay())
2938 return;
2939
2940 if (perf_callchain_user32(regs, entry))
2941 return;
2942
2943 pagefault_disable();
2944
2945 /*
2946 * If we are called from the uprobe handler, and we are indeed at the very
2947 * entry of a user function (which normally starts with a `push %rbp`
2948 * instruction, assuming the application is compiled with frame pointers),
2949 * we should read the return address from *regs->sp before proceeding
2950 * to follow frame pointers; otherwise we'll skip the immediate caller
2951 * as %rbp is not yet set up.
2952 */
2953 if (is_uprobe_at_func_entry(regs) &&
2954 !get_user(ret_addr, (const unsigned long __user *)regs->sp))
2955 perf_callchain_store(entry, ret_addr);
2956
2957 while (entry->nr < entry->max_stack) {
2958 if (!valid_user_frame(fp, sizeof(frame)))
2959 break;
2960
2961 if (__get_user(frame.next_frame, &fp->next_frame))
2962 break;
2963 if (__get_user(frame.return_address, &fp->return_address))
2964 break;
2965
2966 perf_callchain_store(entry, frame.return_address);
2967 fp = (void __user *)frame.next_frame;
2968 }
2969 pagefault_enable();
2970 }
2971
2972 /*
2973 * Deal with code segment offsets for the various execution modes:
2974 *
2975 * VM86 - the good olde 16 bit days, where the linear address is
2976 * 20 bits and we use regs->ip + 0x10 * regs->cs.
2977 *
2978 * IA32 - Where we need to look at GDT/LDT segment descriptor tables
2979 * to figure out what the 32bit base address is.
2980 *
2981 * X32 - has TIF_X32 set, but is running in x86_64
2982 *
2983 * X86_64 - CS,DS,SS,ES are all zero based.
2984 */
2985 static unsigned long code_segment_base(struct pt_regs *regs)
2986 {
2987 /*
2988 * For IA32 we look at the GDT/LDT segment base to convert the
2989 * effective IP to a linear address.
2990 */
2991
2992 #ifdef CONFIG_X86_32
2993 /*
2994 * If we are in VM86 mode, add the segment offset to convert to a
2995 * linear address.
2996 */
2997 if (regs->flags & X86_VM_MASK)
2998 return 0x10 * regs->cs;
2999
3000 if (user_mode(regs) && regs->cs != __USER_CS)
3001 return get_segment_base(regs->cs);
3002 #else
3003 if (user_mode(regs) && !user_64bit_mode(regs) &&
3004 regs->cs != __USER32_CS)
3005 return get_segment_base(regs->cs);
3006 #endif
3007 return 0;
3008 }
3009
3010 unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
3011 {
3012 return regs->ip + code_segment_base(regs);
3013 }
3014
3015 static unsigned long common_misc_flags(struct pt_regs *regs)
3016 {
3017 if (regs->flags & PERF_EFLAGS_EXACT)
3018 return PERF_RECORD_MISC_EXACT_IP;
3019
3020 return 0;
3021 }
3022
3023 static unsigned long guest_misc_flags(struct pt_regs *regs)
3024 {
3025 unsigned long guest_state = perf_guest_state();
3026
3027 if (!(guest_state & PERF_GUEST_ACTIVE))
3028 return 0;
3029
3030 if (guest_state & PERF_GUEST_USER)
3031 return PERF_RECORD_MISC_GUEST_USER;
3032 else
3033 return PERF_RECORD_MISC_GUEST_KERNEL;
3034
3035 }
3036
3037 static unsigned long host_misc_flags(struct pt_regs *regs)
3038 {
3039 if (user_mode(regs))
3040 return PERF_RECORD_MISC_USER;
3041 else
3042 return PERF_RECORD_MISC_KERNEL;
3043 }
3044
3045 unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
3046 {
3047 unsigned long flags = common_misc_flags(regs);
3048
3049 flags |= guest_misc_flags(regs);
3050
3051 return flags;
3052 }
3053
3054 unsigned long perf_arch_misc_flags(struct pt_regs *regs)
3055 {
3056 unsigned long flags = common_misc_flags(regs);
3057
3058 flags |= host_misc_flags(regs);
3059
3060 return flags;
3061 }
3062
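/*
 * Export a snapshot of the core PMU capabilities to other kernel code,
 * e.g. KVM's virtual PMU.
 */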
3063 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
3064 {
3065 /* This API doesn't currently support enumerating hybrid PMUs. */
3066 if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
3067 !x86_pmu_initialized()) {
3068 memset(cap, 0, sizeof(*cap));
3069 return;
3070 }
3071
3072 /*
3073 * Note, hybrid CPU models get tracked as having hybrid PMUs even when
3074 * all E-cores are disabled via BIOS. When E-cores are disabled, the
3075 * base PMU holds the correct number of counters for P-cores.
3076 */
3077 cap->version = x86_pmu.version;
3078 cap->num_counters_gp = x86_pmu_num_counters(NULL);
3079 cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL);
3080 cap->bit_width_gp = x86_pmu.cntval_bits;
3081 cap->bit_width_fixed = x86_pmu.cntval_bits;
3082 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
3083 cap->events_mask_len = x86_pmu.events_mask_len;
3084 cap->pebs_ept = x86_pmu.pebs_ept;
3085 }
3086 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
3087
3088 u64 perf_get_hw_event_config(int hw_event)
3089 {
3090 int max = x86_pmu.max_events;
3091
3092 if (hw_event < max)
3093 return x86_pmu.event_map(array_index_nospec(hw_event, max));
3094
3095 return 0;
3096 }
3097 EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
3098