1 /*
2 * Performance events x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14
15 #include <linux/perf_event.h>
16 #include <linux/capability.h>
17 #include <linux/notifier.h>
18 #include <linux/hardirq.h>
19 #include <linux/kprobes.h>
20 #include <linux/export.h>
21 #include <linux/init.h>
22 #include <linux/kdebug.h>
23 #include <linux/kvm_types.h>
24 #include <linux/sched/mm.h>
25 #include <linux/sched/clock.h>
26 #include <linux/uaccess.h>
27 #include <linux/slab.h>
28 #include <linux/cpu.h>
29 #include <linux/bitops.h>
30 #include <linux/device.h>
31 #include <linux/nospec.h>
32 #include <linux/static_call.h>
33
34 #include <asm/apic.h>
35 #include <asm/stacktrace.h>
36 #include <asm/msr.h>
37 #include <asm/nmi.h>
38 #include <asm/smp.h>
39 #include <asm/alternative.h>
40 #include <asm/mmu_context.h>
41 #include <asm/tlbflush.h>
42 #include <asm/timer.h>
43 #include <asm/desc.h>
44 #include <asm/ldt.h>
45 #include <asm/unwind.h>
46 #include <asm/uprobes.h>
47 #include <asm/ibt.h>
48
49 #include "perf_event.h"
50
51 struct x86_pmu x86_pmu __read_mostly;
52 static struct pmu pmu;
53
54 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
55 .enabled = 1,
56 .pmu = &pmu,
57 };
58
59 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
60 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
61 DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
62
63 /*
64 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
65 * from just a typename, as opposed to an actual function.
66 */
67 DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
68 DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
69 DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
70 DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
71 DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
72
73 DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
74
75 DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
76 DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
77 DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
78
79 DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
80 DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
81 DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);
82
83 DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
84 DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
85 DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
86
87 DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
88 DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
89 DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
90
91 DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
92
93 DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
94 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
95
96 DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
97
98 DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);
99
100 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable);
101 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable);
102 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all);
103 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all);
104
105 /*
106 * This one is magic, it will get called even when PMU init fails (because
107 * there is no PMU), in which case it should simply return NULL.
108 */
109 DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
110
111 u64 __read_mostly hw_cache_event_ids
112 [PERF_COUNT_HW_CACHE_MAX]
113 [PERF_COUNT_HW_CACHE_OP_MAX]
114 [PERF_COUNT_HW_CACHE_RESULT_MAX];
115 u64 __read_mostly hw_cache_extra_regs
116 [PERF_COUNT_HW_CACHE_MAX]
117 [PERF_COUNT_HW_CACHE_OP_MAX]
118 [PERF_COUNT_HW_CACHE_RESULT_MAX];
119
120 /*
121 * Propagate event elapsed time into the generic event.
122 * Can only be executed on the CPU where the event is active.
123 * Returns the new raw count.
124 */
125 u64 x86_perf_event_update(struct perf_event *event)
126 {
127 struct hw_perf_event *hwc = &event->hw;
128 int shift = 64 - x86_pmu.cntval_bits;
129 u64 prev_raw_count, new_raw_count;
130 u64 delta;
131
132 if (unlikely(!hwc->event_base))
133 return 0;
134
135 /*
136 * Careful: an NMI might modify the previous event value.
137 *
138 * Our tactic to handle this is to first atomically read and
139 * exchange a new raw count - then add that new-prev delta
140 * count to the generic event atomically:
141 */
142 prev_raw_count = local64_read(&hwc->prev_count);
143 do {
144 new_raw_count = rdpmc(hwc->event_base_rdpmc);
145 } while (!local64_try_cmpxchg(&hwc->prev_count,
146 &prev_raw_count, new_raw_count));
147
148 /*
149 * Now we have the new raw value and have updated the prev
150 * timestamp already. We can now calculate the elapsed delta
151 * (event-)time and add that to the generic event.
152 *
153 * Careful, not all hw sign-extends above the physical width
154 * of the count.
155 */
156 delta = (new_raw_count << shift) - (prev_raw_count << shift);
157 delta >>= shift;
158
159 local64_add(delta, &event->count);
160 local64_sub(delta, &hwc->period_left);
161
162 return new_raw_count;
163 }
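
/*
 * Illustrative only, assuming x86_pmu.cntval_bits == 48 (shift == 16):
 * if prev_raw_count == 0xFFFFFFFFFFFE and the counter wraps around to
 * new_raw_count == 0x5, then
 *
 *	delta = ((0x5 << 16) - (0xFFFFFFFFFFFE << 16)) >> 16 = 7
 *
 * i.e. the unsigned 64-bit arithmetic hides the wrap and yields the
 * seven events that actually occurred.
 */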
164
165 /*
166 * Find and validate any extra registers to set up.
167 */
168 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
169 {
170 struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
171 struct hw_perf_event_extra *reg;
172 struct extra_reg *er;
173
174 reg = &event->hw.extra_reg;
175
176 if (!extra_regs)
177 return 0;
178
179 for (er = extra_regs; er->msr; er++) {
180 if (er->event != (config & er->config_mask))
181 continue;
182 if (event->attr.config1 & ~er->valid_mask)
183 return -EINVAL;
184 /* Check if the extra MSRs can be safely accessed */
185 if (!er->extra_msr_access)
186 return -ENXIO;
187
188 reg->idx = er->idx;
189 reg->config = event->attr.config1;
190 reg->reg = er->msr;
191 break;
192 }
193 return 0;
194 }
195
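/*
 * active_events gates the NMI handler: it counts every event that may
 * raise a PMI. pmc_refcount counts users of the reserved counter/MSR
 * hardware; reservation and release are serialized by
 * pmc_reserve_mutex.
 */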
196 static atomic_t active_events;
197 static atomic_t pmc_refcount;
198 static DEFINE_MUTEX(pmc_reserve_mutex);
199
200 #ifdef CONFIG_X86_LOCAL_APIC
201
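/*
 * On hybrid systems the individual PMUs may expose different counter
 * masks; build the union of all of them so that reservation covers
 * every counter that could possibly be used.
 */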
202 static inline u64 get_possible_counter_mask(void)
203 {
204 u64 cntr_mask = x86_pmu.cntr_mask64;
205 int i;
206
207 if (!is_hybrid())
208 return cntr_mask;
209
210 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
211 cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;
212
213 return cntr_mask;
214 }
215
216 static bool reserve_pmc_hardware(void)
217 {
218 u64 cntr_mask = get_possible_counter_mask();
219 int i, end;
220
221 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
222 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
223 goto perfctr_fail;
224 }
225
226 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
227 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
228 goto eventsel_fail;
229 }
230
231 return true;
232
233 eventsel_fail:
234 end = i;
235 for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
236 release_evntsel_nmi(x86_pmu_config_addr(i));
237 i = X86_PMC_IDX_MAX;
238
239 perfctr_fail:
240 end = i;
241 for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
242 release_perfctr_nmi(x86_pmu_event_addr(i));
243
244 return false;
245 }
246
247 static void release_pmc_hardware(void)
248 {
249 u64 cntr_mask = get_possible_counter_mask();
250 int i;
251
252 for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
253 release_perfctr_nmi(x86_pmu_event_addr(i));
254 release_evntsel_nmi(x86_pmu_config_addr(i));
255 }
256 }
257
258 #else
259
260 static bool reserve_pmc_hardware(void) { return true; }
261 static void release_pmc_hardware(void) {}
262
263 #endif
264
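/*
 * Sanity-check the PMU: complain if the BIOS left counters enabled
 * behind our back, and detect hypervisors/emulators whose counter MSRs
 * silently ignore writes.
 */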
265 bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
266 unsigned long *fixed_cntr_mask)
267 {
268 u64 val, val_fail = -1, val_new = ~0;
269 int i, reg, reg_fail = -1, ret = 0;
270 int bios_fail = 0;
271 int reg_safe = -1;
272
273 /*
274 * Check to see if the BIOS enabled any of the counters; if so,
275 * complain and bail.
276 */
277 for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
278 reg = x86_pmu_config_addr(i);
279 ret = rdmsrq_safe(reg, &val);
280 if (ret)
281 goto msr_fail;
282 if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
283 bios_fail = 1;
284 val_fail = val;
285 reg_fail = reg;
286 } else {
287 reg_safe = i;
288 }
289 }
290
291 if (*(u64 *)fixed_cntr_mask) {
292 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
293 ret = rdmsrq_safe(reg, &val);
294 if (ret)
295 goto msr_fail;
296 for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
297 if (fixed_counter_disabled(i, pmu))
298 continue;
299 if (val & (0x03ULL << i*4)) {
300 bios_fail = 1;
301 val_fail = val;
302 reg_fail = reg;
303 }
304 }
305 }
306
307 /*
308 * If all the counters are enabled, the below test will always
309 * fail. The tools will also become useless in this scenario.
310 * Just fail and disable the hardware counters.
311 */
312
313 if (reg_safe == -1) {
314 reg = reg_safe;
315 goto msr_fail;
316 }
317
318 /*
319 * Read the current value, change it and read it back to see if it
320 * matches, this is needed to detect certain hardware emulators
321 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
322 */
323 reg = x86_pmu_event_addr(reg_safe);
324 if (rdmsrq_safe(reg, &val))
325 goto msr_fail;
326 val ^= 0xffffUL;
327 ret = wrmsrq_safe(reg, val);
328 ret |= rdmsrq_safe(reg, &val_new);
329 if (ret || val != val_new)
330 goto msr_fail;
331
332 /*
333 * We still allow the PMU driver to operate:
334 */
335 if (bios_fail) {
336 pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
337 pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
338 reg_fail, val_fail);
339 }
340
341 return true;
342
343 msr_fail:
344 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
345 pr_cont("PMU not available due to virtualization, using software events only.\n");
346 } else {
347 pr_cont("Broken PMU hardware detected, using software events only.\n");
348 pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
349 reg, val_new);
350 }
351
352 return false;
353 }
354
355 static void hw_perf_event_destroy(struct perf_event *event)
356 {
357 x86_release_hardware();
358 atomic_dec(&active_events);
359 }
360
361 void hw_perf_lbr_event_destroy(struct perf_event *event)
362 {
363 hw_perf_event_destroy(event);
364
365 /* undo the lbr/bts event accounting */
366 x86_del_exclusive(x86_lbr_exclusive_lbr);
367 }
368
369 static inline int x86_pmu_initialized(void)
370 {
371 return x86_pmu.handle_irq != NULL;
372 }
373
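/*
 * Decode a PERF_TYPE_HW_CACHE config: cache type, op and result are
 * packed into the low three bytes and looked up in the
 * hw_cache_event_ids table of the event's PMU.
 */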
374 static inline int
375 set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
376 {
377 struct perf_event_attr *attr = &event->attr;
378 unsigned int cache_type, cache_op, cache_result;
379 u64 config, val;
380
381 config = attr->config;
382
383 cache_type = (config >> 0) & 0xff;
384 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
385 return -EINVAL;
386 cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
387
388 cache_op = (config >> 8) & 0xff;
389 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
390 return -EINVAL;
391 cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
392
393 cache_result = (config >> 16) & 0xff;
394 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
395 return -EINVAL;
396 cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
397
398 val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
399 if (val == 0)
400 return -ENOENT;
401
402 if (val == -1)
403 return -EINVAL;
404
405 hwc->config |= val;
406 attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
407 return x86_pmu_extra_regs(val, event);
408 }
409
410 int x86_reserve_hardware(void)
411 {
412 int err = 0;
413
414 if (!atomic_inc_not_zero(&pmc_refcount)) {
415 mutex_lock(&pmc_reserve_mutex);
416 if (atomic_read(&pmc_refcount) == 0) {
417 if (!reserve_pmc_hardware()) {
418 err = -EBUSY;
419 } else {
420 reserve_ds_buffers();
421 reserve_lbr_buffers();
422 }
423 }
424 if (!err)
425 atomic_inc(&pmc_refcount);
426 mutex_unlock(&pmc_reserve_mutex);
427 }
428
429 return err;
430 }
431
432 void x86_release_hardware(void)
433 {
434 if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
435 release_pmc_hardware();
436 release_ds_buffers();
437 release_lbr_buffers();
438 mutex_unlock(&pmc_reserve_mutex);
439 }
440 }
441
442 /*
443 * Check if we can create event of a certain type (that no conflicting events
444 * are present).
445 */
446 int x86_add_exclusive(unsigned int what)
447 {
448 int i;
449
450 /*
451 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
452 * LBR and BTS are still mutually exclusive.
453 */
454 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
455 goto out;
456
457 if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
458 mutex_lock(&pmc_reserve_mutex);
459 for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
460 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
461 goto fail_unlock;
462 }
463 atomic_inc(&x86_pmu.lbr_exclusive[what]);
464 mutex_unlock(&pmc_reserve_mutex);
465 }
466
467 out:
468 atomic_inc(&active_events);
469 return 0;
470
471 fail_unlock:
472 mutex_unlock(&pmc_reserve_mutex);
473 return -EBUSY;
474 }
475
476 void x86_del_exclusive(unsigned int what)
477 {
478 atomic_dec(&active_events);
479
480 /*
481 * See the comment in x86_add_exclusive().
482 */
483 if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
484 return;
485
486 atomic_dec(&x86_pmu.lbr_exclusive[what]);
487 }
488
489 int x86_setup_perfctr(struct perf_event *event)
490 {
491 struct perf_event_attr *attr = &event->attr;
492 struct hw_perf_event *hwc = &event->hw;
493 u64 config;
494
495 if (!is_sampling_event(event)) {
496 hwc->sample_period = x86_pmu.max_period;
497 hwc->last_period = hwc->sample_period;
498 local64_set(&hwc->period_left, hwc->sample_period);
499 }
500
501 if (attr->type == event->pmu->type)
502 return x86_pmu_extra_regs(event->attr.config, event);
503
504 if (attr->type == PERF_TYPE_HW_CACHE)
505 return set_ext_hw_attr(hwc, event);
506
507 if (attr->config >= x86_pmu.max_events)
508 return -EINVAL;
509
510 attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
511
512 /*
513 * The generic map:
514 */
515 config = x86_pmu.event_map(attr->config);
516
517 if (config == 0)
518 return -ENOENT;
519
520 if (config == -1LL)
521 return -EINVAL;
522
523 hwc->config |= config;
524
525 return 0;
526 }
527
528 /*
529 * Check that branch_sample_type is compatible with the
530 * settings needed for precise_ip > 1, which implies
531 * using the LBR to capture ALL taken branches at the
532 * priv levels of the measurement.
533 */
534 static inline int precise_br_compat(struct perf_event *event)
535 {
536 u64 m = event->attr.branch_sample_type;
537 u64 b = 0;
538
539 /* must capture all branches */
540 if (!(m & PERF_SAMPLE_BRANCH_ANY))
541 return 0;
542
543 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
544
545 if (!event->attr.exclude_user)
546 b |= PERF_SAMPLE_BRANCH_USER;
547
548 if (!event->attr.exclude_kernel)
549 b |= PERF_SAMPLE_BRANCH_KERNEL;
550
551 /*
552 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
553 */
554
555 return m == b;
556 }
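
/*
 * For example (illustrative): precise_ip > 1 with branch_sample_type =
 * ANY | USER | KERNEL and neither priv level excluded is compatible,
 * while requesting ANY alone when both priv levels are counted is not,
 * because the requested priv filter no longer matches the event's.
 */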
557
558 int x86_pmu_max_precise(struct pmu *pmu)
559 {
560 int precise = 0;
561
562 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
563 /* arch PEBS */
564 if (x86_pmu.arch_pebs) {
565 precise = 2;
566 if (hybrid(pmu, arch_pebs_cap).pdists)
567 precise++;
568
569 return precise;
570 }
571
572 /* legacy PEBS - support for constant skid */
573 precise++;
574 /* Support for IP fixup */
575 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
576 precise++;
577
578 if (x86_pmu.pebs_prec_dist)
579 precise++;
580 }
581
582 return precise;
583 }
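
/*
 * Informal mapping (not taken from this file): the value returned above
 * bounds perf's :p/:pp/:ppp event modifiers, i.e. 1 = constant-skid
 * PEBS, 2 = PEBS with IP fixup (LBR or pebs_format >= 2),
 * 3 = precise distribution (pebs_prec_dist / arch-PEBS pdists).
 */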
584
585 int x86_pmu_hw_config(struct perf_event *event)
586 {
587 if (event->attr.precise_ip) {
588 int precise = x86_pmu_max_precise(event->pmu);
589
590 if (event->attr.precise_ip > precise)
591 return -EOPNOTSUPP;
592
593 /* There's no sense in having PEBS for non-sampling events: */
594 if (!is_sampling_event(event))
595 return -EINVAL;
596 }
597 /*
598 * check that PEBS LBR correction does not conflict with
599 * whatever the user is asking with attr->branch_sample_type
600 */
601 if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
602 u64 *br_type = &event->attr.branch_sample_type;
603
604 if (has_branch_stack(event)) {
605 if (!precise_br_compat(event))
606 return -EOPNOTSUPP;
607
608 /* branch_sample_type is compatible */
609
610 } else {
611 /*
612 * user did not specify branch_sample_type
613 *
614 * For PEBS fixups, we capture all
615 * the branches at the priv level of the
616 * event.
617 */
618 *br_type = PERF_SAMPLE_BRANCH_ANY;
619
620 if (!event->attr.exclude_user)
621 *br_type |= PERF_SAMPLE_BRANCH_USER;
622
623 if (!event->attr.exclude_kernel)
624 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
625 }
626 }
627
628 if (branch_sample_call_stack(event))
629 event->attach_state |= PERF_ATTACH_TASK_DATA;
630
631 /*
632 * Generate PMC IRQs:
633 * (keep 'enabled' bit clear for now)
634 */
635 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
636
637 /*
638 * Count user and OS events unless requested not to
639 */
640 if (!event->attr.exclude_user)
641 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
642 if (!event->attr.exclude_kernel)
643 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
644
645 if (event->attr.type == event->pmu->type)
646 event->hw.config |= x86_pmu_get_event_config(event);
647
648 if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
649 s64 left = event->attr.sample_period;
650 x86_pmu.limit_period(event, &left);
651 if (left > event->attr.sample_period)
652 return -EINVAL;
653 }
654
655 /* sample_regs_user never supports XMM registers */
656 if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
657 return -EINVAL;
658 /*
659 * Besides the general purpose registers, XMM registers may
660 * be collected in PEBS on some platforms, e.g., Ice Lake.
661 */
662 if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
663 if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
664 return -EINVAL;
665
666 if (!event->attr.precise_ip)
667 return -EINVAL;
668 }
669
670 return x86_setup_perfctr(event);
671 }
672
673 /*
674 * Setup the hardware configuration for a given attr_type
675 */
676 static int __x86_pmu_event_init(struct perf_event *event)
677 {
678 int err;
679
680 if (!x86_pmu_initialized())
681 return -ENODEV;
682
683 err = x86_reserve_hardware();
684 if (err)
685 return err;
686
687 atomic_inc(&active_events);
688 event->destroy = hw_perf_event_destroy;
689
690 event->hw.idx = -1;
691 event->hw.last_cpu = -1;
692 event->hw.last_tag = ~0ULL;
693 event->hw.dyn_constraint = ~0ULL;
694
695 /* mark unused */
696 event->hw.extra_reg.idx = EXTRA_REG_NONE;
697 event->hw.branch_reg.idx = EXTRA_REG_NONE;
698
699 return x86_pmu.hw_config(event);
700 }
701
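/*
 * Clear the enable bit in every active counter's EVENTSEL; for paired
 * (large increment) events also clear the Merge counter's control MSR.
 */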
702 void x86_pmu_disable_all(void)
703 {
704 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
705 int idx;
706
707 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
708 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
709 u64 val;
710
711 if (!test_bit(idx, cpuc->active_mask))
712 continue;
713 rdmsrq(x86_pmu_config_addr(idx), val);
714 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
715 continue;
716 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
717 wrmsrq(x86_pmu_config_addr(idx), val);
718 if (is_counter_pair(hwc))
719 wrmsrq(x86_pmu_config_addr(idx + 1), 0);
720 }
721 }
722
723 struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
724 {
725 return static_call(x86_pmu_guest_get_msrs)(nr, data);
726 }
727 EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs);
728
729 /*
730 * There may be a PMI landing after enabled=0. The PMI hitting could be before or
731 * after disable_all.
732 *
733 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
734 * It will not be re-enabled in the NMI handler again, because enabled=0. After
735 * handling the NMI, disable_all will be called, which will not change the
736 * state either. If PMI hits after disable_all, the PMU is already disabled
737 * before entering NMI handler. The NMI handler will not change the state
738 * either.
739 *
740 * So either situation is harmless.
741 */
742 static void x86_pmu_disable(struct pmu *pmu)
743 {
744 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
745
746 if (!x86_pmu_initialized())
747 return;
748
749 if (!cpuc->enabled)
750 return;
751
752 cpuc->n_added = 0;
753 cpuc->enabled = 0;
754 barrier();
755
756 static_call(x86_pmu_disable_all)();
757 }
758
759 void x86_pmu_enable_all(int added)
760 {
761 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
762 int idx;
763
764 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
765 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
766
767 if (!test_bit(idx, cpuc->active_mask))
768 continue;
769
770 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
771 }
772 }
773
774 int is_x86_event(struct perf_event *event)
775 {
776 /*
777 * On non-hybrid platforms, the type of the X86 PMU is
778 * always PERF_TYPE_RAW.
779 * On hybrid platforms, PERF_PMU_CAP_EXTENDED_HW_TYPE
780 * is a capability unique to the X86 PMU.
781 * Use them to detect an X86 event.
782 */
783 if (event->pmu->type == PERF_TYPE_RAW ||
784 event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)
785 return true;
786
787 return false;
788 }
789
790 struct pmu *x86_get_pmu(unsigned int cpu)
791 {
792 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
793
794 /*
795 * All CPUs of this hybrid type have gone offline;
796 * x86_get_pmu() should not be invoked in that case.
797 */
798 if (WARN_ON_ONCE(!cpuc->pmu))
799 return &pmu;
800
801 return cpuc->pmu;
802 }
803 /*
804 * Event scheduler state:
805 *
806 * Assign events by iterating over all events and counters, taking
807 * events with the smallest weights first. Keep the current iterator
808 * state in struct sched_state.
809 */
810 struct sched_state {
811 int weight;
812 int event; /* event index */
813 int counter; /* counter index */
814 int unassigned; /* number of events to be assigned left */
815 int nr_gp; /* number of GP counters used */
816 u64 used;
817 };
818
819 /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
820 #define SCHED_STATES_MAX 2
821
822 struct perf_sched {
823 int max_weight;
824 int max_events;
825 int max_gp;
826 int saved_states;
827 struct event_constraint **constraints;
828 struct sched_state state;
829 struct sched_state saved[SCHED_STATES_MAX];
830 };
831
832 /*
833 * Initialize iterator that runs through all events and counters.
834 */
835 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
836 int num, int wmin, int wmax, int gpmax)
837 {
838 int idx;
839
840 memset(sched, 0, sizeof(*sched));
841 sched->max_events = num;
842 sched->max_weight = wmax;
843 sched->max_gp = gpmax;
844 sched->constraints = constraints;
845
846 for (idx = 0; idx < num; idx++) {
847 if (constraints[idx]->weight == wmin)
848 break;
849 }
850
851 sched->state.event = idx; /* start with min weight */
852 sched->state.weight = wmin;
853 sched->state.unassigned = num;
854 }
855
856 static void perf_sched_save_state(struct perf_sched *sched)
857 {
858 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
859 return;
860
861 sched->saved[sched->saved_states] = sched->state;
862 sched->saved_states++;
863 }
864
865 static bool perf_sched_restore_state(struct perf_sched *sched)
866 {
867 if (!sched->saved_states)
868 return false;
869
870 sched->saved_states--;
871 sched->state = sched->saved[sched->saved_states];
872
873 /* this assignment didn't work out */
874 /* XXX broken vs EVENT_PAIR */
875 sched->state.used &= ~BIT_ULL(sched->state.counter);
876
877 /* try the next one */
878 sched->state.counter++;
879
880 return true;
881 }
882
883 /*
884 * Select a counter for the current event to schedule. Return true on
885 * success.
886 */
887 static bool __perf_sched_find_counter(struct perf_sched *sched)
888 {
889 struct event_constraint *c;
890 int idx;
891
892 if (!sched->state.unassigned)
893 return false;
894
895 if (sched->state.event >= sched->max_events)
896 return false;
897
898 c = sched->constraints[sched->state.event];
899 /* Prefer fixed purpose counters */
900 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
901 idx = INTEL_PMC_IDX_FIXED;
902 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
903 u64 mask = BIT_ULL(idx);
904
905 if (sched->state.used & mask)
906 continue;
907
908 sched->state.used |= mask;
909 goto done;
910 }
911 }
912
913 /* Grab the first unused counter starting with idx */
914 idx = sched->state.counter;
915 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
916 u64 mask = BIT_ULL(idx);
917
918 if (c->flags & PERF_X86_EVENT_PAIR)
919 mask |= mask << 1;
920
921 if (sched->state.used & mask)
922 continue;
923
924 if (sched->state.nr_gp++ >= sched->max_gp)
925 return false;
926
927 sched->state.used |= mask;
928 goto done;
929 }
930
931 return false;
932
933 done:
934 sched->state.counter = idx;
935
936 if (c->overlap)
937 perf_sched_save_state(sched);
938
939 return true;
940 }
941
942 static bool perf_sched_find_counter(struct perf_sched *sched)
943 {
944 while (!__perf_sched_find_counter(sched)) {
945 if (!perf_sched_restore_state(sched))
946 return false;
947 }
948
949 return true;
950 }
951
952 /*
953 * Go through all unassigned events and find the next one to schedule.
954 * Take events with the least weight first. Return true on success.
955 */
956 static bool perf_sched_next_event(struct perf_sched *sched)
957 {
958 struct event_constraint *c;
959
960 if (!sched->state.unassigned || !--sched->state.unassigned)
961 return false;
962
963 do {
964 /* next event */
965 sched->state.event++;
966 if (sched->state.event >= sched->max_events) {
967 /* next weight */
968 sched->state.event = 0;
969 sched->state.weight++;
970 if (sched->state.weight > sched->max_weight)
971 return false;
972 }
973 c = sched->constraints[sched->state.event];
974 } while (c->weight != sched->state.weight);
975
976 sched->state.counter = 0; /* start with first counter */
977
978 return true;
979 }
980
981 /*
982 * Assign a counter for each event.
983 */
984 int perf_assign_events(struct event_constraint **constraints, int n,
985 int wmin, int wmax, int gpmax, int *assign)
986 {
987 struct perf_sched sched;
988
989 perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
990
991 do {
992 if (!perf_sched_find_counter(&sched))
993 break; /* failed */
994 if (assign)
995 assign[sched.state.event] = sched.state.counter;
996 } while (perf_sched_next_event(&sched));
997
998 return sched.state.unassigned;
999 }
1000 EXPORT_SYMBOL_GPL(perf_assign_events);
1001
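/*
 * Schedule the events in cpuc->event_list onto counters: gather the
 * constraints, try the fastpath that keeps previously assigned
 * counters, and otherwise fall back to the perf_assign_events()
 * constraint solver above.
 */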
1002 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1003 {
1004 struct event_constraint *c;
1005 struct perf_event *e;
1006 int n0, i, wmin, wmax, unsched = 0;
1007 struct hw_perf_event *hwc;
1008 u64 used_mask = 0;
1009
1010 /*
1011 * Compute the number of events already present; see x86_pmu_add(),
1012 * validate_group() and x86_pmu_commit_txn(). For the former two
1013 * cpuc->n_events hasn't been updated yet, while for the latter
1014 * cpuc->n_txn contains the number of events added in the current
1015 * transaction.
1016 */
1017 n0 = cpuc->n_events;
1018 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1019 n0 -= cpuc->n_txn;
1020
1021 static_call_cond(x86_pmu_start_scheduling)(cpuc);
1022
1023 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
1024 c = cpuc->event_constraint[i];
1025
1026 /*
1027 * Previously scheduled events should have a cached constraint,
1028 * while new events should not have one.
1029 */
1030 WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
1031
1032 /*
1033 * Request constraints for new events; or for those events that
1034 * have a dynamic constraint -- for those the constraint can
1035 * change due to external factors (sibling state, allow_tfa).
1036 */
1037 if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
1038 c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
1039 cpuc->event_constraint[i] = c;
1040 }
1041
1042 wmin = min(wmin, c->weight);
1043 wmax = max(wmax, c->weight);
1044 }
1045
1046 /*
1047 * fastpath, try to reuse previous register
1048 */
1049 for (i = 0; i < n; i++) {
1050 u64 mask;
1051
1052 hwc = &cpuc->event_list[i]->hw;
1053 c = cpuc->event_constraint[i];
1054
1055 /* never assigned */
1056 if (hwc->idx == -1)
1057 break;
1058
1059 /* constraint still honored */
1060 if (!test_bit(hwc->idx, c->idxmsk))
1061 break;
1062
1063 mask = BIT_ULL(hwc->idx);
1064 if (is_counter_pair(hwc))
1065 mask |= mask << 1;
1066
1067 /* not already used */
1068 if (used_mask & mask)
1069 break;
1070
1071 used_mask |= mask;
1072
1073 if (assign)
1074 assign[i] = hwc->idx;
1075 }
1076
1077 /* slow path */
1078 if (i != n) {
1079 int gpmax = x86_pmu_max_num_counters(cpuc->pmu);
1080
1081 /*
1082 * Do not allow scheduling of more than half the available
1083 * generic counters.
1084 *
1085 * This helps avoid counter starvation of the sibling thread by
1086 * ensuring at most half the counters cannot be in exclusive
1087 * mode. There are no designated counters for the limits; any
1088 * N/2 counters can be used. This helps with events with
1089 * specific counter constraints.
1090 */
1091 if (is_ht_workaround_enabled() && !cpuc->is_fake &&
1092 READ_ONCE(cpuc->excl_cntrs->exclusive_present))
1093 gpmax /= 2;
1094
1095 /*
1096 * Reduce the amount of available counters to allow fitting
1097 * the extra Merge events needed by large increment events.
1098 */
1099 if (x86_pmu.flags & PMU_FL_PAIR) {
1100 gpmax -= cpuc->n_pair;
1101 WARN_ON(gpmax <= 0);
1102 }
1103
1104 unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
1105 wmax, gpmax, assign);
1106 }
1107
1108 /*
1109 * In case of success (unsched = 0), mark events as committed,
1110 * so we do not put_constraint() in case new events are added
1111 * and fail to be scheduled
1112 *
1113 * We invoke the lower level commit callback to lock the resource
1114 *
1115 * We do not need to do all of this in case we are called to
1116 * validate an event group (assign == NULL)
1117 */
1118 if (!unsched && assign) {
1119 for (i = 0; i < n; i++)
1120 static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
1121 } else {
1122 for (i = n0; i < n; i++) {
1123 e = cpuc->event_list[i];
1124
1125 /*
1126 * release events that failed scheduling
1127 */
1128 static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
1129
1130 cpuc->event_constraint[i] = NULL;
1131 }
1132 }
1133
1134 static_call_cond(x86_pmu_stop_scheduling)(cpuc);
1135
1136 return unsched ? -EINVAL : 0;
1137 }
1138
1139 static int add_nr_metric_event(struct cpu_hw_events *cpuc,
1140 struct perf_event *event)
1141 {
1142 if (is_metric_event(event)) {
1143 if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
1144 return -EINVAL;
1145 cpuc->n_metric++;
1146 cpuc->n_txn_metric++;
1147 }
1148
1149 return 0;
1150 }
1151
1152 static void del_nr_metric_event(struct cpu_hw_events *cpuc,
1153 struct perf_event *event)
1154 {
1155 if (is_metric_event(event))
1156 cpuc->n_metric--;
1157 }
1158
1159 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
1160 int max_count, int n)
1161 {
1162 union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
1163
1164 if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
1165 return -EINVAL;
1166
1167 if (n >= max_count + cpuc->n_metric)
1168 return -EINVAL;
1169
1170 cpuc->event_list[n] = event;
1171 if (is_counter_pair(&event->hw)) {
1172 cpuc->n_pair++;
1173 cpuc->n_txn_pair++;
1174 }
1175
1176 return 0;
1177 }
1178
1179 /*
1180 * dogrp: true if we must collect sibling events (group)
1181 * returns total number of events and error code
1182 */
1183 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1184 {
1185 struct perf_event *event;
1186 int n, max_count;
1187
1188 max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);
1189
1190 /* current number of events already accepted */
1191 n = cpuc->n_events;
1192 if (!cpuc->n_events)
1193 cpuc->pebs_output = 0;
1194
1195 if (!cpuc->is_fake && leader->attr.precise_ip) {
1196 /*
1197 * For PEBS->PT, if !aux_event, the group leader (PT) went
1198 * away, the group was broken down and this singleton event
1199 * can't schedule any more.
1200 */
1201 if (is_pebs_pt(leader) && !leader->aux_event)
1202 return -EINVAL;
1203
1204 /*
1205 * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
1206 */
1207 if (cpuc->pebs_output &&
1208 cpuc->pebs_output != is_pebs_pt(leader) + 1)
1209 return -EINVAL;
1210
1211 cpuc->pebs_output = is_pebs_pt(leader) + 1;
1212 }
1213
1214 if (is_x86_event(leader)) {
1215 if (collect_event(cpuc, leader, max_count, n))
1216 return -EINVAL;
1217 n++;
1218 }
1219
1220 if (!dogrp)
1221 return n;
1222
1223 for_each_sibling_event(event, leader) {
1224 if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
1225 continue;
1226
1227 if (collect_event(cpuc, event, max_count, n))
1228 return -EINVAL;
1229
1230 n++;
1231 }
1232 return n;
1233 }
1234
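/*
 * Commit a counter assignment to the event: record the index and derive
 * the config/event MSR bases (BTS/VLBR, metrics, fixed or general
 * purpose counters).
 */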
1235 static inline void x86_assign_hw_event(struct perf_event *event,
1236 struct cpu_hw_events *cpuc, int i)
1237 {
1238 struct hw_perf_event *hwc = &event->hw;
1239 int idx;
1240
1241 idx = hwc->idx = cpuc->assign[i];
1242 hwc->last_cpu = smp_processor_id();
1243 hwc->last_tag = ++cpuc->tags[i];
1244
1245 static_call_cond(x86_pmu_assign)(event, idx);
1246
1247 switch (hwc->idx) {
1248 case INTEL_PMC_IDX_FIXED_BTS:
1249 case INTEL_PMC_IDX_FIXED_VLBR:
1250 hwc->config_base = 0;
1251 hwc->event_base = 0;
1252 break;
1253
1254 case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
1255 /* All the metric events are mapped onto the fixed counter 3. */
1256 idx = INTEL_PMC_IDX_FIXED_SLOTS;
1257 fallthrough;
1258 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
1259 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1260 hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
1261 hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
1262 INTEL_PMC_FIXED_RDPMC_BASE;
1263 break;
1264
1265 default:
1266 hwc->config_base = x86_pmu_config_addr(hwc->idx);
1267 hwc->event_base = x86_pmu_event_addr(hwc->idx);
1268 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
1269 break;
1270 }
1271 }
1272
1273 /**
1274 * x86_perf_rdpmc_index - Return PMC counter used for event
1275 * @event: the perf_event to which the PMC counter was assigned
1276 *
1277 * The counter assigned to this performance event may change if interrupts
1278 * are enabled. This counter should thus never be used while interrupts are
1279 * enabled. Before this function is used to obtain the assigned counter the
1280 * event should be checked for validity using, for example,
1281 * perf_event_read_local(), within the same interrupt disabled section in
1282 * which this counter is planned to be used.
1283 *
1284 * Return: The index of the performance monitoring counter assigned to
1285 * @event.
1286 */
1287 int x86_perf_rdpmc_index(struct perf_event *event)
1288 {
1289 lockdep_assert_irqs_disabled();
1290
1291 return event->hw.event_base_rdpmc;
1292 }
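
/*
 * A minimal usage sketch (illustrative, not taken from this file): a
 * caller is expected to do something like
 *
 *	local_irq_save(flags);
 *	if (!perf_event_read_local(event, &value, NULL, NULL))
 *		count = rdpmc(x86_perf_rdpmc_index(event));
 *	local_irq_restore(flags);
 *
 * so the event cannot migrate to another counter between the validity
 * check and the RDPMC.
 */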
1293
1294 static inline int match_prev_assignment(struct hw_perf_event *hwc,
1295 struct cpu_hw_events *cpuc,
1296 int i)
1297 {
1298 return hwc->idx == cpuc->assign[i] &&
1299 hwc->last_cpu == smp_processor_id() &&
1300 hwc->last_tag == cpuc->tags[i];
1301 }
1302
1303 static void x86_pmu_start(struct perf_event *event, int flags);
1304
1305 static void x86_pmu_enable(struct pmu *pmu)
1306 {
1307 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1308 struct perf_event *event;
1309 struct hw_perf_event *hwc;
1310 int i, added = cpuc->n_added;
1311
1312 if (!x86_pmu_initialized())
1313 return;
1314
1315 if (cpuc->enabled)
1316 return;
1317
1318 if (cpuc->n_added) {
1319 int n_running = cpuc->n_events - cpuc->n_added;
1320
1321 /*
1322 * The late setup (after counters are scheduled)
1323 * is required for some cases, e.g., PEBS counter
1324 * snapshotting, because an accurate counter index
1325 * is needed.
1326 */
1327 static_call_cond(x86_pmu_late_setup)();
1328
1329 /*
1330 * apply assignment obtained either from
1331 * hw_perf_group_sched_in() or x86_pmu_enable()
1332 *
1333 * step1: save events moving to new counters
1334 */
1335 for (i = 0; i < n_running; i++) {
1336 event = cpuc->event_list[i];
1337 hwc = &event->hw;
1338
1339 /*
1340 * we can avoid reprogramming counter if:
1341 * - assigned same counter as last time
1342 * - running on same CPU as last time
1343 * - no other event has used the counter since
1344 */
1345 if (hwc->idx == -1 ||
1346 match_prev_assignment(hwc, cpuc, i))
1347 continue;
1348
1349 /*
1350 * Ensure we don't accidentally enable a stopped
1351 * counter simply because we rescheduled.
1352 */
1353 if (hwc->state & PERF_HES_STOPPED)
1354 hwc->state |= PERF_HES_ARCH;
1355
1356 x86_pmu_stop(event, PERF_EF_UPDATE);
1357 cpuc->events[hwc->idx] = NULL;
1358 }
1359
1360 /*
1361 * step2: reprogram moved events into new counters
1362 */
1363 for (i = 0; i < cpuc->n_events; i++) {
1364 event = cpuc->event_list[i];
1365 hwc = &event->hw;
1366
1367 if (!match_prev_assignment(hwc, cpuc, i))
1368 x86_assign_hw_event(event, cpuc, i);
1369 else if (i < n_running)
1370 continue;
1371
1372 if (hwc->state & PERF_HES_ARCH)
1373 continue;
1374
1375 /*
1376 * if cpuc->enabled == 0, then no wrmsr as
1377 * per x86_pmu_enable_event()
1378 */
1379 cpuc->events[hwc->idx] = event;
1380 x86_pmu_start(event, PERF_EF_RELOAD);
1381 }
1382 cpuc->n_added = 0;
1383 perf_events_lapic_init();
1384 }
1385
1386 cpuc->enabled = 1;
1387 barrier();
1388
1389 static_call(x86_pmu_enable_all)(added);
1390 }
1391
1392 DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1393
1394 /*
1395 * Set the next IRQ period, based on the hwc->period_left value.
1396 * To be called with the event disabled in hw:
1397 */
1398 int x86_perf_event_set_period(struct perf_event *event)
1399 {
1400 struct hw_perf_event *hwc = &event->hw;
1401 s64 left = local64_read(&hwc->period_left);
1402 s64 period = hwc->sample_period;
1403 int ret = 0, idx = hwc->idx;
1404
1405 if (unlikely(!hwc->event_base))
1406 return 0;
1407
1408 /*
1409 * If we are way outside a reasonable range then just skip forward:
1410 */
1411 if (unlikely(left <= -period)) {
1412 left = period;
1413 local64_set(&hwc->period_left, left);
1414 hwc->last_period = period;
1415 ret = 1;
1416 }
1417
1418 if (unlikely(left <= 0)) {
1419 left += period;
1420 local64_set(&hwc->period_left, left);
1421 hwc->last_period = period;
1422 ret = 1;
1423 }
1424 /*
1425 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1426 */
1427 if (unlikely(left < 2))
1428 left = 2;
1429
1430 if (left > x86_pmu.max_period)
1431 left = x86_pmu.max_period;
1432
1433 static_call_cond(x86_pmu_limit_period)(event, &left);
1434
1435 this_cpu_write(pmc_prev_left[idx], left);
1436
1437 /*
1438 * The hw event starts counting from this event offset,
1439 * mark it to be able to extract future deltas:
1440 */
1441 local64_set(&hwc->prev_count, (u64)-left);
1442
1443 wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1444
1445 /*
1446 * Sign extend the Merge event counter's upper 16 bits since
1447 * we currently declare a 48-bit counter width
1448 */
1449 if (is_counter_pair(hwc))
1450 wrmsrq(x86_pmu_event_addr(idx + 1), 0xffff);
1451
1452 perf_event_update_userpage(event);
1453
1454 return ret;
1455 }
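
/*
 * Illustrative, assuming a 48-bit counter: for left == 0x1000 the MSR
 * is programmed with (u64)-0x1000 & cntval_mask == 0xFFFFFFFFF000, so
 * the counter overflows (and raises a PMI) after exactly 0x1000
 * increments.
 */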
1456
1457 void x86_pmu_enable_event(struct perf_event *event)
1458 {
1459 if (__this_cpu_read(cpu_hw_events.enabled))
1460 __x86_pmu_enable_event(&event->hw,
1461 ARCH_PERFMON_EVENTSEL_ENABLE);
1462 }
1463
1464 /*
1465 * Add a single event to the PMU.
1466 *
1467 * The event is added to the group of enabled events
1468 * but only if it can be scheduled with existing events.
1469 */
1470 static int x86_pmu_add(struct perf_event *event, int flags)
1471 {
1472 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1473 struct hw_perf_event *hwc;
1474 int assign[X86_PMC_IDX_MAX];
1475 int n, n0, ret;
1476
1477 hwc = &event->hw;
1478
1479 n0 = cpuc->n_events;
1480 ret = n = collect_events(cpuc, event, false);
1481 if (ret < 0)
1482 goto out;
1483
1484 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1485 if (!(flags & PERF_EF_START))
1486 hwc->state |= PERF_HES_ARCH;
1487
1488 /*
1489 * If group events scheduling transaction was started,
1490 * skip the schedulability test here, it will be performed
1491 * at commit time (->commit_txn) as a whole.
1492 *
1493 * If commit fails, we'll call ->del() on all events
1494 * for which ->add() was called.
1495 */
1496 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1497 goto done_collect;
1498
1499 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
1500 if (ret)
1501 goto out;
1502 /*
1503 * copy new assignment, now we know it is possible
1504 * will be used by hw_perf_enable()
1505 */
1506 memcpy(cpuc->assign, assign, n*sizeof(int));
1507
1508 done_collect:
1509 /*
1510 * Commit the collect_events() state. See x86_pmu_del() and
1511 * x86_pmu_*_txn().
1512 */
1513 cpuc->n_events = n;
1514 cpuc->n_added += n - n0;
1515 cpuc->n_txn += n - n0;
1516
1517 /*
1518 * This is before x86_pmu_enable() will call x86_pmu_start(),
1519 * so we enable LBRs before an event needs them etc..
1520 */
1521 static_call_cond(x86_pmu_add)(event);
1522
1523 ret = 0;
1524 out:
1525 return ret;
1526 }
1527
1528 static void x86_pmu_start(struct perf_event *event, int flags)
1529 {
1530 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1531 int idx = event->hw.idx;
1532
1533 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1534 return;
1535
1536 if (WARN_ON_ONCE(idx == -1))
1537 return;
1538
1539 if (flags & PERF_EF_RELOAD) {
1540 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1541 static_call(x86_pmu_set_period)(event);
1542 }
1543
1544 event->hw.state = 0;
1545
1546 __set_bit(idx, cpuc->active_mask);
1547 static_call(x86_pmu_enable)(event);
1548 perf_event_update_userpage(event);
1549 }
1550
1551 void perf_event_print_debug(void)
1552 {
1553 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1554 unsigned long *cntr_mask, *fixed_cntr_mask;
1555 struct event_constraint *pebs_constraints;
1556 struct cpu_hw_events *cpuc;
1557 u64 pebs, debugctl;
1558 int cpu, idx;
1559
1560 guard(irqsave)();
1561
1562 cpu = smp_processor_id();
1563 cpuc = &per_cpu(cpu_hw_events, cpu);
1564 cntr_mask = hybrid(cpuc->pmu, cntr_mask);
1565 fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
1566 pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
1567
1568 if (!*(u64 *)cntr_mask)
1569 return;
1570
1571 if (x86_pmu.version >= 2) {
1572 rdmsrq(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1573 rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status);
1574 rdmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1575 rdmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1576
1577 pr_info("\n");
1578 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1579 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1580 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1581 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1582 if (pebs_constraints) {
1583 rdmsrq(MSR_IA32_PEBS_ENABLE, pebs);
1584 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1585 }
1586 if (x86_pmu.lbr_nr) {
1587 rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
1588 pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
1589 }
1590 }
1591 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1592
1593 for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
1594 rdmsrq(x86_pmu_config_addr(idx), pmc_ctrl);
1595 rdmsrq(x86_pmu_event_addr(idx), pmc_count);
1596
1597 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1598
1599 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1600 cpu, idx, pmc_ctrl);
1601 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1602 cpu, idx, pmc_count);
1603 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1604 cpu, idx, prev_left);
1605 }
1606 for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
1607 if (fixed_counter_disabled(idx, cpuc->pmu))
1608 continue;
1609 rdmsrq(x86_pmu_fixed_ctr_addr(idx), pmc_count);
1610
1611 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1612 cpu, idx, pmc_count);
1613 }
1614 }
1615
1616 void x86_pmu_stop(struct perf_event *event, int flags)
1617 {
1618 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1619 struct hw_perf_event *hwc = &event->hw;
1620
1621 if (test_bit(hwc->idx, cpuc->active_mask)) {
1622 static_call(x86_pmu_disable)(event);
1623 __clear_bit(hwc->idx, cpuc->active_mask);
1624 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1625 hwc->state |= PERF_HES_STOPPED;
1626 }
1627
1628 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1629 /*
1630 * Drain the remaining delta count out of an event
1631 * that we are disabling:
1632 */
1633 static_call(x86_pmu_update)(event);
1634 hwc->state |= PERF_HES_UPTODATE;
1635 }
1636 }
1637
1638 static void x86_pmu_del(struct perf_event *event, int flags)
1639 {
1640 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1641 union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
1642 int i;
1643
1644 /*
1645 * If we're called during a txn, we only need to undo x86_pmu.add.
1646 * The events never got scheduled and ->cancel_txn will truncate
1647 * the event_list.
1648 *
1649 * XXX assumes any ->del() called during a TXN will only be on
1650 * an event added during that same TXN.
1651 */
1652 if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1653 goto do_del;
1654
1655 __set_bit(event->hw.idx, cpuc->dirty);
1656
1657 /*
1658 * Not a TXN, therefore cleanup properly.
1659 */
1660 x86_pmu_stop(event, PERF_EF_UPDATE);
1661 cpuc->events[event->hw.idx] = NULL;
1662
1663 for (i = 0; i < cpuc->n_events; i++) {
1664 if (event == cpuc->event_list[i])
1665 break;
1666 }
1667
1668 if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1669 return;
1670
1671 /* If we have a newly added event; make sure to decrease n_added. */
1672 if (i >= cpuc->n_events - cpuc->n_added)
1673 --cpuc->n_added;
1674
1675 static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
1676
1677 /* Delete the array entry. */
1678 while (++i < cpuc->n_events) {
1679 cpuc->event_list[i-1] = cpuc->event_list[i];
1680 cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1681 cpuc->assign[i-1] = cpuc->assign[i];
1682 }
1683 cpuc->event_constraint[i-1] = NULL;
1684 --cpuc->n_events;
1685 if (intel_cap.perf_metrics)
1686 del_nr_metric_event(cpuc, event);
1687
1688 perf_event_update_userpage(event);
1689
1690 do_del:
1691
1692 /*
1693 * This is after x86_pmu_stop(); so we disable LBRs after any
1694 * event can need them etc..
1695 */
1696 static_call_cond(x86_pmu_del)(event);
1697 }
1698
1699 int x86_pmu_handle_irq(struct pt_regs *regs)
1700 {
1701 struct perf_sample_data data;
1702 struct cpu_hw_events *cpuc;
1703 struct perf_event *event;
1704 int idx, handled = 0;
1705 u64 last_period;
1706 u64 val;
1707
1708 cpuc = this_cpu_ptr(&cpu_hw_events);
1709
1710 /*
1711 * Some chipsets need to unmask the LVTPC in a particular spot
1712 * inside the NMI handler. As a result, the unmasking was pushed
1713 * into all the NMI handlers.
1714 *
1715 * This generic handler doesn't seem to have any issues with where
1716 * the unmasking occurs, so it was left at the top.
1717 */
1718 apic_write(APIC_LVTPC, APIC_DM_NMI);
1719
1720 for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
1721 if (!test_bit(idx, cpuc->active_mask))
1722 continue;
1723
1724 event = cpuc->events[idx];
1725 last_period = event->hw.last_period;
1726
1727 val = static_call(x86_pmu_update)(event);
1728 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1729 continue;
1730
1731 /*
1732 * event overflow
1733 */
1734 handled++;
1735
1736 if (!static_call(x86_pmu_set_period)(event))
1737 continue;
1738
1739 perf_sample_data_init(&data, 0, last_period);
1740
1741 perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
1742
1743 perf_event_overflow(event, &data, regs);
1744 }
1745
1746 if (handled)
1747 inc_irq_stat(apic_perf_irqs);
1748
1749 return handled;
1750 }
1751
1752 void perf_events_lapic_init(void)
1753 {
1754 if (!x86_pmu.apic || !x86_pmu_initialized())
1755 return;
1756
1757 /*
1758 * Always use NMI for PMU
1759 */
1760 apic_write(APIC_LVTPC, APIC_DM_NMI);
1761 }
1762
1763 static int
1764 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1765 {
1766 u64 start_clock;
1767 u64 finish_clock;
1768 int ret;
1769
1770 /*
1771 * All PMUs/events that share this PMI handler should make sure to
1772 * increment active_events for their events.
1773 */
1774 if (!atomic_read(&active_events))
1775 return NMI_DONE;
1776
1777 start_clock = sched_clock();
1778 ret = static_call(x86_pmu_handle_irq)(regs);
1779 finish_clock = sched_clock();
1780
1781 perf_sample_event_took(finish_clock - start_clock);
1782
1783 return ret;
1784 }
1785 NOKPROBE_SYMBOL(perf_event_nmi_handler);
1786
1787 struct event_constraint emptyconstraint;
1788 struct event_constraint unconstrained;
1789
1790 static int x86_pmu_prepare_cpu(unsigned int cpu)
1791 {
1792 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1793 int i;
1794
1795 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1796 cpuc->kfree_on_online[i] = NULL;
1797 if (x86_pmu.cpu_prepare)
1798 return x86_pmu.cpu_prepare(cpu);
1799 return 0;
1800 }
1801
1802 static int x86_pmu_dead_cpu(unsigned int cpu)
1803 {
1804 if (x86_pmu.cpu_dead)
1805 x86_pmu.cpu_dead(cpu);
1806 return 0;
1807 }
1808
1809 static int x86_pmu_online_cpu(unsigned int cpu)
1810 {
1811 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1812 int i;
1813
1814 for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1815 kfree(cpuc->kfree_on_online[i]);
1816 cpuc->kfree_on_online[i] = NULL;
1817 }
1818 return 0;
1819 }
1820
1821 static int x86_pmu_starting_cpu(unsigned int cpu)
1822 {
1823 if (x86_pmu.cpu_starting)
1824 x86_pmu.cpu_starting(cpu);
1825 return 0;
1826 }
1827
1828 static int x86_pmu_dying_cpu(unsigned int cpu)
1829 {
1830 if (x86_pmu.cpu_dying)
1831 x86_pmu.cpu_dying(cpu);
1832 return 0;
1833 }
1834
1835 static void __init pmu_check_apic(void)
1836 {
1837 if (boot_cpu_has(X86_FEATURE_APIC))
1838 return;
1839
1840 x86_pmu.apic = 0;
1841 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1842 pr_info("no hardware sampling interrupt available.\n");
1843
1844 /*
1845 * If we have a PMU initialized but no APIC
1846 * interrupts, we cannot sample hardware
1847 * events (user-space has to fall back and
1848 * sample via a hrtimer based software event):
1849 */
1850 pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1851
1852 }
1853
1854 static struct attribute_group x86_pmu_format_group __ro_after_init = {
1855 .name = "format",
1856 .attrs = NULL,
1857 };
1858
1859 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
1860 {
1861 struct perf_pmu_events_attr *pmu_attr =
1862 container_of(attr, struct perf_pmu_events_attr, attr);
1863 u64 config = 0;
1864
1865 if (pmu_attr->id < x86_pmu.max_events)
1866 config = x86_pmu.event_map(pmu_attr->id);
1867
1868 /* string trumps id */
1869 if (pmu_attr->event_str)
1870 return sprintf(page, "%s\n", pmu_attr->event_str);
1871
1872 return x86_pmu.events_sysfs_show(page, config);
1873 }
1874 EXPORT_SYMBOL_GPL(events_sysfs_show);
1875
1876 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
1877 char *page)
1878 {
1879 struct perf_pmu_events_ht_attr *pmu_attr =
1880 container_of(attr, struct perf_pmu_events_ht_attr, attr);
1881
1882 /*
1883 * Report conditional events depending on Hyper-Threading.
1884 *
1885 * This is overly conservative as usually the HT special
1886 * handling is not needed if the other CPU thread is idle.
1887 *
1888 * Note this does not (and cannot) handle the case when thread
1889 * siblings are invisible, for example with virtualization
1890 * if they are owned by some other guest. The user tool
1891 * has to re-read when a thread sibling gets onlined later.
1892 */
1893 return sprintf(page, "%s",
1894 topology_max_smt_threads() > 1 ?
1895 pmu_attr->event_str_ht :
1896 pmu_attr->event_str_noht);
1897 }
1898
1899 ssize_t events_hybrid_sysfs_show(struct device *dev,
1900 struct device_attribute *attr,
1901 char *page)
1902 {
1903 struct perf_pmu_events_hybrid_attr *pmu_attr =
1904 container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
1905 struct x86_hybrid_pmu *pmu;
1906 const char *str, *next_str;
1907 int i;
1908
1909 if (hweight64(pmu_attr->pmu_type) == 1)
1910 return sprintf(page, "%s", pmu_attr->event_str);
1911
1912 /*
1913 * Hybrid PMUs may support the same event name, but with different
1914 * event encodings, e.g., the mem-loads event on an Atom PMU has
1915 * a different encoding than on a Core PMU.
1916 *
1917 * The event_str includes all event encodings. Each event encoding
1918 * is divided by ";". The order of the event encodings must follow
1919 * the order of the hybrid PMU index.
1920 */
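/*
 * Purely illustrative example of such an event_str for two hybrid
 * PMUs (hybrid index 0 first, index 1 second):
 *
 *   "event=0xd0,umask=0x81;event=0xd0,umask=0x05"
 */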
1921 pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
1922
1923 str = pmu_attr->event_str;
1924 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
1925 if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
1926 continue;
1927 if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
1928 next_str = strchr(str, ';');
1929 if (next_str)
1930 return snprintf(page, next_str - str + 1, "%s", str);
1931 else
1932 return sprintf(page, "%s", str);
1933 }
1934 str = strchr(str, ';');
1935 str++;
1936 }
1937
1938 return 0;
1939 }
1940 EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
1941
1942 EVENT_ATTR(cpu-cycles, CPU_CYCLES );
1943 EVENT_ATTR(instructions, INSTRUCTIONS );
1944 EVENT_ATTR(cache-references, CACHE_REFERENCES );
1945 EVENT_ATTR(cache-misses, CACHE_MISSES );
1946 EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
1947 EVENT_ATTR(branch-misses, BRANCH_MISSES );
1948 EVENT_ATTR(bus-cycles, BUS_CYCLES );
1949 EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
1950 EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
1951 EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
1952
1953 static struct attribute *empty_attrs;
1954
1955 static struct attribute *events_attr[] = {
1956 EVENT_PTR(CPU_CYCLES),
1957 EVENT_PTR(INSTRUCTIONS),
1958 EVENT_PTR(CACHE_REFERENCES),
1959 EVENT_PTR(CACHE_MISSES),
1960 EVENT_PTR(BRANCH_INSTRUCTIONS),
1961 EVENT_PTR(BRANCH_MISSES),
1962 EVENT_PTR(BUS_CYCLES),
1963 EVENT_PTR(STALLED_CYCLES_FRONTEND),
1964 EVENT_PTR(STALLED_CYCLES_BACKEND),
1965 EVENT_PTR(REF_CPU_CYCLES),
1966 NULL,
1967 };
1968
1969 /*
1970 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1971 * out of events_attr attributes.
1972 */
1973 static umode_t
1974 is_visible(struct kobject *kobj, struct attribute *attr, int idx)
1975 {
1976 struct perf_pmu_events_attr *pmu_attr;
1977
1978 if (idx >= x86_pmu.max_events)
1979 return 0;
1980
1981 pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
1982 /* str trumps id */
1983 return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
1984 }
1985
1986 static struct attribute_group x86_pmu_events_group __ro_after_init = {
1987 .name = "events",
1988 .attrs = events_attr,
1989 .is_visible = is_visible,
1990 };
1991
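/*
 * Render an event/config pair in perf's event-string syntax. As an
 * illustrative example, @event 0x2e with umask 0x41 and cmask 1 in
 * @config comes out as:
 *
 *   event=0x2e,umask=0x41,cmask=0x01
 */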
1992 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1993 {
1994 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1995 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1996 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1997 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1998 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
1999 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
2000 ssize_t ret;
2001
2002 /*
2003 * We have a whole page to spend and only a little data
2004 * to write, so we can safely use sprintf().
2005 */
2006 ret = sprintf(page, "event=0x%02llx", event);
2007
2008 if (umask)
2009 ret += sprintf(page + ret, ",umask=0x%02llx", umask);
2010
2011 if (edge)
2012 ret += sprintf(page + ret, ",edge");
2013
2014 if (pc)
2015 ret += sprintf(page + ret, ",pc");
2016
2017 if (any)
2018 ret += sprintf(page + ret, ",any");
2019
2020 if (inv)
2021 ret += sprintf(page + ret, ",inv");
2022
2023 if (cmask)
2024 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
2025
2026 ret += sprintf(page + ret, "\n");
2027
2028 return ret;
2029 }
2030
2031 static struct attribute_group x86_pmu_attr_group;
2032 static struct attribute_group x86_pmu_caps_group;
2033
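/*
 * Point the static calls declared earlier in this file at the vendor
 * implementation that was filled into x86_pmu. A NULL method effectively
 * leaves the corresponding static call as a no-op.
 */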
2034 static void x86_pmu_static_call_update(void)
2035 {
2036 static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
2037 static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
2038 static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
2039 static_call_update(x86_pmu_enable, x86_pmu.enable);
2040 static_call_update(x86_pmu_disable, x86_pmu.disable);
2041
2042 static_call_update(x86_pmu_assign, x86_pmu.assign);
2043
2044 static_call_update(x86_pmu_add, x86_pmu.add);
2045 static_call_update(x86_pmu_del, x86_pmu.del);
2046 static_call_update(x86_pmu_read, x86_pmu.read);
2047
2048 static_call_update(x86_pmu_set_period, x86_pmu.set_period);
2049 static_call_update(x86_pmu_update, x86_pmu.update);
2050 static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
2051
2052 static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
2053 static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
2054 static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
2055
2056 static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
2057 static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
2058 static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
2059
2060 static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
2061
2062 static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
2063 static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
2064
2065 static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
2066 static_call_update(x86_pmu_filter, x86_pmu.filter);
2067
2068 static_call_update(x86_pmu_late_setup, x86_pmu.late_setup);
2069
2070 static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable);
2071 static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable);
2072 static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all);
2073 static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all);
2074 }
2075
2076 static void _x86_pmu_read(struct perf_event *event)
2077 {
2078 static_call(x86_pmu_update)(event);
2079 }
2080
2081 void x86_pmu_show_pmu_cap(struct pmu *pmu)
2082 {
2083 pr_info("... version: %d\n", x86_pmu.version);
2084 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
2085 pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu));
2086 pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64));
2087 pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
2088 pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64));
2089 pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask);
2090 pr_info("... max period: %016llx\n", x86_pmu.max_period);
2091 pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl));
2092 }
2093
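/*
 * Boot-time entry point: select the vendor PMU implementation, apply
 * model quirks, wire up the static calls, register the CPU hotplug
 * callbacks and finally register the "cpu" PMU (or one PMU per type
 * on hybrid systems) with the perf core.
 */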
2094 static int __init init_hw_perf_events(void)
2095 {
2096 struct x86_pmu_quirk *quirk;
2097 int err;
2098
2099 pr_info("Performance Events: ");
2100
2101 switch (boot_cpu_data.x86_vendor) {
2102 case X86_VENDOR_INTEL:
2103 err = intel_pmu_init();
2104 break;
2105 case X86_VENDOR_AMD:
2106 err = amd_pmu_init();
2107 break;
2108 case X86_VENDOR_HYGON:
2109 err = amd_pmu_init();
2110 x86_pmu.name = "HYGON";
2111 break;
2112 case X86_VENDOR_ZHAOXIN:
2113 case X86_VENDOR_CENTAUR:
2114 err = zhaoxin_pmu_init();
2115 break;
2116 default:
2117 err = -ENOTSUPP;
2118 }
2119 if (err != 0) {
2120 pr_cont("no PMU driver, software events only.\n");
2121 err = 0;
2122 goto out_bad_pmu;
2123 }
2124
2125 pmu_check_apic();
2126
2127 /* sanity check that the hardware exists or is emulated */
2128 if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask))
2129 goto out_bad_pmu;
2130
2131 pr_cont("%s PMU driver.\n", x86_pmu.name);
2132
2133 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
2134
2135 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
2136 quirk->func();
2137
2138 if (!x86_pmu.intel_ctrl)
2139 x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
2140
2141 if (!x86_pmu.config_mask)
2142 x86_pmu.config_mask = X86_RAW_EVENT_MASK;
2143
2144 perf_events_lapic_init();
2145 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
2146
2147 unconstrained = (struct event_constraint)
2148 __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64,
2149 0, x86_pmu_num_counters(NULL), 0, 0);
2150
2151 x86_pmu_format_group.attrs = x86_pmu.format_attrs;
2152
2153 if (!x86_pmu.events_sysfs_show)
2154 x86_pmu_events_group.attrs = &empty_attrs;
2155
2156 pmu.attr_update = x86_pmu.attr_update;
2157
2158 if (!is_hybrid())
2159 x86_pmu_show_pmu_cap(NULL);
2160
2161 if (!x86_pmu.read)
2162 x86_pmu.read = _x86_pmu_read;
2163
2164 if (!x86_pmu.guest_get_msrs)
2165 x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
2166
2167 if (!x86_pmu.set_period)
2168 x86_pmu.set_period = x86_perf_event_set_period;
2169
2170 if (!x86_pmu.update)
2171 x86_pmu.update = x86_perf_event_update;
2172
2173 x86_pmu_static_call_update();
2174
2175 /*
2176 * Install callbacks. Core will call them for each online
2177 * cpu.
2178 */
2179 err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
2180 x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
2181 if (err)
2182 return err;
2183
2184 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
2185 "perf/x86:starting", x86_pmu_starting_cpu,
2186 x86_pmu_dying_cpu);
2187 if (err)
2188 goto out;
2189
2190 err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
2191 x86_pmu_online_cpu, NULL);
2192 if (err)
2193 goto out1;
2194
2195 if (!is_hybrid()) {
2196 err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
2197 if (err)
2198 goto out2;
2199 } else {
2200 struct x86_hybrid_pmu *hybrid_pmu;
2201 int i, j;
2202
2203 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
2204 hybrid_pmu = &x86_pmu.hybrid_pmu[i];
2205
2206 hybrid_pmu->pmu = pmu;
2207 hybrid_pmu->pmu.type = -1;
2208 hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
2209 hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
2210
2211 err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
2212 (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
2213 if (err)
2214 break;
2215 }
2216
2217 if (i < x86_pmu.num_hybrid_pmus) {
2218 for (j = 0; j < i; j++)
2219 perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
2220 pr_warn("Failed to register hybrid PMUs\n");
2221 kfree(x86_pmu.hybrid_pmu);
2222 x86_pmu.hybrid_pmu = NULL;
2223 x86_pmu.num_hybrid_pmus = 0;
2224 goto out2;
2225 }
2226 }
2227
2228 return 0;
2229
2230 out2:
2231 cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
2232 out1:
2233 cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
2234 out:
2235 cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
2236 out_bad_pmu:
2237 memset(&x86_pmu, 0, sizeof(x86_pmu));
2238 return err;
2239 }
2240 early_initcall(init_hw_perf_events);
2241
2242 static void x86_pmu_read(struct perf_event *event)
2243 {
2244 static_call(x86_pmu_read)(event);
2245 }
2246
2247 /*
2248 * Start group events scheduling transaction
2249 * Set the flag to make pmu::enable() not perform the
2250 * schedulability test, it will be performed at commit time
2251 *
2252 * We only support PERF_PMU_TXN_ADD transactions. Save the
2253 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
2254 * transactions.
2255 */
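/*
 * Illustrative sketch of how the core drives these hooks when adding a
 * group (simplified; see group_sched_in() in kernel/events/core.c):
 *
 *   pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *   pmu->add(leader, PERF_EF_START);
 *   pmu->add(sibling, PERF_EF_START);
 *   if (pmu->commit_txn(pmu))
 *           pmu->cancel_txn(pmu);
 */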
2256 static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
2257 {
2258 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2259
2260 WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
2261
2262 cpuc->txn_flags = txn_flags;
2263 if (txn_flags & ~PERF_PMU_TXN_ADD)
2264 return;
2265
2266 perf_pmu_disable(pmu);
2267 __this_cpu_write(cpu_hw_events.n_txn, 0);
2268 __this_cpu_write(cpu_hw_events.n_txn_pair, 0);
2269 __this_cpu_write(cpu_hw_events.n_txn_metric, 0);
2270 }
2271
2272 /*
2273 * Stop group events scheduling transaction
2274 * Clear the flag and pmu::enable() will perform the
2275 * schedulability test.
2276 */
2277 static void x86_pmu_cancel_txn(struct pmu *pmu)
2278 {
2279 unsigned int txn_flags;
2280 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2281
2282 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2283
2284 txn_flags = cpuc->txn_flags;
2285 cpuc->txn_flags = 0;
2286 if (txn_flags & ~PERF_PMU_TXN_ADD)
2287 return;
2288
2289 /*
2290 * Truncate collected array by the number of events added in this
2291 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
2292 */
2293 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
2294 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
2295 __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
2296 __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
2297 perf_pmu_enable(pmu);
2298 }
2299
2300 /*
2301 * Commit group events scheduling transaction
2302 * Perform the group schedulability test as a whole
2303 * Return 0 if success
2304 *
2305 * Does not cancel the transaction on failure; expects the caller to do this.
2306 */
2307 static int x86_pmu_commit_txn(struct pmu *pmu)
2308 {
2309 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2310 int assign[X86_PMC_IDX_MAX];
2311 int n, ret;
2312
2313 WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
2314
2315 if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
2316 cpuc->txn_flags = 0;
2317 return 0;
2318 }
2319
2320 n = cpuc->n_events;
2321
2322 if (!x86_pmu_initialized())
2323 return -EAGAIN;
2324
2325 ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
2326 if (ret)
2327 return ret;
2328
2329 /*
2330 * Copy the new assignment now that we know it is possible;
2331 * it will be used by hw_perf_enable().
2332 */
2333 memcpy(cpuc->assign, assign, n*sizeof(int));
2334
2335 cpuc->txn_flags = 0;
2336 perf_pmu_enable(pmu);
2337 return 0;
2338 }
2339 /*
2340 * A fake_cpuc is used to validate event groups. Due to
2341 * the extra_reg logic, we also need to allocate a fake
2342 * per-core and per-CPU structure. Otherwise, group events
2343 * using an extra_reg may conflict without the kernel being
2344 * able to catch this when the last event gets added to
2345 * the group.
2346 */
2347 static void free_fake_cpuc(struct cpu_hw_events *cpuc)
2348 {
2349 intel_cpuc_finish(cpuc);
2350 kfree(cpuc);
2351 }
2352
2353 static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
2354 {
2355 struct cpu_hw_events *cpuc;
2356 int cpu;
2357
2358 cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
2359 if (!cpuc)
2360 return ERR_PTR(-ENOMEM);
2361 cpuc->is_fake = 1;
2362
2363 if (is_hybrid()) {
2364 struct x86_hybrid_pmu *h_pmu;
2365
2366 h_pmu = hybrid_pmu(event_pmu);
2367 if (cpumask_empty(&h_pmu->supported_cpus))
2368 goto error;
2369 cpu = cpumask_first(&h_pmu->supported_cpus);
2370 } else
2371 cpu = raw_smp_processor_id();
2372 cpuc->pmu = event_pmu;
2373
2374 if (intel_cpuc_prepare(cpuc, cpu))
2375 goto error;
2376
2377 return cpuc;
2378 error:
2379 free_fake_cpuc(cpuc);
2380 return ERR_PTR(-ENOMEM);
2381 }
2382
2383 /*
2384 * validate that we can schedule this event
2385 */
2386 static int validate_event(struct perf_event *event)
2387 {
2388 struct cpu_hw_events *fake_cpuc;
2389 struct event_constraint *c;
2390 int ret = 0;
2391
2392 fake_cpuc = allocate_fake_cpuc(event->pmu);
2393 if (IS_ERR(fake_cpuc))
2394 return PTR_ERR(fake_cpuc);
2395
2396 c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
2397
2398 if (!c || !c->weight)
2399 ret = -EINVAL;
2400
2401 if (x86_pmu.put_event_constraints)
2402 x86_pmu.put_event_constraints(fake_cpuc, event);
2403
2404 free_fake_cpuc(fake_cpuc);
2405
2406 return ret;
2407 }
2408
2409 /*
2410 * validate a single event group
2411 *
2412 * validation includes:
2413 * - check events are compatible with each other
2414 * - events do not compete for the same counter
2415 * - number of events <= number of counters
2416 *
2417 * validation ensures the group can be loaded onto the
2418 * PMU if it was the only group available.
2419 */
2420 static int validate_group(struct perf_event *event)
2421 {
2422 struct perf_event *leader = event->group_leader;
2423 struct cpu_hw_events *fake_cpuc;
2424 int ret = -EINVAL, n;
2425
2426 /*
2427 * Reject events from different hybrid PMUs.
2428 */
2429 if (is_hybrid()) {
2430 struct perf_event *sibling;
2431 struct pmu *pmu = NULL;
2432
2433 if (is_x86_event(leader))
2434 pmu = leader->pmu;
2435
2436 for_each_sibling_event(sibling, leader) {
2437 if (!is_x86_event(sibling))
2438 continue;
2439 if (!pmu)
2440 pmu = sibling->pmu;
2441 else if (pmu != sibling->pmu)
2442 return ret;
2443 }
2444 }
2445
2446 fake_cpuc = allocate_fake_cpuc(event->pmu);
2447 if (IS_ERR(fake_cpuc))
2448 return PTR_ERR(fake_cpuc);
2449 /*
2450 * The event is not yet connected with its
2451 * siblings, therefore we must first collect the
2452 * existing siblings, then add the new event
2453 * before we can simulate the scheduling.
2454 */
2455 n = collect_events(fake_cpuc, leader, true);
2456 if (n < 0)
2457 goto out;
2458
2459 fake_cpuc->n_events = n;
2460 n = collect_events(fake_cpuc, event, false);
2461 if (n < 0)
2462 goto out;
2463
2464 fake_cpuc->n_events = 0;
2465 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
2466
2467 out:
2468 free_fake_cpuc(fake_cpuc);
2469 return ret;
2470 }
2471
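/*
 * pmu::event_init: accept RAW events for this PMU plus the generic
 * HARDWARE/HW_CACHE types, reject CPUs owned by another hybrid PMU,
 * then validate the event (or its whole group) against a fake cpuc.
 */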
2472 static int x86_pmu_event_init(struct perf_event *event)
2473 {
2474 struct x86_hybrid_pmu *pmu = NULL;
2475 int err;
2476
2477 if ((event->attr.type != event->pmu->type) &&
2478 (event->attr.type != PERF_TYPE_HARDWARE) &&
2479 (event->attr.type != PERF_TYPE_HW_CACHE))
2480 return -ENOENT;
2481
2482 if (is_hybrid() && (event->cpu != -1)) {
2483 pmu = hybrid_pmu(event->pmu);
2484 if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
2485 return -ENOENT;
2486 }
2487
2488 err = __x86_pmu_event_init(event);
2489 if (!err) {
2490 if (event->group_leader != event)
2491 err = validate_group(event);
2492 else
2493 err = validate_event(event);
2494 }
2495 if (err) {
2496 if (event->destroy)
2497 event->destroy(event);
2498 event->destroy = NULL;
2499 }
2500
2501 if (READ_ONCE(x86_pmu.attr_rdpmc) &&
2502 !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
2503 event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
2504
2505 return err;
2506 }
2507
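/*
 * Zero counters that were used earlier but are no longer assigned, so
 * that a task allowed to use RDPMC cannot observe stale values left
 * behind by other contexts.
 */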
2508 void perf_clear_dirty_counters(void)
2509 {
2510 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
2511 int i;
2512
2513 /* Don't need to clear the assigned counter. */
2514 for (i = 0; i < cpuc->n_events; i++)
2515 __clear_bit(cpuc->assign[i], cpuc->dirty);
2516
2517 if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
2518 return;
2519
2520 for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
2521 if (i >= INTEL_PMC_IDX_FIXED) {
2522 /* Metrics and fake events don't have corresponding HW counters. */
2523 if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
2524 continue;
2525
2526 wrmsrq(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
2527 } else {
2528 wrmsrq(x86_pmu_event_addr(i), 0);
2529 }
2530 }
2531
2532 bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
2533 }
2534
2535 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
2536 {
2537 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
2538 return;
2539
2540 /*
2541 * This function relies on not being called concurrently in two
2542 * tasks in the same mm. Otherwise one task could observe
2543 * perf_rdpmc_allowed > 1 and return all the way back to
2544 * userspace with CR4.PCE clear while another task is still
2545 * doing on_each_cpu_mask() to propagate CR4.PCE.
2546 *
2547 * For now, this can't happen because all callers hold mmap_lock
2548 * for write. If this changes, we'll need a different solution.
2549 */
2550 mmap_assert_write_locked(mm);
2551
2552 if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
2553 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2554 }
2555
2556 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
2557 {
2558 if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
2559 return;
2560
2561 if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
2562 on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
2563 }
2564
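/*
 * pmu::event_idx: the 1-based RDPMC index exported to user space via
 * the mmap'ed page; 0 means the event cannot be read with RDPMC.
 */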
2565 static int x86_pmu_event_idx(struct perf_event *event)
2566 {
2567 struct hw_perf_event *hwc = &event->hw;
2568
2569 if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
2570 return 0;
2571
2572 if (is_metric_idx(hwc->idx))
2573 return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
2574 else
2575 return hwc->event_base_rdpmc + 1;
2576 }
2577
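/*
 * The "rdpmc" sysfs attribute controls user-space RDPMC access:
 *   0 - never allowed
 *   1 - allowed for tasks with an active, mmap'ed perf event (default)
 *   2 - always allowed
 *
 * Illustrative usage: echo 2 > /sys/bus/event_source/devices/cpu/rdpmc
 */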
2578 static ssize_t get_attr_rdpmc(struct device *cdev,
2579 struct device_attribute *attr,
2580 char *buf)
2581 {
2582 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
2583 }
2584
2585 static ssize_t set_attr_rdpmc(struct device *cdev,
2586 struct device_attribute *attr,
2587 const char *buf, size_t count)
2588 {
2589 static DEFINE_MUTEX(rdpmc_mutex);
2590 unsigned long val;
2591 ssize_t ret;
2592
2593 ret = kstrtoul(buf, 0, &val);
2594 if (ret)
2595 return ret;
2596
2597 if (val > 2)
2598 return -EINVAL;
2599
2600 if (x86_pmu.attr_rdpmc_broken)
2601 return -ENOTSUPP;
2602
2603 guard(mutex)(&rdpmc_mutex);
2604
2605 if (val != x86_pmu.attr_rdpmc) {
2606 /*
2607 * Changing into or out of never available or always available,
2608 * aka perf-event-bypassing mode. This path is extremely slow,
2609 * but only root can trigger it, so it's okay.
2610 */
2611 if (val == 0)
2612 static_branch_inc(&rdpmc_never_available_key);
2613 else if (x86_pmu.attr_rdpmc == 0)
2614 static_branch_dec(&rdpmc_never_available_key);
2615
2616 if (val == 2)
2617 static_branch_inc(&rdpmc_always_available_key);
2618 else if (x86_pmu.attr_rdpmc == 2)
2619 static_branch_dec(&rdpmc_always_available_key);
2620
2621 on_each_cpu(cr4_update_pce, NULL, 1);
2622 x86_pmu.attr_rdpmc = val;
2623 }
2624
2625 return count;
2626 }
2627
2628 static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
2629
2630 static struct attribute *x86_pmu_attrs[] = {
2631 &dev_attr_rdpmc.attr,
2632 NULL,
2633 };
2634
2635 static struct attribute_group x86_pmu_attr_group __ro_after_init = {
2636 .attrs = x86_pmu_attrs,
2637 };
2638
2639 static ssize_t max_precise_show(struct device *cdev,
2640 struct device_attribute *attr,
2641 char *buf)
2642 {
2643 struct pmu *pmu = dev_get_drvdata(cdev);
2644
2645 return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu));
2646 }
2647
2648 static DEVICE_ATTR_RO(max_precise);
2649
2650 static struct attribute *x86_pmu_caps_attrs[] = {
2651 &dev_attr_max_precise.attr,
2652 NULL
2653 };
2654
2655 static struct attribute_group x86_pmu_caps_group __ro_after_init = {
2656 .name = "caps",
2657 .attrs = x86_pmu_caps_attrs,
2658 };
2659
2660 static const struct attribute_group *x86_pmu_attr_groups[] = {
2661 &x86_pmu_attr_group,
2662 &x86_pmu_format_group,
2663 &x86_pmu_events_group,
2664 &x86_pmu_caps_group,
2665 NULL,
2666 };
2667
2668 static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
2669 struct task_struct *task, bool sched_in)
2670 {
2671 static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in);
2672 }
2673
2674 void perf_check_microcode(void)
2675 {
2676 if (x86_pmu.check_microcode)
2677 x86_pmu.check_microcode();
2678 }
2679
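/*
 * Reject sample periods the hardware cannot honour: either the vendor
 * check_period() hook objects, or limit_period() would have to raise
 * the period above the requested value.
 */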
2680 static int x86_pmu_check_period(struct perf_event *event, u64 value)
2681 {
2682 if (x86_pmu.check_period && x86_pmu.check_period(event, value))
2683 return -EINVAL;
2684
2685 if (value && x86_pmu.limit_period) {
2686 s64 left = value;
2687 x86_pmu.limit_period(event, &left);
2688 if (left > value)
2689 return -EINVAL;
2690 }
2691
2692 return 0;
2693 }
2694
2695 static int x86_pmu_aux_output_match(struct perf_event *event)
2696 {
2697 if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
2698 return 0;
2699
2700 if (x86_pmu.aux_output_match)
2701 return x86_pmu.aux_output_match(event);
2702
2703 return 0;
2704 }
2705
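/*
 * pmu::filter: lets a hybrid PMU exclude CPUs it does not cover; with
 * no vendor hook the static call leaves @ret false and no CPU is
 * filtered out.
 */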
2706 static bool x86_pmu_filter(struct pmu *pmu, int cpu)
2707 {
2708 bool ret = false;
2709
2710 static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
2711
2712 return ret;
2713 }
2714
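/*
 * The architecture-level PMU object registered with the perf core.
 * Vendor specifics are reached indirectly through x86_pmu and the
 * static calls patched in x86_pmu_static_call_update().
 */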
2715 static struct pmu pmu = {
2716 .pmu_enable = x86_pmu_enable,
2717 .pmu_disable = x86_pmu_disable,
2718
2719 .attr_groups = x86_pmu_attr_groups,
2720
2721 .event_init = x86_pmu_event_init,
2722
2723 .event_mapped = x86_pmu_event_mapped,
2724 .event_unmapped = x86_pmu_event_unmapped,
2725
2726 .add = x86_pmu_add,
2727 .del = x86_pmu_del,
2728 .start = x86_pmu_start,
2729 .stop = x86_pmu_stop,
2730 .read = x86_pmu_read,
2731
2732 .start_txn = x86_pmu_start_txn,
2733 .cancel_txn = x86_pmu_cancel_txn,
2734 .commit_txn = x86_pmu_commit_txn,
2735
2736 .event_idx = x86_pmu_event_idx,
2737 .sched_task = x86_pmu_sched_task,
2738 .check_period = x86_pmu_check_period,
2739
2740 .aux_output_match = x86_pmu_aux_output_match,
2741
2742 .filter = x86_pmu_filter,
2743 };
2744
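/*
 * Fill in the self-monitoring fields of the mmap'ed user page so user
 * space can combine RDPMC with RDTSC. Roughly (see the description of
 * cap_user_time in include/uapi/linux/perf_event.h), user space
 * computes:
 *
 *   ns = time_offset + (tsc * time_mult) >> time_shift
 */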
2745 void arch_perf_update_userpage(struct perf_event *event,
2746 struct perf_event_mmap_page *userpg, u64 now)
2747 {
2748 struct cyc2ns_data data;
2749 u64 offset;
2750
2751 userpg->cap_user_time = 0;
2752 userpg->cap_user_time_zero = 0;
2753 userpg->cap_user_rdpmc =
2754 !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
2755 userpg->pmc_width = x86_pmu.cntval_bits;
2756
2757 if (!using_native_sched_clock() || !sched_clock_stable())
2758 return;
2759
2760 cyc2ns_read_begin(&data);
2761
2762 offset = data.cyc2ns_offset + __sched_clock_offset;
2763
2764 /*
2765 * Internal timekeeping for enabled/running/stopped times
2766 * is always in the local_clock domain.
2767 */
2768 userpg->cap_user_time = 1;
2769 userpg->time_mult = data.cyc2ns_mul;
2770 userpg->time_shift = data.cyc2ns_shift;
2771 userpg->time_offset = offset - now;
2772
2773 /*
2774 * cap_user_time_zero doesn't make sense when we're using a different
2775 * time base for the records.
2776 */
2777 if (!event->attr.use_clockid) {
2778 userpg->cap_user_time_zero = 1;
2779 userpg->time_zero = offset;
2780 }
2781
2782 cyc2ns_read_end();
2783 }
2784
2785 /*
2786 * Determine whether the regs were taken from an irq/exception handler rather
2787 * than from perf_arch_fetch_caller_regs().
2788 */
2789 static bool perf_hw_regs(struct pt_regs *regs)
2790 {
2791 return regs->flags & X86_EFLAGS_FIXED;
2792 }
2793
2794 void
2795 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2796 {
2797 struct unwind_state state;
2798 unsigned long addr;
2799
2800 if (perf_guest_state()) {
2801 /* TODO: We don't support guest OS callchains yet */
2802 return;
2803 }
2804
2805 if (perf_hw_regs(regs)) {
2806 if (perf_callchain_store(entry, regs->ip))
2807 return;
2808 unwind_start(&state, current, regs, NULL);
2809 } else {
2810 unwind_start(&state, current, NULL, (void *)regs->sp);
2811 }
2812
2813 for (; !unwind_done(&state); unwind_next_frame(&state)) {
2814 addr = unwind_get_return_address(&state);
2815 if (!addr || perf_callchain_store(entry, addr))
2816 return;
2817 }
2818 }
2819
2820 static inline int
2821 valid_user_frame(const void __user *fp, unsigned long size)
2822 {
2823 return __access_ok(fp, size);
2824 }
2825
2826 static unsigned long get_segment_base(unsigned int segment)
2827 {
2828 struct desc_struct *desc;
2829 unsigned int idx = segment >> 3;
2830
2831 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2832 #ifdef CONFIG_MODIFY_LDT_SYSCALL
2833 struct ldt_struct *ldt;
2834
2835 /*
2836 * If we're not in a valid context with a real (not just lazy)
2837 * user mm, then don't even try.
2838 */
2839 if (!nmi_uaccess_okay())
2840 return 0;
2841
2842 /* IRQs are off, so this synchronizes with smp_store_release */
2843 ldt = smp_load_acquire(&current->mm->context.ldt);
2844 if (!ldt || idx >= ldt->nr_entries)
2845 return 0;
2846
2847 desc = &ldt->entries[idx];
2848 #else
2849 return 0;
2850 #endif
2851 } else {
2852 if (idx >= GDT_ENTRIES)
2853 return 0;
2854
2855 desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2856 }
2857
2858 return get_desc_base(desc);
2859 }
2860
2861 #ifdef CONFIG_IA32_EMULATION
2862
2863 #include <linux/compat.h>
2864
2865 static inline int
2866 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2867 {
2868 /* 32-bit process in 64-bit kernel. */
2869 unsigned long ss_base, cs_base;
2870 struct stack_frame_ia32 frame;
2871 const struct stack_frame_ia32 __user *fp;
2872 u32 ret_addr;
2873
2874 if (user_64bit_mode(regs))
2875 return 0;
2876
2877 cs_base = get_segment_base(regs->cs);
2878 ss_base = get_segment_base(regs->ss);
2879
2880 fp = compat_ptr(ss_base + regs->bp);
2881 pagefault_disable();
2882
2883 /* see perf_callchain_user() below for why we do this */
2884 if (is_uprobe_at_func_entry(regs) &&
2885 !get_user(ret_addr, (const u32 __user *)regs->sp))
2886 perf_callchain_store(entry, ret_addr);
2887
2888 while (entry->nr < entry->max_stack) {
2889 if (!valid_user_frame(fp, sizeof(frame)))
2890 break;
2891
2892 if (__get_user(frame.next_frame, &fp->next_frame))
2893 break;
2894 if (__get_user(frame.return_address, &fp->return_address))
2895 break;
2896
2897 perf_callchain_store(entry, cs_base + frame.return_address);
2898 fp = compat_ptr(ss_base + frame.next_frame);
2899 }
2900 pagefault_enable();
2901 return 1;
2902 }
2903 #else
2904 static inline int
2905 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2906 {
2907 return 0;
2908 }
2909 #endif
2910
2911 void
2912 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2913 {
2914 struct stack_frame frame;
2915 const struct stack_frame __user *fp;
2916 unsigned long ret_addr;
2917
2918 if (perf_guest_state()) {
2919 /* TODO: We don't support guest OS callchains yet */
2920 return;
2921 }
2922
2923 /*
2924 * We don't know what to do with VM86 stacks... ignore them for now.
2925 */
2926 if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2927 return;
2928
2929 fp = (void __user *)regs->bp;
2930
2931 perf_callchain_store(entry, regs->ip);
2932
2933 if (!nmi_uaccess_okay())
2934 return;
2935
2936 if (perf_callchain_user32(regs, entry))
2937 return;
2938
2939 pagefault_disable();
2940
2941 /*
2942 * If we are called from a uprobe handler, and we are indeed at the
2943 * very entry of a user function (normally a `push %rbp` instruction,
2944 * assuming the application was compiled with frame pointers), we
2945 * should read the return address from *regs->sp before proceeding to
2946 * follow frame pointers; otherwise we would skip the immediate caller,
2947 * as %rbp is not yet set up.
2948 */
2949 if (is_uprobe_at_func_entry(regs) &&
2950 !get_user(ret_addr, (const unsigned long __user *)regs->sp))
2951 perf_callchain_store(entry, ret_addr);
2952
2953 while (entry->nr < entry->max_stack) {
2954 if (!valid_user_frame(fp, sizeof(frame)))
2955 break;
2956
2957 if (__get_user(frame.next_frame, &fp->next_frame))
2958 break;
2959 if (__get_user(frame.return_address, &fp->return_address))
2960 break;
2961
2962 perf_callchain_store(entry, frame.return_address);
2963 fp = (void __user *)frame.next_frame;
2964 }
2965 pagefault_enable();
2966 }
2967
2968 /*
2969 * Deal with code segment offsets for the various execution modes:
2970 *
2971 * VM86 - the good olde 16 bit days, where the linear address is
2972 * 20 bits and we use regs->ip + 0x10 * regs->cs.
2973 *
2974 * IA32 - Where we need to look at GDT/LDT segment descriptor tables
2975 * to figure out what the 32bit base address is.
2976 *
2977 * X32 - has TIF_X32 set, but is running in x86_64
2978 *
2979 * X86_64 - CS,DS,SS,ES are all zero based.
2980 */
2981 static unsigned long code_segment_base(struct pt_regs *regs)
2982 {
2983 /*
2984 * For IA32 we look at the GDT/LDT segment base to convert the
2985 * effective IP to a linear address.
2986 */
2987
2988 #ifdef CONFIG_X86_32
2989 /*
2990 * If we are in VM86 mode, add the segment offset to convert to a
2991 * linear address.
2992 */
2993 if (regs->flags & X86_VM_MASK)
2994 return 0x10 * regs->cs;
2995
2996 if (user_mode(regs) && regs->cs != __USER_CS)
2997 return get_segment_base(regs->cs);
2998 #else
2999 if (user_mode(regs) && !user_64bit_mode(regs) &&
3000 regs->cs != __USER32_CS)
3001 return get_segment_base(regs->cs);
3002 #endif
3003 return 0;
3004 }
3005
3006 unsigned long perf_arch_instruction_pointer(struct pt_regs *regs)
3007 {
3008 return regs->ip + code_segment_base(regs);
3009 }
3010
3011 static unsigned long common_misc_flags(struct pt_regs *regs)
3012 {
3013 if (regs->flags & PERF_EFLAGS_EXACT)
3014 return PERF_RECORD_MISC_EXACT_IP;
3015
3016 return 0;
3017 }
3018
3019 static unsigned long guest_misc_flags(struct pt_regs *regs)
3020 {
3021 unsigned long guest_state = perf_guest_state();
3022
3023 if (!(guest_state & PERF_GUEST_ACTIVE))
3024 return 0;
3025
3026 if (guest_state & PERF_GUEST_USER)
3027 return PERF_RECORD_MISC_GUEST_USER;
3028 else
3029 return PERF_RECORD_MISC_GUEST_KERNEL;
3030
3031 }
3032
3033 static unsigned long host_misc_flags(struct pt_regs *regs)
3034 {
3035 if (user_mode(regs))
3036 return PERF_RECORD_MISC_USER;
3037 else
3038 return PERF_RECORD_MISC_KERNEL;
3039 }
3040
3041 unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
3042 {
3043 unsigned long flags = common_misc_flags(regs);
3044
3045 flags |= guest_misc_flags(regs);
3046
3047 return flags;
3048 }
3049
3050 unsigned long perf_arch_misc_flags(struct pt_regs *regs)
3051 {
3052 unsigned long flags = common_misc_flags(regs);
3053
3054 flags |= host_misc_flags(regs);
3055
3056 return flags;
3057 }
3058
3059 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
3060 {
3061 /* This API doesn't currently support enumerating hybrid PMUs. */
3062 if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
3063 !x86_pmu_initialized()) {
3064 memset(cap, 0, sizeof(*cap));
3065 return;
3066 }
3067
3068 /*
3069 * Note, hybrid CPU models get tracked as having hybrid PMUs even when
3070 * all E-cores are disabled via BIOS. When E-cores are disabled, the
3071 * base PMU holds the correct number of counters for P-cores.
3072 */
3073 cap->version = x86_pmu.version;
3074 cap->num_counters_gp = x86_pmu_num_counters(NULL);
3075 cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL);
3076 cap->bit_width_gp = x86_pmu.cntval_bits;
3077 cap->bit_width_fixed = x86_pmu.cntval_bits;
3078 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
3079 cap->events_mask_len = x86_pmu.events_mask_len;
3080 cap->pebs_ept = x86_pmu.pebs_ept;
3081 }
3082 EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability);
3083
3084 u64 perf_get_hw_event_config(int hw_event)
3085 {
3086 int max = x86_pmu.max_events;
3087
3088 if (hw_event < max)
3089 return x86_pmu.event_map(array_index_nospec(hw_event, max));
3090
3091 return 0;
3092 }
3093 EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config);
3094