1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
4 *
5 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
6 *
7 * Authors:
8 * Avi Kivity <avi@redhat.com>
9 * Gleb Natapov <gleb@redhat.com>
10 * Wei Huang <wei@redhat.com>
11 */
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14 #include <linux/types.h>
15 #include <linux/kvm_host.h>
16 #include <linux/perf_event.h>
17 #include <linux/bsearch.h>
18 #include <linux/sort.h>
19 #include <asm/perf_event.h>
20 #include <asm/cpu_device_id.h>
21 #include "x86.h"
22 #include "cpuid.h"
23 #include "lapic.h"
24 #include "pmu.h"
25
26 /* This is enough to filter the vast majority of currently defined events. */
27 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
28
29 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
30 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
31
32 struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
33 EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
34
35 /* Precise Distribution of Instructions Retired (PDIR) */
36 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
37 X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
38 X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
39 /* Instruction-Accurate PDIR (PDIR++) */
40 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
41 {}
42 };
43
44 /* Precise Distribution (PDist) */
45 static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
46 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
47 {}
48 };
49
50 /* NOTE:
51 * - Each perf counter is defined as "struct kvm_pmc";
52 * - There are two types of perf counters: general purpose (gp) and fixed.
53 * gp counters are stored in gp_counters[] and fixed counters are stored
54 * in fixed_counters[] respectively. Both of them are part of "struct
55 * kvm_pmu";
56 * - pmu.c understands the difference between gp counters and fixed counters.
57 * However, AMD doesn't support fixed counters;
58 * - There are three types of index to access perf counters (PMC):
59 * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
60 * has MSR_K7_PERFCTRn and, for families 15H and later,
61 * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
62 * aliased to MSR_K7_PERFCTRn.
63 * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
64 * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
65 * C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
66 * that it also supports fixed counters. idx can be used as an index into
67 * the gp and fixed counters.
68 * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
69 * code. Each pmc, stored in kvm_pmc.idx field, is unique across
70 * all perf counters (both gp and fixed). The mapping relationship
71 * between pmc and perf counters is as the following:
72 * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
73 * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
74 * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
75 * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
76 */
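/*
 * For example (assuming KVM_FIXED_PMC_BASE_IDX == 32, as on current Intel
 * setups): Intel fixed counter 0 is accessed via MSR_CORE_PERF_FIXED_CTR0
 * (msr), selected by RDPMC with bit 30 set in ECX (idx), and stored at
 * kvm_pmc.idx == 32 (pmc).
 */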
77
78 static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
79
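/*
 * Define a NULL static call for every vendor PMU hook listed in
 * kvm-x86-pmu-ops.h; kvm_pmu_ops_update() below patches each call site to
 * the vendor (Intel or AMD) implementation.
 */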
80 #define KVM_X86_PMU_OP(func) \
81 DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
82 *(((struct kvm_pmu_ops *)0)->func));
83 #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
84 #include <asm/kvm-x86-pmu-ops.h>
85
86 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
87 {
88 memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
89
90 #define __KVM_X86_PMU_OP(func) \
91 static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
92 #define KVM_X86_PMU_OP(func) \
93 WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
94 #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
95 #include <asm/kvm-x86-pmu-ops.h>
96 #undef __KVM_X86_PMU_OP
97 }
98
99 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
100 {
101 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
102 bool skip_pmi = false;
103
104 if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
105 if (!in_pmi) {
106 /*
107 * TODO: KVM currently _chooses_ not to generate PEBS records for
108 * emulated instructions, which avoids a BUFFER_OVF PMI when there
109 * are no records. Strictly speaking, records should also be
110 * generated in the right context to improve sampling accuracy.
111 */
112 skip_pmi = true;
113 } else {
114 /* Indicate PEBS overflow PMI to guest. */
115 skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
116 (unsigned long *)&pmu->global_status);
117 }
118 } else {
119 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
120 }
121
122 if (pmc->intr && !skip_pmi)
123 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
124 }
125
126 static void kvm_perf_overflow(struct perf_event *perf_event,
127 struct perf_sample_data *data,
128 struct pt_regs *regs)
129 {
130 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
131
132 /*
133 * Ignore asynchronous overflow events for counters that are scheduled
134 * to be reprogrammed, e.g. if a PMI for the previous event races with
135 * KVM's handling of a related guest WRMSR.
136 */
137 if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
138 return;
139
140 __kvm_perf_overflow(pmc, true);
141
142 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
143 }
144
145 static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
146 {
147 /*
148 * For certain model-specific PEBS counters with special capabilities
149 * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise
150 * level to the maximum value (currently 3, for backwards compatibility)
151 * so that the perf subsystem assigns a hardware counter with that
152 * capability to the vPMC.
153 */
154 if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
155 (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
156 return 3;
157
158 /*
159 * A non-zero precision level turns an ordinary guest event into a
160 * guest PEBS event, which causes the host PEBS PMI handler to
161 * determine whether a PEBS overflow PMI comes from the host counters
162 * or the guest.
163 */
164 return 1;
165 }
166
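/*
 * perf counts up and fires on overflow, so the sample period is the distance
 * from the guest-visible counter value to the overflow point. E.g. for a
 * 48-bit counter, a counter_value of 0xffffffffeffe yields a period of
 * 0x1002, and a counter_value of 0 yields a full period of 2^48.
 */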
167 static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
168 {
169 u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
170
171 if (!sample_period)
172 sample_period = pmc_bitmask(pmc) + 1;
173 return sample_period;
174 }
175
176 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
177 bool exclude_user, bool exclude_kernel,
178 bool intr)
179 {
180 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
181 struct perf_event *event;
182 struct perf_event_attr attr = {
183 .type = type,
184 .size = sizeof(attr),
185 .pinned = true,
186 .exclude_idle = true,
187 .exclude_host = 1,
188 .exclude_user = exclude_user,
189 .exclude_kernel = exclude_kernel,
190 .config = config,
191 };
192 bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
193
194 attr.sample_period = get_sample_period(pmc, pmc->counter);
195
196 if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
197 (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
198 /*
199 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
200 * period. Just clear the sample period so at least
201 * allocating the counter doesn't fail.
202 */
203 attr.sample_period = 0;
204 }
205 if (pebs) {
206 /*
207 * For most PEBS hardware events, the difference in the software
208 * precision levels of guest and host PEBS events will not affect
209 * the accuracy of the PEBS profiling result, because the "event IP"
210 * in the PEBS record is calibrated on the guest side.
211 */
212 attr.precise_ip = pmc_get_pebs_precise_level(pmc);
213 }
214
215 event = perf_event_create_kernel_counter(&attr, -1, current,
216 kvm_perf_overflow, pmc);
217 if (IS_ERR(event)) {
218 pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
219 PTR_ERR(event), pmc->idx);
220 return PTR_ERR(event);
221 }
222
223 pmc->perf_event = event;
224 pmc_to_pmu(pmc)->event_count++;
225 pmc->is_paused = false;
226 pmc->intr = intr || pebs;
227 return 0;
228 }
229
230 static bool pmc_pause_counter(struct kvm_pmc *pmc)
231 {
232 u64 counter = pmc->counter;
233 u64 prev_counter;
234
235 /* update counter, reset event value to avoid redundant accumulation */
236 if (pmc->perf_event && !pmc->is_paused)
237 counter += perf_event_pause(pmc->perf_event, true);
238
239 /*
240 * Snapshot the previous counter *after* accumulating state from perf.
241 * If overflow already happened, hardware (via perf) is responsible for
242 * generating a PMI. KVM just needs to detect overflow on emulated
243 * counter events that haven't yet been processed.
244 */
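/*
 * E.g. a 48-bit counter paused at 0xfffffffffffe with an emulated count of
 * 3 wraps to 0x1, which is less than prev_counter and signals an overflow
 * that KVM must emulate itself.
 */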
245 prev_counter = counter & pmc_bitmask(pmc);
246
247 counter += pmc->emulated_counter;
248 pmc->counter = counter & pmc_bitmask(pmc);
249
250 pmc->emulated_counter = 0;
251 pmc->is_paused = true;
252
253 return pmc->counter < prev_counter;
254 }
255
256 static bool pmc_resume_counter(struct kvm_pmc *pmc)
257 {
258 if (!pmc->perf_event)
259 return false;
260
261 /* recalibrate sample period and check if it's accepted by perf core */
262 if (is_sampling_event(pmc->perf_event) &&
263 perf_event_period(pmc->perf_event,
264 get_sample_period(pmc, pmc->counter)))
265 return false;
266
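/*
 * Don't reuse the cached perf_event if the counter has moved into or out of
 * the guest's PEBS_ENABLE bitmap; the event's precise_ip no longer matches
 * and the event must be recreated.
 */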
267 if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
268 (!!pmc->perf_event->attr.precise_ip))
269 return false;
270
271 /* Reuse the perf_event to serve the pmc, as pmc_reprogram_counter() does. */
272 perf_event_enable(pmc->perf_event);
273 pmc->is_paused = false;
274
275 return true;
276 }
277
278 static void pmc_release_perf_event(struct kvm_pmc *pmc)
279 {
280 if (pmc->perf_event) {
281 perf_event_release_kernel(pmc->perf_event);
282 pmc->perf_event = NULL;
283 pmc->current_config = 0;
284 pmc_to_pmu(pmc)->event_count--;
285 }
286 }
287
288 static void pmc_stop_counter(struct kvm_pmc *pmc)
289 {
290 if (pmc->perf_event) {
291 pmc->counter = pmc_read_counter(pmc);
292 pmc_release_perf_event(pmc);
293 }
294 }
295
296 static void pmc_update_sample_period(struct kvm_pmc *pmc)
297 {
298 if (!pmc->perf_event || pmc->is_paused ||
299 !is_sampling_event(pmc->perf_event))
300 return;
301
302 perf_event_period(pmc->perf_event,
303 get_sample_period(pmc, pmc->counter));
304 }
305
306 void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
307 {
308 /*
309 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
310 * read-modify-write. Adjust the counter value so that its value is
311 * relative to the current count, as reading the current count from
312 * perf is faster than pausing and reprogramming the event in order to
313 * reset it to '0'. Note, this very sneakily offsets the accumulated
314 * emulated count too, by using pmc_read_counter()!
315 */
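/*
 * E.g. a guest write of 0 leaves pmc->counter at -N relative to perf's
 * running count of N, so a subsequent pmc_read_counter() returns 0 without
 * reprogramming the perf event.
 */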
316 pmc->emulated_counter = 0;
317 pmc->counter += val - pmc_read_counter(pmc);
318 pmc->counter &= pmc_bitmask(pmc);
319 pmc_update_sample_period(pmc);
320 }
321 EXPORT_SYMBOL_GPL(pmc_write_counter);
322
323 static int filter_cmp(const void *pa, const void *pb, u64 mask)
324 {
325 u64 a = *(u64 *)pa & mask;
326 u64 b = *(u64 *)pb & mask;
327
328 return (a > b) - (a < b);
329 }
330
331
332 static int filter_sort_cmp(const void *pa, const void *pb)
333 {
334 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
335 KVM_PMU_MASKED_ENTRY_EXCLUDE));
336 }
337
338 /*
339 * For the event filter, searching is done on the 'includes' list and
340 * 'excludes' list separately rather than on the 'events' list (which
341 * has both). As a result the exclude bit can be ignored.
342 */
343 static int filter_event_cmp(const void *pa, const void *pb)
344 {
345 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
346 }
347
348 static int find_filter_index(u64 *events, u64 nevents, u64 key)
349 {
350 u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
351 filter_event_cmp);
352
353 if (!fe)
354 return -1;
355
356 return fe - events;
357 }
358
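/*
 * A masked entry carries a umask mask (KVM_PMU_MASKED_ENTRY_UMASK_MASK) and
 * a umask match (bits 15:8): e.g. umask_mask == 0xf0 with umask_match == 0x20
 * matches any guest umask of the form 0x2N.
 */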
359 static bool is_filter_entry_match(u64 filter_event, u64 umask)
360 {
361 u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
362 u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
363
364 BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
365 (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
366 ARCH_PERFMON_EVENTSEL_UMASK);
367
368 return (umask & mask) == match;
369 }
370
371 static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
372 {
373 u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
374 u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
375 int i, index;
376
377 index = find_filter_index(events, nevents, event_select);
378 if (index < 0)
379 return false;
380
381 /*
382 * Entries are sorted by the event select. Walk the list in both
383 * directions to process all entries with the targeted event select.
384 */
385 for (i = index; i < nevents; i++) {
386 if (filter_event_cmp(&events[i], &event_select))
387 break;
388
389 if (is_filter_entry_match(events[i], umask))
390 return true;
391 }
392
393 for (i = index - 1; i >= 0; i--) {
394 if (filter_event_cmp(&events[i], &event_select))
395 break;
396
397 if (is_filter_entry_match(events[i], umask))
398 return true;
399 }
400
401 return false;
402 }
403
404 static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
405 u64 eventsel)
406 {
407 if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
408 !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
409 return f->action == KVM_PMU_EVENT_ALLOW;
410
411 return f->action == KVM_PMU_EVENT_DENY;
412 }
413
414 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
415 int idx)
416 {
417 int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
418
419 if (filter->action == KVM_PMU_EVENT_DENY &&
420 test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
421 return false;
422 if (filter->action == KVM_PMU_EVENT_ALLOW &&
423 !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
424 return false;
425
426 return true;
427 }
428
429 static bool check_pmu_event_filter(struct kvm_pmc *pmc)
430 {
431 struct kvm_x86_pmu_event_filter *filter;
432 struct kvm *kvm = pmc->vcpu->kvm;
433
434 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
435 if (!filter)
436 return true;
437
438 if (pmc_is_gp(pmc))
439 return is_gp_event_allowed(filter, pmc->eventsel);
440
441 return is_fixed_event_allowed(filter, pmc->idx);
442 }
443
444 static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
445 {
446 return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
447 check_pmu_event_filter(pmc);
448 }
449
450 static int reprogram_counter(struct kvm_pmc *pmc)
451 {
452 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
453 u64 eventsel = pmc->eventsel;
454 u64 new_config = eventsel;
455 bool emulate_overflow;
456 u8 fixed_ctr_ctrl;
457
458 emulate_overflow = pmc_pause_counter(pmc);
459
460 if (!pmc_event_is_allowed(pmc))
461 return 0;
462
463 if (emulate_overflow)
464 __kvm_perf_overflow(pmc, false);
465
466 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
467 printk_once("kvm pmu: pin control bit is ignored\n");
468
469 if (pmc_is_fixed(pmc)) {
470 fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
471 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
472 if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
473 eventsel |= ARCH_PERFMON_EVENTSEL_OS;
474 if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
475 eventsel |= ARCH_PERFMON_EVENTSEL_USR;
476 if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
477 eventsel |= ARCH_PERFMON_EVENTSEL_INT;
478 new_config = (u64)fixed_ctr_ctrl;
479 }
480
481 if (pmc->current_config == new_config && pmc_resume_counter(pmc))
482 return 0;
483
484 pmc_release_perf_event(pmc);
485
486 pmc->current_config = new_config;
487
488 return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
489 (eventsel & pmu->raw_event_mask),
490 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
491 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
492 eventsel & ARCH_PERFMON_EVENTSEL_INT);
493 }
494
495 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
496 {
497 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
498 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
499 struct kvm_pmc *pmc;
500 int bit;
501
502 bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
503
504 /*
505 * The reprogramming bitmap can be written asynchronously by something
506 * other than the task that holds vcpu->mutex; take care to clear only
507 * the bits that will actually be processed.
508 */
509 BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
510 atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
511
512 kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
513 /*
514 * If reprogramming fails, e.g. due to contention, re-set the
515 * reprogram bit, i.e. opportunistically try again on the
516 * next PMU refresh. Don't make a new request as doing so can
517 * stall the guest if reprogramming repeatedly fails.
518 */
519 if (reprogram_counter(pmc))
520 set_bit(pmc->idx, pmu->reprogram_pmi);
521 }
522
523 /*
524 * Release unused perf_events if the corresponding guest MSRs weren't
525 * accessed during the last vCPU time slice (need_cleanup is set when
526 * the vCPU is scheduled back in).
527 */
528 if (unlikely(pmu->need_cleanup))
529 kvm_pmu_cleanup(vcpu);
530 }
531
532 int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
533 {
534 /*
535 * On Intel, VMX interception has priority over RDPMC exceptions that
536 * aren't already handled by the emulator, i.e. no additional checks
537 * are needed for Intel PMUs.
538 *
539 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
540 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
541 */
542 if (!kvm_pmu_ops.check_rdpmc_early)
543 return 0;
544
545 return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
546 }
547
548 bool is_vmware_backdoor_pmc(u32 pmc_idx)
549 {
550 switch (pmc_idx) {
551 case VMWARE_BACKDOOR_PMC_HOST_TSC:
552 case VMWARE_BACKDOOR_PMC_REAL_TIME:
553 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
554 return true;
555 }
556 return false;
557 }
558
559 static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
560 {
561 u64 ctr_val;
562
563 switch (idx) {
564 case VMWARE_BACKDOOR_PMC_HOST_TSC:
565 ctr_val = rdtsc();
566 break;
567 case VMWARE_BACKDOOR_PMC_REAL_TIME:
568 ctr_val = ktime_get_boottime_ns();
569 break;
570 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
571 ctr_val = ktime_get_boottime_ns() +
572 vcpu->kvm->arch.kvmclock_offset;
573 break;
574 default:
575 return 1;
576 }
577
578 *data = ctr_val;
579 return 0;
580 }
581
582 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
583 {
584 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
585 struct kvm_pmc *pmc;
586 u64 mask = ~0ull;
587
588 if (!pmu->version)
589 return 1;
590
591 if (is_vmware_backdoor_pmc(idx))
592 return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
593
594 pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
595 if (!pmc)
596 return 1;
597
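/* In protected mode, RDPMC at CPL > 0 #GPs unless CR4.PCE is set. */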
598 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
599 (kvm_x86_call(get_cpl)(vcpu) != 0) &&
600 kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
601 return 1;
602
603 *data = pmc_read_counter(pmc) & mask;
604 return 0;
605 }
606
607 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
608 {
609 if (lapic_in_kernel(vcpu)) {
610 kvm_pmu_call(deliver_pmi)(vcpu);
611 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
612 }
613 }
614
615 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
616 {
617 switch (msr) {
618 case MSR_CORE_PERF_GLOBAL_STATUS:
619 case MSR_CORE_PERF_GLOBAL_CTRL:
620 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
621 return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
622 default:
623 break;
624 }
625 return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
626 kvm_pmu_call(is_valid_msr)(vcpu, msr);
627 }
628
629 static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
630 {
631 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
632 struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
633
634 if (pmc)
635 __set_bit(pmc->idx, pmu->pmc_in_use);
636 }
637
638 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
639 {
640 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
641 u32 msr = msr_info->index;
642
643 switch (msr) {
644 case MSR_CORE_PERF_GLOBAL_STATUS:
645 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
646 msr_info->data = pmu->global_status;
647 break;
648 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
649 case MSR_CORE_PERF_GLOBAL_CTRL:
650 msr_info->data = pmu->global_ctrl;
651 break;
652 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
653 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
654 msr_info->data = 0;
655 break;
656 default:
657 return kvm_pmu_call(get_msr)(vcpu, msr_info);
658 }
659
660 return 0;
661 }
662
663 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
664 {
665 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
666 u32 msr = msr_info->index;
667 u64 data = msr_info->data;
668 u64 diff;
669
670 /*
671 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
672 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
673 */
674 switch (msr) {
675 case MSR_CORE_PERF_GLOBAL_STATUS:
676 if (!msr_info->host_initiated)
677 return 1; /* RO MSR */
678 fallthrough;
679 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
680 /* Per PPR, Read-only MSR. Writes are ignored. */
681 if (!msr_info->host_initiated)
682 break;
683
684 if (data & pmu->global_status_rsvd)
685 return 1;
686
687 pmu->global_status = data;
688 break;
689 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
690 data &= ~pmu->global_ctrl_rsvd;
691 fallthrough;
692 case MSR_CORE_PERF_GLOBAL_CTRL:
693 if (!kvm_valid_perf_global_ctrl(pmu, data))
694 return 1;
695
696 if (pmu->global_ctrl != data) {
697 diff = pmu->global_ctrl ^ data;
698 pmu->global_ctrl = data;
699 reprogram_counters(pmu, diff);
700 }
701 break;
702 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
703 /*
704 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
705 * GLOBAL_STATUS, and so the set of reserved bits is the same.
706 */
707 if (data & pmu->global_status_rsvd)
708 return 1;
709 fallthrough;
710 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
711 if (!msr_info->host_initiated)
712 pmu->global_status &= ~data;
713 break;
714 default:
715 kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
716 return kvm_pmu_call(set_msr)(vcpu, msr_info);
717 }
718
719 return 0;
720 }
721
722 static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
723 {
724 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
725 struct kvm_pmc *pmc;
726 int i;
727
728 pmu->need_cleanup = false;
729
730 bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
731
732 kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
733 pmc_stop_counter(pmc);
734 pmc->counter = 0;
735 pmc->emulated_counter = 0;
736
737 if (pmc_is_gp(pmc))
738 pmc->eventsel = 0;
739 }
740
741 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
742
743 kvm_pmu_call(reset)(vcpu);
744 }
745
746
747 /*
748 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
749 * and/or PERF_CAPABILITIES.
750 */
751 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
752 {
753 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
754
755 if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
756 return;
757
758 /*
759 * Stop/release all existing counters/events before realizing the new
760 * vPMU model.
761 */
762 kvm_pmu_reset(vcpu);
763
764 pmu->version = 0;
765 pmu->nr_arch_gp_counters = 0;
766 pmu->nr_arch_fixed_counters = 0;
767 pmu->counter_bitmask[KVM_PMC_GP] = 0;
768 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
769 pmu->reserved_bits = 0xffffffff00200000ull;
770 pmu->raw_event_mask = X86_RAW_EVENT_MASK;
771 pmu->global_ctrl_rsvd = ~0ull;
772 pmu->global_status_rsvd = ~0ull;
773 pmu->fixed_ctr_ctrl_rsvd = ~0ull;
774 pmu->pebs_enable_rsvd = ~0ull;
775 pmu->pebs_data_cfg_rsvd = ~0ull;
776 bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
777
778 if (!vcpu->kvm->arch.enable_pmu)
779 return;
780
781 kvm_pmu_call(refresh)(vcpu);
782
783 /*
784 * At RESET, both Intel and AMD CPUs set all enable bits for general
785 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software written
786 * for v1 PMUs doesn't unknowingly leave GP counters disabled
787 * in the global controls). Emulate that behavior when refreshing the
788 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
789 */
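/* E.g. with eight GP counters, global_ctrl resets to GENMASK(7, 0) == 0xff. */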
790 if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
791 pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
792 }
793
794 void kvm_pmu_init(struct kvm_vcpu *vcpu)
795 {
796 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
797
798 memset(pmu, 0, sizeof(*pmu));
799 kvm_pmu_call(init)(vcpu);
800 kvm_pmu_refresh(vcpu);
801 }
802
803 /* Release perf_events for vPMCs that have been unused for a full time slice. */
804 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
805 {
806 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
807 struct kvm_pmc *pmc = NULL;
808 DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
809 int i;
810
811 pmu->need_cleanup = false;
812
813 bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
814 pmu->pmc_in_use, X86_PMC_IDX_MAX);
815
816 kvm_for_each_pmc(pmu, pmc, i, bitmask) {
817 if (pmc->perf_event && !pmc_speculative_in_use(pmc))
818 pmc_stop_counter(pmc);
819 }
820
821 kvm_pmu_call(cleanup)(vcpu);
822
823 bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
824 }
825
826 void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
827 {
828 kvm_pmu_reset(vcpu);
829 }
830
831 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
832 {
833 pmc->emulated_counter++;
834 kvm_pmu_request_counter_reprogram(pmc);
835 }
836
837 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
838 {
839 bool select_os, select_user;
840 u64 config;
841
842 if (pmc_is_gp(pmc)) {
843 config = pmc->eventsel;
844 select_os = config & ARCH_PERFMON_EVENTSEL_OS;
845 select_user = config & ARCH_PERFMON_EVENTSEL_USR;
846 } else {
847 config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
848 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
849 select_os = config & INTEL_FIXED_0_KERNEL;
850 select_user = config & INTEL_FIXED_0_USER;
851 }
852
853 /*
854 * Skip the CPL lookup, which isn't free on Intel, if the result will
855 * be the same regardless of the CPL.
856 */
857 if (select_os == select_user)
858 return select_os;
859
860 return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
861 select_user;
862 }
863
864 void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
865 {
866 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
867 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
868 struct kvm_pmc *pmc;
869 int i;
870
871 BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
872
873 if (!kvm_pmu_has_perf_global_ctrl(pmu))
874 bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
875 else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
876 (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
877 return;
878
879 kvm_for_each_pmc(pmu, pmc, i, bitmap) {
880 /*
881 * Ignore checks for edge detect (all events currently emulated
882 * by KVM are always rising edges), pin control (unsupported
883 * by modern CPUs), and counter mask and its invert flag (KVM
884 * doesn't emulate multiple events in a single clock cycle).
885 *
886 * Note, the uppermost nibble of AMD's mask overlaps Intel's
887 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
888 * bits (bits 35:34). Checking the "in HLE/RTM transaction"
889 * flags is correct as the vCPU can't be in a transaction if
890 * KVM is emulating an instruction. Checking the reserved bits
891 * might be wrong if they are defined in the future, but so
892 * could ignoring them, so do the simple thing for now.
893 */
894 if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
895 !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
896 continue;
897
898 kvm_pmu_incr_counter(pmc);
899 }
900 }
901 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
902
903 static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
904 {
905 u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
906 KVM_PMU_MASKED_ENTRY_UMASK_MASK |
907 KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
908 KVM_PMU_MASKED_ENTRY_EXCLUDE;
909 int i;
910
911 for (i = 0; i < filter->nevents; i++) {
912 if (filter->events[i] & ~mask)
913 return false;
914 }
915
916 return true;
917 }
918
919 static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
920 {
921 int i, j;
922
923 for (i = 0, j = 0; i < filter->nevents; i++) {
924 /*
925 * Skip events that are impossible to match against a guest
926 * event. When filtering, only the event select + unit mask
927 * of the guest event is used. To maintain backwards
928 * compatibility, impossible filters can't be rejected :-(
929 */
930 if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
931 ARCH_PERFMON_EVENTSEL_UMASK))
932 continue;
933 /*
934 * Convert userspace events to a common in-kernel event so
935 * only one code path is needed to support both events. For
936 * the in-kernel events use masked events because they are
937 * flexible enough to handle both cases. To convert to masked
938 * events, all that's needed is to add an "all ones" umask_mask
939 * (unmasked filter events don't support EXCLUDE).
940 */
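/*
 * E.g. an unmasked entry for event select 0xc0 with umask 0x00 becomes a
 * masked entry with umask_mask == 0xff and umask_match == 0x00, i.e. it
 * matches only a guest umask of exactly 0x00.
 */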
941 filter->events[j++] = filter->events[i] |
942 (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
943 }
944
945 filter->nevents = j;
946 }
947
948 static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
949 {
950 int i;
951
952 if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
953 convert_to_masked_filter(filter);
954 else if (!is_masked_filter_valid(filter))
955 return -EINVAL;
956
957 /*
958 * Sort entries by event select and includes vs. excludes so that all
959 * entries for a given event select can be processed efficiently during
960 * filtering. The EXCLUDE flag uses a more significant bit than the
961 * event select, and so the sorted list is also effectively split into
962 * includes and excludes sub-lists.
963 */
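/*
 * E.g. after sorting, includes for events 0x3c and 0xc0 precede an exclude
 * for event 0x3c, so nr_includes == 2 and nr_excludes == 1.
 */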
964 sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
965 filter_sort_cmp, NULL);
966
967 i = filter->nevents;
968 /* Find the first EXCLUDE event (only supported for masked events). */
969 if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
970 for (i = 0; i < filter->nevents; i++) {
971 if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
972 break;
973 }
974 }
975
976 filter->nr_includes = i;
977 filter->nr_excludes = filter->nevents - filter->nr_includes;
978 filter->includes = filter->events;
979 filter->excludes = filter->events + filter->nr_includes;
980
981 return 0;
982 }
983
984 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
985 {
986 struct kvm_pmu_event_filter __user *user_filter = argp;
987 struct kvm_x86_pmu_event_filter *filter;
988 struct kvm_pmu_event_filter tmp;
989 struct kvm_vcpu *vcpu;
990 unsigned long i;
991 size_t size;
992 int r;
993
994 if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
995 return -EFAULT;
996
997 if (tmp.action != KVM_PMU_EVENT_ALLOW &&
998 tmp.action != KVM_PMU_EVENT_DENY)
999 return -EINVAL;
1000
1001 if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
1002 return -EINVAL;
1003
1004 if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
1005 return -E2BIG;
1006
1007 size = struct_size(filter, events, tmp.nevents);
1008 filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
1009 if (!filter)
1010 return -ENOMEM;
1011
1012 filter->action = tmp.action;
1013 filter->nevents = tmp.nevents;
1014 filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
1015 filter->flags = tmp.flags;
1016
1017 r = -EFAULT;
1018 if (copy_from_user(filter->events, user_filter->events,
1019 sizeof(filter->events[0]) * filter->nevents))
1020 goto cleanup;
1021
1022 r = prepare_filter_lists(filter);
1023 if (r)
1024 goto cleanup;
1025
1026 mutex_lock(&kvm->lock);
1027 filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
1028 mutex_is_locked(&kvm->lock));
1029 mutex_unlock(&kvm->lock);
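/*
 * Wait for in-flight readers of the old filter to finish; the old filter
 * (now held in 'filter') is freed via the cleanup label below.
 */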
1030 synchronize_srcu_expedited(&kvm->srcu);
1031
1032 BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
1033 sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
1034
1035 kvm_for_each_vcpu(i, vcpu, kvm)
1036 atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
1037
1038 kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
1039
1040 r = 0;
1041 cleanup:
1042 kfree(filter);
1043 return r;
1044 }
1045