1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
4 *
5 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
6 *
7 * Authors:
8 * Avi Kivity <avi@redhat.com>
9 * Gleb Natapov <gleb@redhat.com>
10 * Wei Huang <wei@redhat.com>
11 */
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14 #include <linux/types.h>
15 #include <linux/kvm_host.h>
16 #include <linux/perf_event.h>
17 #include <linux/bsearch.h>
18 #include <linux/sort.h>
19 #include <asm/perf_event.h>
20 #include <asm/cpu_device_id.h>
21 #include "x86.h"
22 #include "cpuid.h"
23 #include "lapic.h"
24 #include "pmu.h"
25
26 /* This is enough to filter the vast majority of currently defined events. */
27 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
28
29 /* Unadulterated PMU capabilities of the host, i.e. of hardware. */
30 static struct x86_pmu_capability __read_mostly kvm_host_pmu;
31
32 /* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */
33 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
34 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap);
35
36 struct kvm_pmu_emulated_event_selectors {
37 u64 INSTRUCTIONS_RETIRED;
38 u64 BRANCH_INSTRUCTIONS_RETIRED;
39 };
40 static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
41
42 /* Precise Distribution of Instructions Retired (PDIR) */
43 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
44 X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
45 X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
46 /* Instruction-Accurate PDIR (PDIR++) */
47 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
48 {}
49 };
50
51 /* Precise Distribution (PDist) */
52 static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
53 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
54 {}
55 };
56
57 /* NOTE:
58 * - Each perf counter is defined as "struct kvm_pmc";
59 * - There are two types of perf counters: general purpose (gp) and fixed.
60 * gp counters are stored in gp_counters[] and fixed counters are stored
61 * in fixed_counters[] respectively. Both of them are part of "struct
62 * kvm_pmu";
63 * - pmu.c understands the difference between gp counters and fixed counters.
64 * However, AMD doesn't support fixed counters;
65 * - There are three types of index to access perf counters (PMC):
66 * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
67 * has MSR_K7_PERFCTRn and, for families 15H and later,
68 * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
69 * aliased to MSR_K7_PERFCTRn.
70 * 2. MSR Index (named idx): This is normally used by the RDPMC instruction.
71 * For instance the AMD RDPMC instruction uses 0000_0003h in ECX to access
72 * C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
73 * that it also supports fixed counters. idx can be used as an index to
74 * gp and fixed counters.
75 * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
76 * code. Each pmc, stored in kvm_pmc.idx field, is unique across
77 * all perf counters (both gp and fixed). The mapping relationship
78 * between pmc and perf counters is as the following:
79 * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
80 * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
81 * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
82 * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
83 */
84
85 static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
86
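/*
 * Declare a NULL static call for each vendor PMU op; the calls are patched to
 * the vendor implementation by kvm_pmu_ops_update().
 */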
87 #define KVM_X86_PMU_OP(func) \
88 DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
89 *(((struct kvm_pmu_ops *)0)->func));
90 #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
91 #include <asm/kvm-x86-pmu-ops.h>
92
93 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
94 {
95 memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
96
97 #define __KVM_X86_PMU_OP(func) \
98 static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
99 #define KVM_X86_PMU_OP(func) \
100 WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
101 #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
102 #include <asm/kvm-x86-pmu-ops.h>
103 #undef __KVM_X86_PMU_OP
104 }
105
106 void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
107 {
108 bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
109 int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
110
111 /*
112 * Hybrid PMUs don't play nice with virtualization without careful
113 * configuration by userspace, and KVM's APIs for reporting supported
114 * vPMU features do not account for hybrid PMUs. Disable vPMU support
115 * for hybrid PMUs until KVM gains a way to let userspace opt-in.
116 */
117 if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
118 enable_pmu = false;
119 memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu));
120 } else {
121 perf_get_x86_pmu_capability(&kvm_host_pmu);
122 }
123
124 if (enable_pmu) {
125 /*
126 * WARN if perf did NOT disable the hardware PMU even though the
127 * architecturally required number of GP counters isn't present,
128 * i.e. if there is a non-zero number of counters, but fewer than
129 * what is architecturally required.
130 */
131 if (!kvm_host_pmu.num_counters_gp ||
132 WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs))
133 enable_pmu = false;
134 else if (is_intel && !kvm_host_pmu.version)
135 enable_pmu = false;
136 }
137
138 if (!enable_pmu) {
139 memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
140 return;
141 }
142
143 memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu));
144 kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
145 kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
146 pmu_ops->MAX_NR_GP_COUNTERS);
147 kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
148 KVM_MAX_NR_FIXED_COUNTERS);
149
150 kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
151 perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
152 kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
153 perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
154 }
155
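/*
 * Record overflow of the vPMC in the vPMU's global status and pend a PMI for
 * the guest, unless the PMI should be skipped, e.g. for PEBS counters the PMI
 * is suppressed when no PEBS record is generated (emulated instructions) or
 * when a BUFFER_OVF PMI is already pending.
 */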
156 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
157 {
158 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
159 bool skip_pmi = false;
160
161 if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
162 if (!in_pmi) {
163 /*
164 * TODO: KVM is currently _choosing_ to not generate records for
165 * emulated instructions, which avoids a BUFFER_OVF PMI when there
166 * are no records. Strictly speaking, records should be generated
167 * for emulated instructions too, in the right context, to improve sampling accuracy.
168 */
169 skip_pmi = true;
170 } else {
171 /* Indicate PEBS overflow PMI to guest. */
172 skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
173 (unsigned long *)&pmu->global_status);
174 }
175 } else {
176 __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
177 }
178
179 if (pmc->intr && !skip_pmi)
180 kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
181 }
182
183 static void kvm_perf_overflow(struct perf_event *perf_event,
184 struct perf_sample_data *data,
185 struct pt_regs *regs)
186 {
187 struct kvm_pmc *pmc = perf_event->overflow_handler_context;
188
189 /*
190 * Ignore asynchronous overflow events for counters that are scheduled
191 * to be reprogrammed, e.g. if a PMI for the previous event races with
192 * KVM's handling of a related guest WRMSR.
193 */
194 if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
195 return;
196
197 __kvm_perf_overflow(pmc, true);
198
199 kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
200 }
201
202 static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
203 {
204 /*
205 * For some model-specific PEBS counters with special capabilities
206 * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise
207 * level to the maximum value (currently 3, backwards compatible)
208 * so that the perf subsystem assigns a hardware counter with that
209 * capability to the vPMC.
210 */
211 if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
212 (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
213 return 3;
214
215 /*
216 * A non-zero precision level on the guest event turns an ordinary
217 * guest event into a guest PEBS event and triggers the host
218 * PEBS PMI handler to determine whether the PEBS overflow PMI
219 * comes from the host counters or the guest.
220 */
221 return 1;
222 }
223
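/*
 * The sample period is the number of events remaining until overflow, i.e.
 * the two's complement of the current counter value, truncated to the
 * counter's width (a fully "open" counter yields a full-width period).
 */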
224 static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
225 {
226 u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
227
228 if (!sample_period)
229 sample_period = pmc_bitmask(pmc) + 1;
230 return sample_period;
231 }
232
233 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
234 bool exclude_user, bool exclude_kernel,
235 bool intr)
236 {
237 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
238 struct perf_event *event;
239 struct perf_event_attr attr = {
240 .type = type,
241 .size = sizeof(attr),
242 .pinned = true,
243 .exclude_idle = true,
244 .exclude_host = 1,
245 .exclude_user = exclude_user,
246 .exclude_kernel = exclude_kernel,
247 .config = config,
248 };
249 bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
250
251 attr.sample_period = get_sample_period(pmc, pmc->counter);
252
253 if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
254 (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
255 /*
256 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
257 * period. Just clear the sample period so at least
258 * allocating the counter doesn't fail.
259 */
260 attr.sample_period = 0;
261 }
262 if (pebs) {
263 /*
264 * For most PEBS hardware events, the difference in the software
265 * precision levels of guest and host PEBS events will not affect
266 * the accuracy of the PEBS profiling result, because the "event IP"
267 * in the PEBS record is calibrated on the guest side.
268 */
269 attr.precise_ip = pmc_get_pebs_precise_level(pmc);
270 }
271
272 event = perf_event_create_kernel_counter(&attr, -1, current,
273 kvm_perf_overflow, pmc);
274 if (IS_ERR(event)) {
275 pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
276 PTR_ERR(event), pmc->idx);
277 return PTR_ERR(event);
278 }
279
280 pmc->perf_event = event;
281 pmc_to_pmu(pmc)->event_count++;
282 pmc->is_paused = false;
283 pmc->intr = intr || pebs;
284 return 0;
285 }
286
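/*
 * Pause the backing perf event and fold its count, plus any emulated events,
 * into pmc->counter. Returns true if the emulated events wrapped the counter,
 * i.e. if KVM needs to emulate an overflow.
 */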
287 static bool pmc_pause_counter(struct kvm_pmc *pmc)
288 {
289 u64 counter = pmc->counter;
290 u64 prev_counter;
291
292 /* update counter, reset event value to avoid redundant accumulation */
293 if (pmc->perf_event && !pmc->is_paused)
294 counter += perf_event_pause(pmc->perf_event, true);
295
296 /*
297 * Snapshot the previous counter *after* accumulating state from perf.
298 * If overflow already happened, hardware (via perf) is responsible for
299 * generating a PMI. KVM just needs to detect overflow on emulated
300 * counter events that haven't yet been processed.
301 */
302 prev_counter = counter & pmc_bitmask(pmc);
303
304 counter += pmc->emulated_counter;
305 pmc->counter = counter & pmc_bitmask(pmc);
306
307 pmc->emulated_counter = 0;
308 pmc->is_paused = true;
309
310 return pmc->counter < prev_counter;
311 }
312
313 static bool pmc_resume_counter(struct kvm_pmc *pmc)
314 {
315 if (!pmc->perf_event)
316 return false;
317
318 /* recalibrate sample period and check if it's accepted by perf core */
319 if (is_sampling_event(pmc->perf_event) &&
320 perf_event_period(pmc->perf_event,
321 get_sample_period(pmc, pmc->counter)))
322 return false;
323
324 if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
325 (!!pmc->perf_event->attr.precise_ip))
326 return false;
327
328 /* Reuse the perf_event to serve the same purpose as pmc_reprogram_counter(). */
329 perf_event_enable(pmc->perf_event);
330 pmc->is_paused = false;
331
332 return true;
333 }
334
335 static void pmc_release_perf_event(struct kvm_pmc *pmc)
336 {
337 if (pmc->perf_event) {
338 perf_event_release_kernel(pmc->perf_event);
339 pmc->perf_event = NULL;
340 pmc->current_config = 0;
341 pmc_to_pmu(pmc)->event_count--;
342 }
343 }
344
345 static void pmc_stop_counter(struct kvm_pmc *pmc)
346 {
347 if (pmc->perf_event) {
348 pmc->counter = pmc_read_counter(pmc);
349 pmc_release_perf_event(pmc);
350 }
351 }
352
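/* Keep the perf event's sample period in sync with the vPMC's counter value. */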
353 static void pmc_update_sample_period(struct kvm_pmc *pmc)
354 {
355 if (!pmc->perf_event || pmc->is_paused ||
356 !is_sampling_event(pmc->perf_event))
357 return;
358
359 perf_event_period(pmc->perf_event,
360 get_sample_period(pmc, pmc->counter));
361 }
362
363 void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
364 {
365 /*
366 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
367 * read-modify-write. Adjust the counter value so that its value is
368 * relative to the current count, as reading the current count from
369 * perf is faster than pausing and reprogramming the event in order to
370 * reset it to '0'. Note, this very sneakily offsets the accumulated
371 * emulated count too, by using pmc_read_counter()!
372 */
373 pmc->emulated_counter = 0;
374 pmc->counter += val - pmc_read_counter(pmc);
375 pmc->counter &= pmc_bitmask(pmc);
376 pmc_update_sample_period(pmc);
377 }
378 EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter);
379
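/*
 * Compare two filter entries on the bits selected by @mask. The mask lets the
 * same helper back both sorting (event select + EXCLUDE) and searching (event
 * select only).
 */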
380 static int filter_cmp(const void *pa, const void *pb, u64 mask)
381 {
382 u64 a = *(u64 *)pa & mask;
383 u64 b = *(u64 *)pb & mask;
384
385 return (a > b) - (a < b);
386 }
387
388
389 static int filter_sort_cmp(const void *pa, const void *pb)
390 {
391 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
392 KVM_PMU_MASKED_ENTRY_EXCLUDE));
393 }
394
395 /*
396 * For the event filter, searching is done on the 'includes' list and
397 * 'excludes' list separately rather than on the 'events' list (which
398 * has both). As a result the exclude bit can be ignored.
399 */
400 static int filter_event_cmp(const void *pa, const void *pb)
401 {
402 return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
403 }
404
405 static int find_filter_index(u64 *events, u64 nevents, u64 key)
406 {
407 u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
408 filter_event_cmp);
409
410 if (!fe)
411 return -1;
412
413 return fe - events;
414 }
415
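/*
 * A masked filter entry matches if the guest's unit mask, ANDed with the
 * entry's umask_mask, equals the entry's umask_match.
 */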
416 static bool is_filter_entry_match(u64 filter_event, u64 umask)
417 {
418 u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
419 u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
420
421 BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
422 (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
423 ARCH_PERFMON_EVENTSEL_UMASK);
424
425 return (umask & mask) == match;
426 }
427
428 static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
429 {
430 u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
431 u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
432 int i, index;
433
434 index = find_filter_index(events, nevents, event_select);
435 if (index < 0)
436 return false;
437
438 /*
439 * Entries are sorted by the event select. Walk the list in both
440 * directions to process all entries with the targeted event select.
441 */
442 for (i = index; i < nevents; i++) {
443 if (filter_event_cmp(&events[i], &event_select))
444 break;
445
446 if (is_filter_entry_match(events[i], umask))
447 return true;
448 }
449
450 for (i = index - 1; i >= 0; i--) {
451 if (filter_event_cmp(&events[i], &event_select))
452 break;
453
454 if (is_filter_entry_match(events[i], umask))
455 return true;
456 }
457
458 return false;
459 }
460
461 static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
462 u64 eventsel)
463 {
464 if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
465 !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
466 return f->action == KVM_PMU_EVENT_ALLOW;
467
468 return f->action == KVM_PMU_EVENT_DENY;
469 }
470
471 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
472 int idx)
473 {
474 int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
475
476 if (filter->action == KVM_PMU_EVENT_DENY &&
477 test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
478 return false;
479 if (filter->action == KVM_PMU_EVENT_ALLOW &&
480 !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
481 return false;
482
483 return true;
484 }
485
486 static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
487 {
488 struct kvm_x86_pmu_event_filter *filter;
489 struct kvm *kvm = pmc->vcpu->kvm;
490
491 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
492 if (!filter)
493 return true;
494
495 if (pmc_is_gp(pmc))
496 return is_gp_event_allowed(filter, pmc->eventsel);
497
498 return is_fixed_event_allowed(filter, pmc->idx);
499 }
500
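/*
 * (Re)program the backing perf event for a vPMC: pause the current event,
 * emulate any pending overflow, and then resume or recreate the perf event if
 * the counter is enabled and allowed by the PMU event filter.
 */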
501 static int reprogram_counter(struct kvm_pmc *pmc)
502 {
503 struct kvm_pmu *pmu = pmc_to_pmu(pmc);
504 u64 eventsel = pmc->eventsel;
505 u64 new_config = eventsel;
506 bool emulate_overflow;
507 u8 fixed_ctr_ctrl;
508
509 emulate_overflow = pmc_pause_counter(pmc);
510
511 if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) ||
512 !pmc_is_event_allowed(pmc))
513 return 0;
514
515 if (emulate_overflow)
516 __kvm_perf_overflow(pmc, false);
517
518 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
519 printk_once("kvm pmu: pin control bit is ignored\n");
520
521 if (pmc_is_fixed(pmc)) {
522 fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
523 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
524 if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
525 eventsel |= ARCH_PERFMON_EVENTSEL_OS;
526 if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
527 eventsel |= ARCH_PERFMON_EVENTSEL_USR;
528 if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
529 eventsel |= ARCH_PERFMON_EVENTSEL_INT;
530 new_config = (u64)fixed_ctr_ctrl;
531 }
532
533 if (pmc->current_config == new_config && pmc_resume_counter(pmc))
534 return 0;
535
536 pmc_release_perf_event(pmc);
537
538 pmc->current_config = new_config;
539
540 return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
541 (eventsel & pmu->raw_event_mask),
542 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
543 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
544 eventsel & ARCH_PERFMON_EVENTSEL_INT);
545 }
546
547 static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel)
548 {
549 /*
550 * Ignore checks for edge detect (all events currently emulated by KVM
551 * are always rising edges), pin control (unsupported by modern CPUs),
552 * and counter mask and its invert flag (KVM doesn't emulate multiple
553 * events in a single clock cycle).
554 *
555 * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit
556 * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34).
557 * Checking the "in HLE/RTM transaction" flags is correct as the vCPU
558 * can't be in a transaction if KVM is emulating an instruction.
559 *
560 * Checking the reserved bits might be wrong if they are defined in the
561 * future, but so could ignoring them, so do the simple thing for now.
562 */
563 return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB);
564 }
565
566 void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
567 {
568 bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1);
569 bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1);
570
571 /*
572 * Do NOT consult the PMU event filters, as the filters must be checked
573 * at the time of emulation to ensure KVM uses fresh information, e.g.
574 * omitting a PMC from a bitmap could result in a missed event if the
575 * filter is changed to allow counting the event.
576 */
577 if (!pmc_is_locally_enabled(pmc))
578 return;
579
580 if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED))
581 bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1);
582
583 if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED))
584 bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1);
585 }
586 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation);
587
588 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
589 {
590 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
591 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
592 struct kvm_pmc *pmc;
593 int bit;
594
595 bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
596
597 /*
598 * The reprogramming bitmap can be written asynchronously by something
599 * other than the task that holds vcpu->mutex; take care to clear only
600 * the bits that will actually be processed.
601 */
602 BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
603 atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
604
605 kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
606 /*
607 * If reprogramming fails, e.g. due to contention, re-set the
608 * reprogram bit, i.e. opportunistically try again on the
609 * next PMU refresh. Don't make a new request as doing so can
610 * stall the guest if reprogramming repeatedly fails.
611 */
612 if (reprogram_counter(pmc))
613 set_bit(pmc->idx, pmu->reprogram_pmi);
614 }
615
616 /*
617 * Release unused perf_events if the corresponding guest MSRs weren't
618 * accessed during the last vCPU time slice (need_cleanup is set when
619 * the vCPU is scheduled back in).
620 */
621 if (unlikely(pmu->need_cleanup))
622 kvm_pmu_cleanup(vcpu);
623
624 kvm_for_each_pmc(pmu, pmc, bit, bitmap)
625 kvm_pmu_recalc_pmc_emulation(pmu, pmc);
626 }
627
628 int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
629 {
630 /*
631 * On Intel, VMX interception has priority over RDPMC exceptions that
632 * aren't already handled by the emulator, i.e. there are no additional
633 * checks needed for Intel PMUs.
634 *
635 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
636 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
637 */
638 if (!kvm_pmu_ops.check_rdpmc_early)
639 return 0;
640
641 return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
642 }
643
644 bool is_vmware_backdoor_pmc(u32 pmc_idx)
645 {
646 switch (pmc_idx) {
647 case VMWARE_BACKDOOR_PMC_HOST_TSC:
648 case VMWARE_BACKDOOR_PMC_REAL_TIME:
649 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
650 return true;
651 }
652 return false;
653 }
654
655 static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
656 {
657 u64 ctr_val;
658
659 switch (idx) {
660 case VMWARE_BACKDOOR_PMC_HOST_TSC:
661 ctr_val = rdtsc();
662 break;
663 case VMWARE_BACKDOOR_PMC_REAL_TIME:
664 ctr_val = ktime_get_boottime_ns();
665 break;
666 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
667 ctr_val = ktime_get_boottime_ns() +
668 vcpu->kvm->arch.kvmclock_offset;
669 break;
670 default:
671 return 1;
672 }
673
674 *data = ctr_val;
675 return 0;
676 }
677
678 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
679 {
680 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
681 struct kvm_pmc *pmc;
682 u64 mask = ~0ull;
683
684 if (!pmu->version)
685 return 1;
686
687 if (is_vmware_backdoor_pmc(idx))
688 return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
689
690 pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
691 if (!pmc)
692 return 1;
693
694 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
695 (kvm_x86_call(get_cpl)(vcpu) != 0) &&
696 kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
697 return 1;
698
699 *data = pmc_read_counter(pmc) & mask;
700 return 0;
701 }
702
703 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
704 {
705 if (lapic_in_kernel(vcpu)) {
706 kvm_pmu_call(deliver_pmi)(vcpu);
707 kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
708 }
709 }
710
711 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
712 {
713 switch (msr) {
714 case MSR_CORE_PERF_GLOBAL_STATUS:
715 case MSR_CORE_PERF_GLOBAL_CTRL:
716 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
717 return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
718 default:
719 break;
720 }
721 return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
722 kvm_pmu_call(is_valid_msr)(vcpu, msr);
723 }
724
725 static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
726 {
727 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
728 struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
729
730 if (pmc)
731 __set_bit(pmc->idx, pmu->pmc_in_use);
732 }
733
734 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
735 {
736 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
737 u32 msr = msr_info->index;
738
739 switch (msr) {
740 case MSR_CORE_PERF_GLOBAL_STATUS:
741 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
742 msr_info->data = pmu->global_status;
743 break;
744 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
745 case MSR_CORE_PERF_GLOBAL_CTRL:
746 msr_info->data = pmu->global_ctrl;
747 break;
748 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
749 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
750 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
751 msr_info->data = 0;
752 break;
753 default:
754 return kvm_pmu_call(get_msr)(vcpu, msr_info);
755 }
756
757 return 0;
758 }
759
760 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
761 {
762 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
763 u32 msr = msr_info->index;
764 u64 data = msr_info->data;
765 u64 diff;
766
767 /*
768 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
769 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
770 */
771 switch (msr) {
772 case MSR_CORE_PERF_GLOBAL_STATUS:
773 if (!msr_info->host_initiated)
774 return 1; /* RO MSR */
775 fallthrough;
776 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
777 /* Per PPR, Read-only MSR. Writes are ignored. */
778 if (!msr_info->host_initiated)
779 break;
780
781 if (data & pmu->global_status_rsvd)
782 return 1;
783
784 pmu->global_status = data;
785 break;
786 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
787 data &= ~pmu->global_ctrl_rsvd;
788 fallthrough;
789 case MSR_CORE_PERF_GLOBAL_CTRL:
790 if (!kvm_valid_perf_global_ctrl(pmu, data))
791 return 1;
792
793 if (pmu->global_ctrl != data) {
794 diff = pmu->global_ctrl ^ data;
795 pmu->global_ctrl = data;
796 reprogram_counters(pmu, diff);
797 }
798 break;
799 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
800 /*
801 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
802 * GLOBAL_STATUS, and so the set of reserved bits is the same.
803 */
804 if (data & pmu->global_status_rsvd)
805 return 1;
806 fallthrough;
807 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
808 if (!msr_info->host_initiated)
809 pmu->global_status &= ~data;
810 break;
811 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
812 if (!msr_info->host_initiated)
813 pmu->global_status |= data & ~pmu->global_status_rsvd;
814 break;
815 default:
816 kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
817 return kvm_pmu_call(set_msr)(vcpu, msr_info);
818 }
819
820 return 0;
821 }
822
823 static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
824 {
825 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
826 struct kvm_pmc *pmc;
827 int i;
828
829 pmu->need_cleanup = false;
830
831 bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
832
833 kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
834 pmc_stop_counter(pmc);
835 pmc->counter = 0;
836 pmc->emulated_counter = 0;
837
838 if (pmc_is_gp(pmc))
839 pmc->eventsel = 0;
840 }
841
842 pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
843
844 kvm_pmu_call(reset)(vcpu);
845 }
846
847
848 /*
849 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
850 * and/or PERF_CAPABILITIES.
851 */
852 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
853 {
854 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
855
856 if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
857 return;
858
859 /*
860 * Stop/release all existing counters/events before realizing the new
861 * vPMU model.
862 */
863 kvm_pmu_reset(vcpu);
864
865 pmu->version = 0;
866 pmu->nr_arch_gp_counters = 0;
867 pmu->nr_arch_fixed_counters = 0;
868 pmu->counter_bitmask[KVM_PMC_GP] = 0;
869 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
870 pmu->reserved_bits = 0xffffffff00200000ull;
871 pmu->raw_event_mask = X86_RAW_EVENT_MASK;
872 pmu->global_ctrl_rsvd = ~0ull;
873 pmu->global_status_rsvd = ~0ull;
874 pmu->fixed_ctr_ctrl_rsvd = ~0ull;
875 pmu->pebs_enable_rsvd = ~0ull;
876 pmu->pebs_data_cfg_rsvd = ~0ull;
877 bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
878
879 if (!vcpu->kvm->arch.enable_pmu)
880 return;
881
882 kvm_pmu_call(refresh)(vcpu);
883
884 /*
885 * At RESET, both Intel and AMD CPUs set all enable bits for general
886 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
887 * was written for v1 PMUs doesn't unknowingly leave GP counters disabled
888 * in the global controls). Emulate that behavior when refreshing the
889 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
890 */
891 if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
892 pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
893
894 bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
895 bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX,
896 pmu->nr_arch_fixed_counters);
897 }
898
899 void kvm_pmu_init(struct kvm_vcpu *vcpu)
900 {
901 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
902
903 memset(pmu, 0, sizeof(*pmu));
904 kvm_pmu_call(init)(vcpu);
905 }
906
907 /* Release perf_events for vPMCs that have been unused for a full time slice. */
908 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
909 {
910 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
911 struct kvm_pmc *pmc = NULL;
912 DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
913 int i;
914
915 pmu->need_cleanup = false;
916
917 bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
918 pmu->pmc_in_use, X86_PMC_IDX_MAX);
919
920 kvm_for_each_pmc(pmu, pmc, i, bitmask) {
921 if (pmc->perf_event && !pmc_is_locally_enabled(pmc))
922 pmc_stop_counter(pmc);
923 }
924
925 kvm_pmu_call(cleanup)(vcpu);
926
927 bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
928 }
929
930 void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
931 {
932 kvm_pmu_reset(vcpu);
933 }
934
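/* Bump the emulated count and queue the vPMC for overflow/reprogram checks. */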
935 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
936 {
937 pmc->emulated_counter++;
938 kvm_pmu_request_counter_reprogram(pmc);
939 }
940
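/*
 * Check whether the vPMC is counting at the vCPU's current privilege level,
 * i.e. honor the OS/USR (GP) or KERNEL/USER (fixed) config bits.
 */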
941 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
942 {
943 bool select_os, select_user;
944 u64 config;
945
946 if (pmc_is_gp(pmc)) {
947 config = pmc->eventsel;
948 select_os = config & ARCH_PERFMON_EVENTSEL_OS;
949 select_user = config & ARCH_PERFMON_EVENTSEL_USR;
950 } else {
951 config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
952 pmc->idx - KVM_FIXED_PMC_BASE_IDX);
953 select_os = config & INTEL_FIXED_0_KERNEL;
954 select_user = config & INTEL_FIXED_0_USER;
955 }
956
957 /*
958 * Skip the CPL lookup, which isn't free on Intel, if the result will
959 * be the same regardless of the CPL.
960 */
961 if (select_os == select_user)
962 return select_os;
963
964 return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
965 select_user;
966 }
967
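/*
 * Increment every enabled, filter-allowed vPMC that is counting the emulated
 * event at the vCPU's current privilege level.
 */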
968 static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,
969 const unsigned long *event_pmcs)
970 {
971 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
972 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
973 struct kvm_pmc *pmc;
974 int i, idx;
975
976 BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
977
978 if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX))
979 return;
980
981 if (!kvm_pmu_has_perf_global_ctrl(pmu))
982 bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX);
983 else if (!bitmap_and(bitmap, event_pmcs,
984 (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
985 return;
986
987 idx = srcu_read_lock(&vcpu->kvm->srcu);
988 kvm_for_each_pmc(pmu, pmc, i, bitmap) {
989 if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc))
990 continue;
991
992 kvm_pmu_incr_counter(pmc);
993 }
994 srcu_read_unlock(&vcpu->kvm->srcu, idx);
995 }
996
997 void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu)
998 {
999 kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions);
1000 }
1001 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired);
1002
1003 void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu)
1004 {
1005 kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches);
1006 }
1007 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired);
1008
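/* Reject masked filter entries that set bits outside the supported fields. */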
1009 static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
1010 {
1011 u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
1012 KVM_PMU_MASKED_ENTRY_UMASK_MASK |
1013 KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
1014 KVM_PMU_MASKED_ENTRY_EXCLUDE;
1015 int i;
1016
1017 for (i = 0; i < filter->nevents; i++) {
1018 if (filter->events[i] & ~mask)
1019 return false;
1020 }
1021
1022 return true;
1023 }
1024
1025 static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
1026 {
1027 int i, j;
1028
1029 for (i = 0, j = 0; i < filter->nevents; i++) {
1030 /*
1031 * Skip events that are impossible to match against a guest
1032 * event. When filtering, only the event select + unit mask
1033 * of the guest event is used. To maintain backwards
1034 * compatibility, impossible filters can't be rejected :-(
1035 */
1036 if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
1037 ARCH_PERFMON_EVENTSEL_UMASK))
1038 continue;
1039 /*
1040 * Convert userspace events to a common in-kernel event so
1041 * only one code path is needed to support both events. For
1042 * the in-kernel events use masked events because they are
1043 * flexible enough to handle both cases. To convert to masked
1044 * events, all that's needed is to add an "all ones" umask_mask
1045 * (unmasked filter events don't support EXCLUDE).
1046 */
1047 filter->events[j++] = filter->events[i] |
1048 (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
1049 }
1050
1051 filter->nevents = j;
1052 }
1053
1054 static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
1055 {
1056 int i;
1057
1058 if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
1059 convert_to_masked_filter(filter);
1060 else if (!is_masked_filter_valid(filter))
1061 return -EINVAL;
1062
1063 /*
1064 * Sort entries by event select and includes vs. excludes so that all
1065 * entries for a given event select can be processed efficiently during
1066 * filtering. The EXCLUDE flag uses a more significant bit than the
1067 * event select, and so the sorted list is also effectively split into
1068 * includes and excludes sub-lists.
1069 */
1070 sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
1071 filter_sort_cmp, NULL);
1072
1073 i = filter->nevents;
1074 /* Find the first EXCLUDE event (only supported for masked events). */
1075 if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
1076 for (i = 0; i < filter->nevents; i++) {
1077 if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
1078 break;
1079 }
1080 }
1081
1082 filter->nr_includes = i;
1083 filter->nr_excludes = filter->nevents - filter->nr_includes;
1084 filter->includes = filter->events;
1085 filter->excludes = filter->events + filter->nr_includes;
1086
1087 return 0;
1088 }
1089
1090 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
1091 {
1092 struct kvm_pmu_event_filter __user *user_filter = argp;
1093 struct kvm_x86_pmu_event_filter *filter;
1094 struct kvm_pmu_event_filter tmp;
1095 struct kvm_vcpu *vcpu;
1096 unsigned long i;
1097 size_t size;
1098 int r;
1099
1100 if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
1101 return -EFAULT;
1102
1103 if (tmp.action != KVM_PMU_EVENT_ALLOW &&
1104 tmp.action != KVM_PMU_EVENT_DENY)
1105 return -EINVAL;
1106
1107 if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
1108 return -EINVAL;
1109
1110 if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
1111 return -E2BIG;
1112
1113 size = struct_size(filter, events, tmp.nevents);
1114 filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
1115 if (!filter)
1116 return -ENOMEM;
1117
1118 filter->action = tmp.action;
1119 filter->nevents = tmp.nevents;
1120 filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
1121 filter->flags = tmp.flags;
1122
1123 r = -EFAULT;
1124 if (copy_from_user(filter->events, user_filter->events,
1125 sizeof(filter->events[0]) * filter->nevents))
1126 goto cleanup;
1127
1128 r = prepare_filter_lists(filter);
1129 if (r)
1130 goto cleanup;
1131
1132 mutex_lock(&kvm->lock);
1133 filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
1134 mutex_is_locked(&kvm->lock));
1135 mutex_unlock(&kvm->lock);
1136 synchronize_srcu_expedited(&kvm->srcu);
1137
1138 BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
1139 sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
1140
1141 kvm_for_each_vcpu(i, vcpu, kvm)
1142 atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
1143
1144 kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
1145
1146 r = 0;
1147 cleanup:
1148 kfree(filter);
1149 return r;
1150 }
1151