xref: /linux/arch/riscv/kvm/vcpu_pmu.c (revision ff9f065318e17a1a97981d9e535fcfc6ce5d5614)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023 Rivos Inc
4  *
5  * Authors:
6  *     Atish Patra <atishp@rivosinc.com>
7  */
8 
9 #define pr_fmt(fmt)	"riscv-kvm-pmu: " fmt
10 #include <linux/errno.h>
11 #include <linux/err.h>
12 #include <linux/kvm_host.h>
13 #include <linux/perf/riscv_pmu.h>
14 #include <asm/csr.h>
15 #include <asm/kvm_vcpu_sbi.h>
16 #include <asm/kvm_vcpu_pmu.h>
17 #include <asm/sbi.h>
18 #include <linux/bitops.h>
19 
/* Total number of virtual counters (hardware + firmware) tracked in @kvpmu. */
#define kvm_pmu_num_counters(kvpmu) ((kvpmu)->num_hw_ctrs + (kvpmu)->num_fw_ctrs)
/* Type field of an SBI PMU event index: the masked type bits shifted down by 16. */
#define get_event_type(eidx) (((eidx) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
/* Code field of an SBI PMU event index. */
#define get_event_code(eidx) ((eidx) & SBI_PMU_EVENT_IDX_CODE_MASK)
23 
24 static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
25 	[SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
26 	[SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
27 	[SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
28 	[SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
29 	[SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
30 	[SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
31 	[SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
32 	[SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
33 	[SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
34 	[SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
35 };
36 
37 static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
38 {
39 	u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
40 	u64 sample_period;
41 
42 	if (!pmc->counter_val)
43 		sample_period = counter_val_mask;
44 	else
45 		sample_period = (-pmc->counter_val) & counter_val_mask;
46 
47 	return sample_period;
48 }
49 
50 static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
51 {
52 	enum sbi_pmu_event_type etype = get_event_type(eidx);
53 	u32 type = PERF_TYPE_MAX;
54 
55 	switch (etype) {
56 	case SBI_PMU_EVENT_TYPE_HW:
57 		type = PERF_TYPE_HARDWARE;
58 		break;
59 	case SBI_PMU_EVENT_TYPE_CACHE:
60 		type = PERF_TYPE_HW_CACHE;
61 		break;
62 	case SBI_PMU_EVENT_TYPE_RAW:
63 	case SBI_PMU_EVENT_TYPE_FW:
64 		type = PERF_TYPE_RAW;
65 		break;
66 	default:
67 		break;
68 	}
69 
70 	return type;
71 }
72 
73 static bool kvm_pmu_is_fw_event(unsigned long eidx)
74 {
75 	return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
76 }
77 
78 static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
79 {
80 	if (pmc->perf_event) {
81 		perf_event_disable(pmc->perf_event);
82 		perf_event_release_kernel(pmc->perf_event);
83 		pmc->perf_event = NULL;
84 	}
85 }
86 
87 static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
88 {
89 	return hw_event_perf_map[sbi_event_code];
90 }
91 
92 static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
93 {
94 	u64 config = U64_MAX;
95 	unsigned int cache_type, cache_op, cache_result;
96 
97 	/* All the cache event masks lie within 0xFF. No separate masking is necessary */
98 	cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
99 		      SBI_PMU_EVENT_CACHE_ID_SHIFT;
100 	cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
101 		    SBI_PMU_EVENT_CACHE_OP_SHIFT;
102 	cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
103 
104 	if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
105 	    cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
106 	    cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
107 		return config;
108 
109 	config = cache_type | (cache_op << 8) | (cache_result << 16);
110 
111 	return config;
112 }
113 
114 static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
115 {
116 	enum sbi_pmu_event_type etype = get_event_type(eidx);
117 	u32 ecode = get_event_code(eidx);
118 	u64 config = U64_MAX;
119 
120 	switch (etype) {
121 	case SBI_PMU_EVENT_TYPE_HW:
122 		if (ecode < SBI_PMU_HW_GENERAL_MAX)
123 			config = kvm_pmu_get_perf_event_hw_config(ecode);
124 		break;
125 	case SBI_PMU_EVENT_TYPE_CACHE:
126 		config = kvm_pmu_get_perf_event_cache_config(ecode);
127 		break;
128 	case SBI_PMU_EVENT_TYPE_RAW:
129 		config = evt_data & RISCV_PMU_RAW_EVENT_MASK;
130 		break;
131 	case SBI_PMU_EVENT_TYPE_FW:
132 		if (ecode < SBI_PMU_FW_MAX)
133 			config = (1ULL << 63) | ecode;
134 		break;
135 	default:
136 		break;
137 	}
138 
139 	return config;
140 }
141 
142 static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
143 {
144 	u32 etype = kvm_pmu_get_perf_event_type(eidx);
145 	u32 ecode = get_event_code(eidx);
146 
147 	if (etype != SBI_PMU_EVENT_TYPE_HW)
148 		return -EINVAL;
149 
150 	if (ecode == SBI_PMU_HW_CPU_CYCLES)
151 		return 0;
152 	else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
153 		return 2;
154 	else
155 		return -EINVAL;
156 }
157 
158 static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
159 					      unsigned long cbase, unsigned long cmask)
160 {
161 	int ctr_idx = -1;
162 	int i, pmc_idx;
163 	int min, max;
164 
165 	if (kvm_pmu_is_fw_event(eidx)) {
166 		/* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
167 		min = kvpmu->num_hw_ctrs;
168 		max = min + kvpmu->num_fw_ctrs;
169 	} else {
170 		/* First 3 counters are reserved for fixed counters */
171 		min = 3;
172 		max = kvpmu->num_hw_ctrs;
173 	}
174 
175 	for_each_set_bit(i, &cmask, BITS_PER_LONG) {
176 		pmc_idx = i + cbase;
177 		if ((pmc_idx >= min && pmc_idx < max) &&
178 		    !test_bit(pmc_idx, kvpmu->pmc_in_use)) {
179 			ctr_idx = pmc_idx;
180 			break;
181 		}
182 	}
183 
184 	return ctr_idx;
185 }
186 
/*
 * Select a counter index for event @eidx.
 *
 * Fixed counters need a fixed mapping because they have a different width,
 * so they are tried first. Everything else falls back to the programmable
 * counter search over @cbase/@cmask.
 *
 * Return: the counter index, or a negative value when none is available.
 */
static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
			     unsigned long cbase, unsigned long cmask)
{
	int fixed_idx = kvm_pmu_get_fixed_pmc_index(eidx);

	if (fixed_idx < 0)
		return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);

	return fixed_idx;
}
199 
200 static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
201 			      unsigned long *out_val)
202 {
203 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
204 	struct kvm_pmc *pmc;
205 	int fevent_code;
206 
207 	if (!IS_ENABLED(CONFIG_32BIT)) {
208 		pr_warn("%s: should be invoked for only RV32\n", __func__);
209 		return -EINVAL;
210 	}
211 
212 	if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
213 		pr_warn("Invalid counter id [%ld]during read\n", cidx);
214 		return -EINVAL;
215 	}
216 
217 	pmc = &kvpmu->pmc[cidx];
218 
219 	if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW)
220 		return -EINVAL;
221 
222 	fevent_code = get_event_code(pmc->event_idx);
223 	pmc->counter_val = kvpmu->fw_event[fevent_code].value;
224 
225 	*out_val = pmc->counter_val >> 32;
226 
227 	return 0;
228 }
229 
230 static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
231 			unsigned long *out_val)
232 {
233 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
234 	struct kvm_pmc *pmc;
235 	u64 enabled, running;
236 	int fevent_code;
237 
238 	if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
239 		pr_warn("Invalid counter id [%ld] during read\n", cidx);
240 		return -EINVAL;
241 	}
242 
243 	pmc = &kvpmu->pmc[cidx];
244 
245 	if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
246 		fevent_code = get_event_code(pmc->event_idx);
247 		pmc->counter_val = kvpmu->fw_event[fevent_code].value;
248 	} else if (pmc->perf_event) {
249 		pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
250 	} else {
251 		return -EINVAL;
252 	}
253 	*out_val = pmc->counter_val;
254 
255 	return 0;
256 }
257 
258 static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
259 					 unsigned long ctr_mask)
260 {
261 	/* Make sure the we have a valid counter mask requested from the caller */
262 	if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)))
263 		return -EINVAL;
264 
265 	return 0;
266 }
267 
268 static void kvm_riscv_pmu_overflow(struct perf_event *perf_event,
269 				   struct perf_sample_data *data,
270 				   struct pt_regs *regs)
271 {
272 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
273 	struct kvm_vcpu *vcpu = pmc->vcpu;
274 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
275 	struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu);
276 	u64 period;
277 
278 	/*
279 	 * Stop the event counting by directly accessing the perf_event.
280 	 * Otherwise, this needs to deferred via a workqueue.
281 	 * That will introduce skew in the counter value because the actual
282 	 * physical counter would start after returning from this function.
283 	 * It will be stopped again once the workqueue is scheduled
284 	 */
285 	rpmu->pmu.stop(perf_event, PERF_EF_UPDATE);
286 
287 	/*
288 	 * The hw counter would start automatically when this function returns.
289 	 * Thus, the host may continue to interrupt and inject it to the guest
290 	 * even without the guest configuring the next event. Depending on the hardware
291 	 * the host may have some sluggishness only if privilege mode filtering is not
292 	 * available. In an ideal world, where qemu is not the only capable hardware,
293 	 * this can be removed.
294 	 * FYI: ARM64 does this way while x86 doesn't do anything as such.
295 	 * TODO: Should we keep it for RISC-V ?
296 	 */
297 	period = -(local64_read(&perf_event->count));
298 
299 	local64_set(&perf_event->hw.period_left, 0);
300 	perf_event->attr.sample_period = period;
301 	perf_event->hw.sample_period = period;
302 
303 	set_bit(pmc->idx, kvpmu->pmc_overflown);
304 	kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF);
305 
306 	rpmu->pmu.start(perf_event, PERF_EF_RELOAD);
307 }
308 
309 static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
310 				      unsigned long flags, unsigned long eidx,
311 				      unsigned long evtdata)
312 {
313 	struct perf_event *event;
314 
315 	kvm_pmu_release_perf_event(pmc);
316 	attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);
317 	if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
318 		//TODO: Do we really want to clear the value in hardware counter
319 		pmc->counter_val = 0;
320 	}
321 
322 	/*
323 	 * Set the default sample_period for now. The guest specified value
324 	 * will be updated in the start call.
325 	 */
326 	attr->sample_period = kvm_pmu_get_sample_period(pmc);
327 
328 	event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc);
329 	if (IS_ERR(event)) {
330 		pr_debug("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
331 		return PTR_ERR(event);
332 	}
333 
334 	pmc->perf_event = event;
335 	if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
336 		perf_event_enable(pmc->perf_event);
337 
338 	return 0;
339 }
340 
341 int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
342 {
343 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
344 	struct kvm_fw_event *fevent;
345 
346 	if (!kvpmu || fid >= SBI_PMU_FW_MAX)
347 		return -EINVAL;
348 
349 	fevent = &kvpmu->fw_event[fid];
350 	if (fevent->started)
351 		fevent->value++;
352 
353 	return 0;
354 }
355 
356 int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
357 				unsigned long *val, unsigned long new_val,
358 				unsigned long wr_mask)
359 {
360 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
361 	int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC;
362 
363 	if (!kvpmu || !kvpmu->init_done) {
364 		/*
365 		 * In absence of sscofpmf in the platform, the guest OS may use
366 		 * the legacy PMU driver to read cycle/instret. In that case,
367 		 * just return 0 to avoid any illegal trap. However, any other
368 		 * hpmcounter access should result in illegal trap as they must
369 		 * be access through SBI PMU only.
370 		 */
371 		if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
372 			*val = 0;
373 			return ret;
374 		} else {
375 			return KVM_INSN_ILLEGAL_TRAP;
376 		}
377 	}
378 
379 	/* The counter CSR are read only. Thus, any write should result in illegal traps */
380 	if (wr_mask)
381 		return KVM_INSN_ILLEGAL_TRAP;
382 
383 	cidx = csr_num - CSR_CYCLE;
384 
385 	if (pmu_ctr_read(vcpu, cidx, val) < 0)
386 		return KVM_INSN_ILLEGAL_TRAP;
387 
388 	return ret;
389 }
390 
391 static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu)
392 {
393 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
394 	int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
395 
396 	if (kvpmu->sdata) {
397 		if (kvpmu->snapshot_addr != INVALID_GPA) {
398 			memset(kvpmu->sdata, 0, snapshot_area_size);
399 			kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr,
400 					     kvpmu->sdata, snapshot_area_size);
401 		} else {
402 			pr_warn("snapshot address invalid\n");
403 		}
404 		kfree(kvpmu->sdata);
405 		kvpmu->sdata = NULL;
406 	}
407 	kvpmu->snapshot_addr = INVALID_GPA;
408 }
409 
410 int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low,
411 				      unsigned long saddr_high, unsigned long flags,
412 				      struct kvm_vcpu_sbi_return *retdata)
413 {
414 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
415 	int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
416 	int sbiret = 0;
417 	gpa_t saddr;
418 	unsigned long hva;
419 	bool writable;
420 
421 	if (!kvpmu || flags) {
422 		sbiret = SBI_ERR_INVALID_PARAM;
423 		goto out;
424 	}
425 
426 	if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) {
427 		kvm_pmu_clear_snapshot_area(vcpu);
428 		return 0;
429 	}
430 
431 	saddr = saddr_low;
432 
433 	if (saddr_high != 0) {
434 		if (IS_ENABLED(CONFIG_32BIT))
435 			saddr |= ((gpa_t)saddr_high << 32);
436 		else
437 			sbiret = SBI_ERR_INVALID_ADDRESS;
438 		goto out;
439 	}
440 
441 	hva = kvm_vcpu_gfn_to_hva_prot(vcpu, saddr >> PAGE_SHIFT, &writable);
442 	if (kvm_is_error_hva(hva) || !writable) {
443 		sbiret = SBI_ERR_INVALID_ADDRESS;
444 		goto out;
445 	}
446 
447 	kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC);
448 	if (!kvpmu->sdata)
449 		return -ENOMEM;
450 
451 	if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) {
452 		kfree(kvpmu->sdata);
453 		sbiret = SBI_ERR_FAILURE;
454 		goto out;
455 	}
456 
457 	kvpmu->snapshot_addr = saddr;
458 
459 out:
460 	retdata->err_val = sbiret;
461 
462 	return 0;
463 }
464 
465 int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
466 				struct kvm_vcpu_sbi_return *retdata)
467 {
468 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
469 
470 	retdata->out_val = kvm_pmu_num_counters(kvpmu);
471 
472 	return 0;
473 }
474 
475 int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
476 				struct kvm_vcpu_sbi_return *retdata)
477 {
478 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
479 
480 	if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) {
481 		retdata->err_val = SBI_ERR_INVALID_PARAM;
482 		return 0;
483 	}
484 
485 	retdata->out_val = kvpmu->pmc[cidx].cinfo.value;
486 
487 	return 0;
488 }
489 
490 int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
491 				 unsigned long ctr_mask, unsigned long flags, u64 ival,
492 				 struct kvm_vcpu_sbi_return *retdata)
493 {
494 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
495 	int i, pmc_index, sbiret = 0;
496 	struct kvm_pmc *pmc;
497 	int fevent_code;
498 	bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT;
499 
500 	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
501 		sbiret = SBI_ERR_INVALID_PARAM;
502 		goto out;
503 	}
504 
505 	if (snap_flag_set) {
506 		if (kvpmu->snapshot_addr == INVALID_GPA) {
507 			sbiret = SBI_ERR_NO_SHMEM;
508 			goto out;
509 		}
510 		if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
511 					sizeof(struct riscv_pmu_snapshot_data))) {
512 			pr_warn("Unable to read snapshot shared memory while starting counters\n");
513 			sbiret = SBI_ERR_FAILURE;
514 			goto out;
515 		}
516 	}
517 	/* Start the counters that have been configured and requested by the guest */
518 	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
519 		pmc_index = i + ctr_base;
520 		if (!test_bit(pmc_index, kvpmu->pmc_in_use))
521 			continue;
522 		/* The guest started the counter again. Reset the overflow status */
523 		clear_bit(pmc_index, kvpmu->pmc_overflown);
524 		pmc = &kvpmu->pmc[pmc_index];
525 		if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) {
526 			pmc->counter_val = ival;
527 		} else if (snap_flag_set) {
528 			/* The counter index in the snapshot are relative to the counter base */
529 			pmc->counter_val = kvpmu->sdata->ctr_values[i];
530 		}
531 
532 		if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
533 			fevent_code = get_event_code(pmc->event_idx);
534 			if (fevent_code >= SBI_PMU_FW_MAX) {
535 				sbiret = SBI_ERR_INVALID_PARAM;
536 				goto out;
537 			}
538 
539 			/* Check if the counter was already started for some reason */
540 			if (kvpmu->fw_event[fevent_code].started) {
541 				sbiret = SBI_ERR_ALREADY_STARTED;
542 				continue;
543 			}
544 
545 			kvpmu->fw_event[fevent_code].started = true;
546 			kvpmu->fw_event[fevent_code].value = pmc->counter_val;
547 		} else if (pmc->perf_event) {
548 			if (unlikely(pmc->started)) {
549 				sbiret = SBI_ERR_ALREADY_STARTED;
550 				continue;
551 			}
552 			perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
553 			perf_event_enable(pmc->perf_event);
554 			pmc->started = true;
555 		} else {
556 			sbiret = SBI_ERR_INVALID_PARAM;
557 		}
558 	}
559 
560 out:
561 	retdata->err_val = sbiret;
562 
563 	return 0;
564 }
565 
566 int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
567 				unsigned long ctr_mask, unsigned long flags,
568 				struct kvm_vcpu_sbi_return *retdata)
569 {
570 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
571 	int i, pmc_index, sbiret = 0;
572 	u64 enabled, running;
573 	struct kvm_pmc *pmc;
574 	int fevent_code;
575 	bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
576 	bool shmem_needs_update = false;
577 
578 	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
579 		sbiret = SBI_ERR_INVALID_PARAM;
580 		goto out;
581 	}
582 
583 	if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) {
584 		sbiret = SBI_ERR_NO_SHMEM;
585 		goto out;
586 	}
587 
588 	/* Stop the counters that have been configured and requested by the guest */
589 	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
590 		pmc_index = i + ctr_base;
591 		if (!test_bit(pmc_index, kvpmu->pmc_in_use))
592 			continue;
593 		pmc = &kvpmu->pmc[pmc_index];
594 		if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
595 			fevent_code = get_event_code(pmc->event_idx);
596 			if (fevent_code >= SBI_PMU_FW_MAX) {
597 				sbiret = SBI_ERR_INVALID_PARAM;
598 				goto out;
599 			}
600 
601 			if (!kvpmu->fw_event[fevent_code].started)
602 				sbiret = SBI_ERR_ALREADY_STOPPED;
603 
604 			kvpmu->fw_event[fevent_code].started = false;
605 		} else if (pmc->perf_event) {
606 			if (pmc->started) {
607 				/* Stop counting the counter */
608 				perf_event_disable(pmc->perf_event);
609 				pmc->started = false;
610 			} else {
611 				sbiret = SBI_ERR_ALREADY_STOPPED;
612 			}
613 
614 			if (flags & SBI_PMU_STOP_FLAG_RESET)
615 				/* Release the counter if this is a reset request */
616 				kvm_pmu_release_perf_event(pmc);
617 		} else {
618 			sbiret = SBI_ERR_INVALID_PARAM;
619 		}
620 
621 		if (snap_flag_set && !sbiret) {
622 			if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW)
623 				pmc->counter_val = kvpmu->fw_event[fevent_code].value;
624 			else if (pmc->perf_event)
625 				pmc->counter_val += perf_event_read_value(pmc->perf_event,
626 									  &enabled, &running);
627 			/*
628 			 * The counter and overflow indicies in the snapshot region are w.r.to
629 			 * cbase. Modify the set bit in the counter mask instead of the pmc_index
630 			 * which indicates the absolute counter index.
631 			 */
632 			if (test_bit(pmc_index, kvpmu->pmc_overflown))
633 				kvpmu->sdata->ctr_overflow_mask |= BIT(i);
634 			kvpmu->sdata->ctr_values[i] = pmc->counter_val;
635 			shmem_needs_update = true;
636 		}
637 
638 		if (flags & SBI_PMU_STOP_FLAG_RESET) {
639 			pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
640 			clear_bit(pmc_index, kvpmu->pmc_in_use);
641 			clear_bit(pmc_index, kvpmu->pmc_overflown);
642 			if (snap_flag_set) {
643 				/*
644 				 * Only clear the given counter as the caller is responsible to
645 				 * validate both the overflow mask and configured counters.
646 				 */
647 				kvpmu->sdata->ctr_overflow_mask &= ~BIT(i);
648 				shmem_needs_update = true;
649 			}
650 		}
651 	}
652 
653 	if (shmem_needs_update)
654 		kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
655 					     sizeof(struct riscv_pmu_snapshot_data));
656 
657 out:
658 	retdata->err_val = sbiret;
659 
660 	return 0;
661 }
662 
663 int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
664 				     unsigned long ctr_mask, unsigned long flags,
665 				     unsigned long eidx, u64 evtdata,
666 				     struct kvm_vcpu_sbi_return *retdata)
667 {
668 	int ctr_idx, sbiret = 0;
669 	long ret;
670 	bool is_fevent;
671 	unsigned long event_code;
672 	u32 etype = kvm_pmu_get_perf_event_type(eidx);
673 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
674 	struct kvm_pmc *pmc = NULL;
675 	struct perf_event_attr attr = {
676 		.type = etype,
677 		.size = sizeof(struct perf_event_attr),
678 		.pinned = true,
679 		/*
680 		 * It should never reach here if the platform doesn't support the sscofpmf
681 		 * extension as mode filtering won't work without it.
682 		 */
683 		.exclude_host = true,
684 		.exclude_hv = true,
685 		.exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
686 		.exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
687 		.config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
688 	};
689 
690 	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
691 		sbiret = SBI_ERR_INVALID_PARAM;
692 		goto out;
693 	}
694 
695 	event_code = get_event_code(eidx);
696 	is_fevent = kvm_pmu_is_fw_event(eidx);
697 	if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
698 		sbiret = SBI_ERR_NOT_SUPPORTED;
699 		goto out;
700 	}
701 
702 	/*
703 	 * SKIP_MATCH flag indicates the caller is aware of the assigned counter
704 	 * for this event. Just do a sanity check if it already marked used.
705 	 */
706 	if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
707 		if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
708 			sbiret = SBI_ERR_FAILURE;
709 			goto out;
710 		}
711 		ctr_idx = ctr_base + __ffs(ctr_mask);
712 	} else  {
713 		ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
714 		if (ctr_idx < 0) {
715 			sbiret = SBI_ERR_NOT_SUPPORTED;
716 			goto out;
717 		}
718 	}
719 
720 	pmc = &kvpmu->pmc[ctr_idx];
721 	pmc->idx = ctr_idx;
722 
723 	if (is_fevent) {
724 		if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
725 			kvpmu->fw_event[event_code].started = true;
726 	} else {
727 		ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
728 		if (ret) {
729 			sbiret = SBI_ERR_NOT_SUPPORTED;
730 			goto out;
731 		}
732 	}
733 
734 	set_bit(ctr_idx, kvpmu->pmc_in_use);
735 	pmc->event_idx = eidx;
736 	retdata->out_val = ctr_idx;
737 out:
738 	retdata->err_val = sbiret;
739 
740 	return 0;
741 }
742 
743 int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
744 				      struct kvm_vcpu_sbi_return *retdata)
745 {
746 	int ret;
747 
748 	ret = pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val);
749 	if (ret == -EINVAL)
750 		retdata->err_val = SBI_ERR_INVALID_PARAM;
751 
752 	return 0;
753 }
754 
755 int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
756 				struct kvm_vcpu_sbi_return *retdata)
757 {
758 	int ret;
759 
760 	ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val);
761 	if (ret == -EINVAL)
762 		retdata->err_val = SBI_ERR_INVALID_PARAM;
763 
764 	return 0;
765 }
766 
767 void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
768 {
769 	int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0;
770 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
771 	struct kvm_pmc *pmc;
772 
773 	/*
774 	 * PMU functionality should be only available to guests if privilege mode
775 	 * filtering is available in the host. Otherwise, guest will always count
776 	 * events while the execution is in hypervisor mode.
777 	 */
778 	if (!riscv_isa_extension_available(NULL, SSCOFPMF))
779 		return;
780 
781 	ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs);
782 	if (ret < 0 || !hpm_width || !num_hw_ctrs)
783 		return;
784 
785 	/*
786 	 * Increase the number of hardware counters to offset the time counter.
787 	 */
788 	kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
789 	kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
790 	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
791 	kvpmu->snapshot_addr = INVALID_GPA;
792 
793 	if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
794 		pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
795 		kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
796 	}
797 
798 	/*
799 	 * There is no correlation between the logical hardware counter and virtual counters.
800 	 * However, we need to encode a hpmcounter CSR in the counter info field so that
801 	 * KVM can trap n emulate the read. This works well in the migration use case as
802 	 * KVM doesn't care if the actual hpmcounter is available in the hardware or not.
803 	 */
804 	for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) {
805 		/* TIME CSR shouldn't be read from perf interface */
806 		if (i == 1)
807 			continue;
808 		pmc = &kvpmu->pmc[i];
809 		pmc->idx = i;
810 		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
811 		pmc->vcpu = vcpu;
812 		if (i < kvpmu->num_hw_ctrs) {
813 			pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
814 			if (i < 3)
815 				/* CY, IR counters */
816 				pmc->cinfo.width = 63;
817 			else
818 				pmc->cinfo.width = hpm_width;
819 			/*
820 			 * The CSR number doesn't have any relation with the logical
821 			 * hardware counters. The CSR numbers are encoded sequentially
822 			 * to avoid maintaining a map between the virtual counter
823 			 * and CSR number.
824 			 */
825 			pmc->cinfo.csr = CSR_CYCLE + i;
826 		} else {
827 			pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
828 			pmc->cinfo.width = 63;
829 		}
830 	}
831 
832 	kvpmu->init_done = true;
833 }
834 
835 void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
836 {
837 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
838 	struct kvm_pmc *pmc;
839 	int i;
840 
841 	if (!kvpmu)
842 		return;
843 
844 	for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) {
845 		pmc = &kvpmu->pmc[i];
846 		pmc->counter_val = 0;
847 		kvm_pmu_release_perf_event(pmc);
848 		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
849 	}
850 	bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS);
851 	bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS);
852 	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
853 	kvm_pmu_clear_snapshot_area(vcpu);
854 }
855 
/* Reset the vPMU of @vcpu; this is the same teardown as deinit. */
void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
{
	kvm_riscv_vcpu_pmu_deinit(vcpu);
}
860