// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023 Rivos Inc
 *
 * Authors:
 *     Atish Patra <atishp@rivosinc.com>
 */

#define pr_fmt(fmt)	"riscv-kvm-pmu: " fmt
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/perf/riscv_pmu.h>
#include <asm/csr.h>
#include <asm/kvm_vcpu_sbi.h>
#include <asm/kvm_vcpu_pmu.h>
#include <asm/sbi.h>
#include <linux/bitops.h>

#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK)

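/*
 * Mapping of SBI PMU generic hardware event IDs to the corresponding
 * perf hardware event IDs.
 */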
static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
	[SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
	[SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
	[SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
	[SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
	[SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
	[SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
	[SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
	[SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
	[SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
	[SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
};

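/*
 * Perf counts upwards from the programmed sample period towards overflow,
 * so program the period to the distance between the guest counter value
 * and the point at which the (width + 1)-bit counter wraps around.
 */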
static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
{
	u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
	u64 sample_period;

	if (!pmc->counter_val)
		sample_period = counter_val_mask;
	else
		sample_period = (-pmc->counter_val) & counter_val_mask;

	return sample_period;
}

static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
{
	enum sbi_pmu_event_type etype = get_event_type(eidx);
	u32 type = PERF_TYPE_MAX;

	switch (etype) {
	case SBI_PMU_EVENT_TYPE_HW:
		type = PERF_TYPE_HARDWARE;
		break;
	case SBI_PMU_EVENT_TYPE_CACHE:
		type = PERF_TYPE_HW_CACHE;
		break;
	case SBI_PMU_EVENT_TYPE_RAW:
	case SBI_PMU_EVENT_TYPE_RAW_V2:
	case SBI_PMU_EVENT_TYPE_FW:
		type = PERF_TYPE_RAW;
		break;
	default:
		break;
	}

	return type;
}

static bool kvm_pmu_is_fw_event(unsigned long eidx)
{
	return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
}

static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_disable(pmc->perf_event);
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
	}
}

static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
{
	return hw_event_perf_map[sbi_event_code];
}

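/*
 * Translate an SBI cache event code into a perf cache event config.
 * Perf encodes cache events as (type) | (op << 8) | (result << 16); the
 * SBI event code packs the same three fields more tightly, so they are
 * unpacked and re-encoded here.
 */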
static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
{
	u64 config = U64_MAX;
	unsigned int cache_type, cache_op, cache_result;

	/* All the cache event masks lie within 0xFF. No separate masking is necessary */
	cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
		      SBI_PMU_EVENT_CACHE_ID_SHIFT;
	cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
		    SBI_PMU_EVENT_CACHE_OP_SHIFT;
	cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;

	if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
	    cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return config;

	config = cache_type | (cache_op << 8) | (cache_result << 16);

	return config;
}

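/*
 * Translate an SBI event index into a perf event config. Firmware events
 * are encoded as raw events with bit 63 set, which keeps them
 * distinguishable from platform raw events.
 */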
static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
{
	enum sbi_pmu_event_type etype = get_event_type(eidx);
	u32 ecode = get_event_code(eidx);
	u64 config = U64_MAX;

	switch (etype) {
	case SBI_PMU_EVENT_TYPE_HW:
		if (ecode < SBI_PMU_HW_GENERAL_MAX)
			config = kvm_pmu_get_perf_event_hw_config(ecode);
		break;
	case SBI_PMU_EVENT_TYPE_CACHE:
		config = kvm_pmu_get_perf_event_cache_config(ecode);
		break;
	case SBI_PMU_EVENT_TYPE_RAW:
		config = evt_data & RISCV_PMU_RAW_EVENT_MASK;
		break;
	case SBI_PMU_EVENT_TYPE_RAW_V2:
		config = evt_data & RISCV_PMU_RAW_EVENT_V2_MASK;
		break;
	case SBI_PMU_EVENT_TYPE_FW:
		if (ecode < SBI_PMU_FW_MAX)
			config = (1ULL << 63) | ecode;
		break;
	default:
		break;
	}

	return config;
}

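/*
 * The cycle and instret counters have fixed mappings mandated by the ISA:
 * counter 0 is the cycle CSR and counter 2 is the instret CSR, with
 * counter 1 reserved for time.
 */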
static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
{
	u32 etype = get_event_type(eidx);
	u32 ecode = get_event_code(eidx);

	if (etype != SBI_PMU_EVENT_TYPE_HW)
		return -EINVAL;

	if (ecode == SBI_PMU_HW_CPU_CYCLES)
		return 0;
	else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
		return 2;
	else
		return -EINVAL;
}

static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
					      unsigned long cbase, unsigned long cmask)
{
	int ctr_idx = -1;
	int i, pmc_idx;
	int min, max;

	if (kvm_pmu_is_fw_event(eidx)) {
		/* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
		min = kvpmu->num_hw_ctrs;
		max = min + kvpmu->num_fw_ctrs;
	} else {
		/* The first three counters are reserved for the fixed counters */
		min = 3;
		max = kvpmu->num_hw_ctrs;
	}

	for_each_set_bit(i, &cmask, BITS_PER_LONG) {
		pmc_idx = i + cbase;
		if ((pmc_idx >= min && pmc_idx < max) &&
		    !test_bit(pmc_idx, kvpmu->pmc_in_use)) {
			ctr_idx = pmc_idx;
			break;
		}
	}

	return ctr_idx;
}

static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
			     unsigned long cbase, unsigned long cmask)
{
	int ret;

	/* Fixed counters need to have a fixed mapping as they have a different width */
	ret = kvm_pmu_get_fixed_pmc_index(eidx);
	if (ret >= 0)
		return ret;

	return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
}

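/*
 * Read the upper 32 bits of a firmware counter. This backs the SBI PMU
 * FW_READ_HI call, which is only meaningful on RV32 where a counter value
 * doesn't fit into a single register.
 */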
static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
			      unsigned long *out_val)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int fevent_code;

	if (!IS_ENABLED(CONFIG_32BIT)) {
		pr_warn("%s: should only be invoked for RV32\n", __func__);
		return -EINVAL;
	}

	if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
		pr_warn("Invalid counter id [%ld] during read\n", cidx);
		return -EINVAL;
	}

	pmc = &kvpmu->pmc[cidx];

	if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW)
		return -EINVAL;

	fevent_code = get_event_code(pmc->event_idx);
	pmc->counter_val = kvpmu->fw_event[fevent_code].value;

	*out_val = pmc->counter_val >> 32;

	return 0;
}

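/*
 * Read the current counter value: firmware counters are served from the
 * per-VCPU fw_event array, hardware counters from the backing perf event.
 */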
static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
			unsigned long *out_val)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 enabled, running;
	int fevent_code;

	if (cidx >= kvm_pmu_num_counters(kvpmu) || cidx == 1) {
		pr_warn("Invalid counter id [%ld] during read\n", cidx);
		return -EINVAL;
	}

	pmc = &kvpmu->pmc[cidx];

	if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
		fevent_code = get_event_code(pmc->event_idx);
		pmc->counter_val = kvpmu->fw_event[fevent_code].value;
	} else if (pmc->perf_event) {
		pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
	} else {
		return -EINVAL;
	}
	*out_val = pmc->counter_val;

	return 0;
}

static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
					 unsigned long ctr_mask)
{
	/* Make sure we have a valid counter mask requested from the caller */
	if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)))
		return -EINVAL;

	return 0;
}

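/*
 * Overflow callback invoked by the perf framework when the backing perf
 * event overflows. It records the overflow for the guest and injects the
 * counter overflow interrupt defined by the Sscofpmf extension.
 */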
static void kvm_riscv_pmu_overflow(struct perf_event *perf_event,
				   struct perf_sample_data *data,
				   struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
	struct kvm_vcpu *vcpu = pmc->vcpu;
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct riscv_pmu *rpmu = to_riscv_pmu(perf_event->pmu);
	u64 period;

	/*
	 * Stop the event counting by directly accessing the perf_event.
	 * Otherwise, this would need to be deferred via a workqueue, which
	 * would introduce skew in the counter value because the actual
	 * physical counter would restart after returning from this function
	 * and only be stopped again once the workqueue was scheduled.
	 */
	rpmu->pmu.stop(perf_event, PERF_EF_UPDATE);

	/*
	 * The hardware counter would start automatically when this function
	 * returns. Thus, the host may continue to interrupt and inject it into
	 * the guest even without the guest configuring the next event. Depending
	 * on the hardware, the host may have some sluggishness only if privilege
	 * mode filtering is not available. In an ideal world, where QEMU is not
	 * the only capable hardware, this can be removed.
	 * FYI: arm64 does it this way while x86 doesn't do anything like this.
	 * TODO: Should we keep it for RISC-V?
	 */
	period = -(local64_read(&perf_event->count));

	local64_set(&perf_event->hw.period_left, 0);
	perf_event->attr.sample_period = period;
	perf_event->hw.sample_period = period;

	set_bit(pmc->idx, kvpmu->pmc_overflown);
	kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_PMU_OVF);

	rpmu->pmu.start(perf_event, PERF_EF_RELOAD);
}

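/*
 * Create (or re-create) the host perf event backing a guest counter.
 * Any perf event previously attached to the counter is released first.
 */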
static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
				      unsigned long flags, unsigned long eidx,
				      unsigned long evtdata)
{
	struct perf_event *event;

	kvm_pmu_release_perf_event(pmc);
	attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);
	if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
		/* TODO: Do we really want to clear the value in the hardware counter? */
		pmc->counter_val = 0;
	}

	/*
	 * Set the default sample_period for now. The guest-specified value
	 * will be updated in the start call.
	 */
	attr->sample_period = kvm_pmu_get_sample_period(pmc);

	event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
		perf_event_enable(pmc->perf_event);

	return 0;
}

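/*
 * Increment a firmware event counter. Called from the SBI call handling
 * paths whenever the VCPU performs an action tracked by a firmware event.
 */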
int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_fw_event *fevent;

	if (!kvpmu || fid >= SBI_PMU_FW_MAX)
		return -EINVAL;

	fevent = &kvpmu->fw_event[fid];
	if (fevent->started)
		fevent->value++;

	return 0;
}

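/*
 * Emulate a guest read of a counter CSR (cycle, instret or hpmcounter).
 * Invoked from the trap handling path when the guest accesses one of
 * these CSRs directly.
 */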
int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
				unsigned long *val, unsigned long new_val,
				unsigned long wr_mask)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC;

	if (!kvpmu || !kvpmu->init_done) {
		/*
		 * In the absence of sscofpmf on the platform, the guest OS may use
		 * the legacy PMU driver to read cycle/instret. In that case,
		 * just return 0 to avoid any illegal trap. However, any other
		 * hpmcounter access should result in an illegal trap as those
		 * must be accessed through the SBI PMU interface only.
		 */
		if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
			*val = 0;
			return ret;
		} else {
			return KVM_INSN_ILLEGAL_TRAP;
		}
	}

	/* The counter CSRs are read-only. Thus, any write should result in an illegal trap */
	if (wr_mask)
		return KVM_INSN_ILLEGAL_TRAP;

	cidx = csr_num - CSR_CYCLE;

	if (pmu_ctr_read(vcpu, cidx, val) < 0)
		return KVM_INSN_ILLEGAL_TRAP;

	return ret;
}

static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);

	kfree(kvpmu->sdata);
	kvpmu->sdata = NULL;
	kvpmu->snapshot_addr = INVALID_GPA;
}

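/*
 * Set up the SBI PMU snapshot shared memory area, through which counter
 * values and overflow status are published to the guest, or tear it down
 * when both address halves are SBI_SHMEM_DISABLE.
 */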
int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long saddr_low,
					  unsigned long saddr_high, unsigned long flags,
					  struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
	int sbiret = 0;
	gpa_t saddr;

	if (!kvpmu || flags) {
		sbiret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	if (saddr_low == SBI_SHMEM_DISABLE && saddr_high == SBI_SHMEM_DISABLE) {
		kvm_pmu_clear_snapshot_area(vcpu);
		return 0;
	}

	saddr = saddr_low;

	if (saddr_high != 0) {
		if (IS_ENABLED(CONFIG_32BIT)) {
			saddr |= ((gpa_t)saddr_high << 32);
		} else {
			sbiret = SBI_ERR_INVALID_ADDRESS;
			goto out;
		}
	}

	kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC);
	if (!kvpmu->sdata)
		return -ENOMEM;

	/* No need to check for a writable slot explicitly as kvm_vcpu_write_guest does it internally */
	if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) {
		kfree(kvpmu->sdata);
		kvpmu->sdata = NULL;
		sbiret = SBI_ERR_INVALID_ADDRESS;
		goto out;
	}

	kvpmu->snapshot_addr = saddr;

out:
	retdata->err_val = sbiret;

	return 0;
}

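/*
 * Handle the SBI PMU event info query: the guest supplies a shared memory
 * array of riscv_pmu_event_info entries, and each entry's output field is
 * set according to whether the event can be monitored on this platform.
 */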
int kvm_riscv_vcpu_pmu_event_info(struct kvm_vcpu *vcpu, unsigned long saddr_low,
				  unsigned long saddr_high, unsigned long num_events,
				  unsigned long flags, struct kvm_vcpu_sbi_return *retdata)
{
	struct riscv_pmu_event_info *einfo = NULL;
	int shmem_size = num_events * sizeof(*einfo);
	gpa_t shmem;
	u32 eidx, etype;
	u64 econfig;
	int ret;

	if (flags != 0 || (saddr_low & (SZ_16 - 1)) || num_events == 0) {
		ret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	shmem = saddr_low;
	if (saddr_high != 0) {
		if (IS_ENABLED(CONFIG_32BIT)) {
			shmem |= ((gpa_t)saddr_high << 32);
		} else {
			ret = SBI_ERR_INVALID_ADDRESS;
			goto out;
		}
	}

	einfo = kzalloc(shmem_size, GFP_KERNEL);
	if (!einfo)
		return -ENOMEM;

	ret = kvm_vcpu_read_guest(vcpu, shmem, einfo, shmem_size);
	if (ret) {
		ret = SBI_ERR_FAILURE;
		goto free_mem;
	}

	for (int i = 0; i < num_events; i++) {
		eidx = einfo[i].event_idx;
		etype = kvm_pmu_get_perf_event_type(eidx);
		econfig = kvm_pmu_get_perf_event_config(eidx, einfo[i].event_data);
		ret = riscv_pmu_get_event_info(etype, econfig, NULL);
		einfo[i].output = (ret > 0) ? 1 : 0;
	}

	ret = kvm_vcpu_write_guest(vcpu, shmem, einfo, shmem_size);
	if (ret) {
		ret = SBI_ERR_INVALID_ADDRESS;
		goto free_mem;
	}

	ret = 0;
free_mem:
	kfree(einfo);
out:
	retdata->err_val = ret;

	return 0;
}

int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
				struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);

	retdata->out_val = kvm_pmu_num_counters(kvpmu);

	return 0;
}

int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
				struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);

	if (cidx >= RISCV_KVM_MAX_COUNTERS || cidx == 1) {
		retdata->err_val = SBI_ERR_INVALID_PARAM;
		return 0;
	}

	retdata->out_val = kvpmu->pmc[cidx].cinfo.value;

	return 0;
}

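/*
 * Start the requested counters, optionally loading an initial value from
 * either the ival argument or the snapshot area.
 */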
int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
				 unsigned long ctr_mask, unsigned long flags, u64 ival,
				 struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int i, pmc_index, sbiret = 0;
	struct kvm_pmc *pmc;
	int fevent_code;
	bool snap_flag_set = flags & SBI_PMU_START_FLAG_INIT_SNAPSHOT;

	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
		sbiret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	if (snap_flag_set) {
		if (kvpmu->snapshot_addr == INVALID_GPA) {
			sbiret = SBI_ERR_NO_SHMEM;
			goto out;
		}
		if (kvm_vcpu_read_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
					sizeof(struct riscv_pmu_snapshot_data))) {
			pr_warn("Unable to read snapshot shared memory while starting counters\n");
			sbiret = SBI_ERR_FAILURE;
			goto out;
		}
	}
	/* Start the counters that have been configured and requested by the guest */
	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
		pmc_index = i + ctr_base;
		if (!test_bit(pmc_index, kvpmu->pmc_in_use))
			continue;
		/* The guest started the counter again. Reset the overflow status */
		clear_bit(pmc_index, kvpmu->pmc_overflown);
		pmc = &kvpmu->pmc[pmc_index];
		if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE) {
			pmc->counter_val = ival;
		} else if (snap_flag_set) {
			/* The counter indices in the snapshot are relative to the counter base */
			pmc->counter_val = kvpmu->sdata->ctr_values[i];
		}

		if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
			fevent_code = get_event_code(pmc->event_idx);
			if (fevent_code >= SBI_PMU_FW_MAX) {
				sbiret = SBI_ERR_INVALID_PARAM;
				goto out;
			}

			/* Check if the counter was already started for some reason */
			if (kvpmu->fw_event[fevent_code].started) {
				sbiret = SBI_ERR_ALREADY_STARTED;
				continue;
			}

			kvpmu->fw_event[fevent_code].started = true;
			kvpmu->fw_event[fevent_code].value = pmc->counter_val;
		} else if (pmc->perf_event) {
			if (unlikely(pmc->started)) {
				sbiret = SBI_ERR_ALREADY_STARTED;
				continue;
			}
			perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
			perf_event_enable(pmc->perf_event);
			pmc->started = true;
		} else {
			sbiret = SBI_ERR_INVALID_PARAM;
		}
	}

out:
	retdata->err_val = sbiret;

	return 0;
}

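/*
 * Stop the requested counters. If a snapshot is requested, the final
 * counter values and the overflow bitmap are written back to the shared
 * memory area; a reset request additionally releases the counters.
 */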
int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
				unsigned long ctr_mask, unsigned long flags,
				struct kvm_vcpu_sbi_return *retdata)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	int i, pmc_index, sbiret = 0;
	u64 enabled, running;
	struct kvm_pmc *pmc;
	int fevent_code;
	bool snap_flag_set = flags & SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
	bool shmem_needs_update = false;

	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
		sbiret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	if (snap_flag_set && kvpmu->snapshot_addr == INVALID_GPA) {
		sbiret = SBI_ERR_NO_SHMEM;
		goto out;
	}

	/* Stop the counters that have been configured and requested by the guest */
	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
		pmc_index = i + ctr_base;
		if (!test_bit(pmc_index, kvpmu->pmc_in_use))
			continue;
		pmc = &kvpmu->pmc[pmc_index];
		if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
			fevent_code = get_event_code(pmc->event_idx);
			if (fevent_code >= SBI_PMU_FW_MAX) {
				sbiret = SBI_ERR_INVALID_PARAM;
				goto out;
			}

			if (!kvpmu->fw_event[fevent_code].started)
				sbiret = SBI_ERR_ALREADY_STOPPED;

			kvpmu->fw_event[fevent_code].started = false;
		} else if (pmc->perf_event) {
			if (pmc->started) {
				/* Stop counting the counter */
				perf_event_disable(pmc->perf_event);
				pmc->started = false;
			} else {
				sbiret = SBI_ERR_ALREADY_STOPPED;
			}

			if (flags & SBI_PMU_STOP_FLAG_RESET)
				/* Release the counter if this is a reset request */
				kvm_pmu_release_perf_event(pmc);
		} else {
			sbiret = SBI_ERR_INVALID_PARAM;
		}

		if (snap_flag_set && !sbiret) {
			if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW)
				pmc->counter_val = kvpmu->fw_event[fevent_code].value;
			else if (pmc->perf_event)
				pmc->counter_val += perf_event_read_value(pmc->perf_event,
									  &enabled, &running);
			/*
			 * The counter and overflow indices in the snapshot region are
			 * relative to cbase. Modify the set bit in the counter mask
			 * instead of using pmc_index, which is the absolute counter index.
			 */
			if (test_bit(pmc_index, kvpmu->pmc_overflown))
				kvpmu->sdata->ctr_overflow_mask |= BIT(i);
			kvpmu->sdata->ctr_values[i] = pmc->counter_val;
			shmem_needs_update = true;
		}

		if (flags & SBI_PMU_STOP_FLAG_RESET) {
			pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
			clear_bit(pmc_index, kvpmu->pmc_in_use);
			clear_bit(pmc_index, kvpmu->pmc_overflown);
			if (snap_flag_set) {
				/*
				 * Only clear the given counter as the caller is responsible
				 * for validating both the overflow mask and configured counters.
				 */
				kvpmu->sdata->ctr_overflow_mask &= ~BIT(i);
				shmem_needs_update = true;
			}
		}
	}

	if (shmem_needs_update)
		kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, kvpmu->sdata,
				     sizeof(struct riscv_pmu_snapshot_data));

out:
	retdata->err_val = sbiret;

	return 0;
}

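/*
 * Configure a counter for the given event: find a suitable counter
 * (unless SKIP_MATCH is set), create the backing perf event for hardware
 * events, and optionally auto-start the counter.
 */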
int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
				     unsigned long ctr_mask, unsigned long flags,
				     unsigned long eidx, u64 evtdata,
				     struct kvm_vcpu_sbi_return *retdata)
{
	int ctr_idx, sbiret = 0;
	long ret;
	bool is_fevent;
	unsigned long event_code;
	u32 etype = kvm_pmu_get_perf_event_type(eidx);
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	struct perf_event_attr attr = {
		.type = etype,
		.size = sizeof(struct perf_event_attr),
		.pinned = true,
		.disabled = true,
		/*
		 * We should never reach here if the platform doesn't support the
		 * sscofpmf extension, as mode filtering won't work without it.
		 */
		.exclude_host = true,
		.exclude_hv = true,
		.exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
		.exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
		.config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
	};

	if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
		sbiret = SBI_ERR_INVALID_PARAM;
		goto out;
	}

	event_code = get_event_code(eidx);
	is_fevent = kvm_pmu_is_fw_event(eidx);
	if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
		sbiret = SBI_ERR_NOT_SUPPORTED;
		goto out;
	}

	/*
	 * The SKIP_MATCH flag indicates the caller is aware of the counter
	 * assigned for this event. Just sanity check that it is already
	 * marked as in use.
	 */
	if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
		if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
			sbiret = SBI_ERR_FAILURE;
			goto out;
		}
		ctr_idx = ctr_base + __ffs(ctr_mask);
	} else {
		ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
		if (ctr_idx < 0) {
			sbiret = SBI_ERR_NOT_SUPPORTED;
			goto out;
		}
	}

	pmc = &kvpmu->pmc[ctr_idx];
	pmc->idx = ctr_idx;

	if (is_fevent) {
		if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
			kvpmu->fw_event[event_code].started = true;
	} else {
		ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
		if (ret) {
			sbiret = SBI_ERR_NOT_SUPPORTED;
			goto out;
		}
	}

	set_bit(ctr_idx, kvpmu->pmc_in_use);
	pmc->event_idx = eidx;
	retdata->out_val = ctr_idx;
out:
	retdata->err_val = sbiret;

	return 0;
}

int kvm_riscv_vcpu_pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx,
				      struct kvm_vcpu_sbi_return *retdata)
{
	int ret;

	ret = pmu_fw_ctr_read_hi(vcpu, cidx, &retdata->out_val);
	if (ret == -EINVAL)
		retdata->err_val = SBI_ERR_INVALID_PARAM;

	return 0;
}

int kvm_riscv_vcpu_pmu_fw_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
				   struct kvm_vcpu_sbi_return *retdata)
{
	int ret;

	ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val);
	if (ret == -EINVAL)
		retdata->err_val = SBI_ERR_INVALID_PARAM;

	return 0;
}

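/*
 * Initialize the per-VCPU virtual PMU: query the host PMU for the number
 * and width of hardware counters and populate the counter info reported
 * to the guest through the SBI PMU extension.
 */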
void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
{
	int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0;
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;

	/*
	 * PMU functionality should only be available to guests if privilege mode
	 * filtering is available in the host. Otherwise, the guest will always
	 * count events while execution is in hypervisor mode.
	 */
	if (!riscv_isa_extension_available(NULL, SSCOFPMF))
		return;

	ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs);
	if (ret < 0 || !hpm_width || !num_hw_ctrs)
		return;

	/* Increase the number of hardware counters to account for the time counter */
	kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
	kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
	kvpmu->snapshot_addr = INVALID_GPA;

	if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
		pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA\n");
		kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
	}

	/*
	 * There is no correlation between the logical hardware counters and the
	 * virtual counters. However, we need to encode an hpmcounter CSR in the
	 * counter info field so that KVM can trap and emulate the read. This works
	 * well in the migration use case as KVM doesn't care whether the actual
	 * hpmcounter is available in the hardware or not.
	 */
	for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) {
		/* The TIME CSR shouldn't be read through the perf interface */
		if (i == 1)
			continue;
		pmc = &kvpmu->pmc[i];
		pmc->idx = i;
		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
		pmc->vcpu = vcpu;
		if (i < kvpmu->num_hw_ctrs) {
			pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
			if (i < 3)
				/* CY, IR counters */
				pmc->cinfo.width = 63;
			else
				pmc->cinfo.width = hpm_width;
			/*
			 * The CSR number doesn't have any relation with the logical
			 * hardware counters. The CSR numbers are encoded sequentially
			 * to avoid maintaining a map between the virtual counter
			 * and CSR number.
			 */
			pmc->cinfo.csr = CSR_CYCLE + i;
		} else {
			pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
			pmc->cinfo.width = 63;
		}
	}

	kvpmu->init_done = true;
}

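/*
 * Release all perf events and reset the virtual PMU state. This is also
 * used for VCPU reset, as a rebooted guest must start with a clean PMU.
 */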
void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	if (!kvpmu)
		return;

	for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS) {
		pmc = &kvpmu->pmc[i];
		pmc->counter_val = 0;
		kvm_pmu_release_perf_event(pmc);
		pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
	}
	bitmap_zero(kvpmu->pmc_in_use, RISCV_KVM_MAX_COUNTERS);
	bitmap_zero(kvpmu->pmc_overflown, RISCV_KVM_MAX_COUNTERS);
	memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
	kvm_pmu_clear_snapshot_area(vcpu);
}

void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
{
	kvm_riscv_vcpu_pmu_deinit(vcpu);
}
907