xref: /freebsd/sys/riscv/vmm/vmm.c (revision df21a004be237a1dccd03c7b47254625eea62fa9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/riscvreg.h>
60 #include <machine/cpu.h>
61 #include <machine/fpe.h>
62 #include <machine/machdep.h>
63 #include <machine/pcb.h>
64 #include <machine/smp.h>
65 #include <machine/vm.h>
66 #include <machine/vmparam.h>
67 #include <machine/vmm.h>
68 #include <machine/vmm_instruction_emul.h>
69 
70 #include <dev/pci/pcireg.h>
71 
72 #include <dev/vmm/vmm_dev.h>
73 #include <dev/vmm/vmm_ktr.h>
74 #include <dev/vmm/vmm_mem.h>
75 
76 #include "vmm_stat.h"
77 #include "riscv.h"
78 
79 #include "vmm_aplic.h"
80 
/*
 * Machine-independent state of one guest virtual CPU.
 *
 * The (o)/(i)/(x) initialization annotations are explained in the comment
 * preceding 'struct vm'.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* changed via vcpu_set_state*() */
	struct mtx	mtx;		/* spin mutex, see vcpu_lock() */
	int		hostcpu;	/* host cpu while RUNNING, else NOCPU */
	int		vcpuid;		/* index of this vcpu in vm->vcpu[] */
	void		*stats;		/* buffer from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* handed out by vm_exitinfo() */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};
94 
/*
 * Helpers for the per-vcpu mutex. The mutex is created with MTX_SPIN, so
 * it is acquired and released with the mtx_*_spin() variants.
 */
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
100 
/*
 * One guest-physical range whose accesses are emulated in the kernel.
 * The range is [start, end): vm_handle_inst_emul() matches an address when
 * start <= addr < end. A slot with start == 0 and end == 0 is free.
 */
struct vmm_mmio_region {
	uint64_t start;			/* first address of the range */
	uint64_t end;			/* one past the last address */
	mem_region_read_t read;		/* handler used for guest loads */
	mem_region_write_t write;	/* handler used for guest stores */
};
/* Number of slots in vm->mmio_region[]. */
#define	VM_MAX_MMIO_REGIONS	4
108 
/*
 * Machine-independent state of one virtual machine.
 *
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vm_mem	mem;			/* (i) [m+v] guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/*
	 * NOTE(review): vm_init() zeroes mmio_region[] on both creation and
	 * reinitialization, which matches (i) rather than (o) -- confirm.
	 */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};
135 
/*
 * Set to true once MOD_LOAD has completed successfully; vm_create()
 * returns ENXIO while this is false.
 */
static bool vmm_initialized = false;

/* malloc(9) type used for every allocation made in this file. */
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

/* Root of the hw.vmm sysctl tree. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * IPI number passed to ipi_cpu() by vcpu_notify_event_locked().
 * NOTE(review): no assignment to this variable is visible in this file,
 * so it appears to keep its zero initial value -- confirm that is intended.
 */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

/*
 * Number of vcpu slots given to each new VM. Computed in vmm_init() from
 * mp_ncpus and the hw.vmm.maxcpu tunable, and bounded to [1, VM_MAXCPU].
 */
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
165 
166 static void
167 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
168 {
169 	vmmops_vcpu_cleanup(vcpu->cookie);
170 	vcpu->cookie = NULL;
171 	if (destroy) {
172 		vmm_stat_free(vcpu->stats);
173 		fpu_save_area_free(vcpu->guestfpu);
174 		vcpu_lock_destroy(vcpu);
175 		free(vcpu, M_VMM);
176 	}
177 }
178 
179 static struct vcpu *
180 vcpu_alloc(struct vm *vm, int vcpu_id)
181 {
182 	struct vcpu *vcpu;
183 
184 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
185 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
186 
187 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
188 	vcpu_lock_init(vcpu);
189 	vcpu->state = VCPU_IDLE;
190 	vcpu->hostcpu = NOCPU;
191 	vcpu->vcpuid = vcpu_id;
192 	vcpu->vm = vm;
193 	vcpu->guestfpu = fpu_save_area_alloc();
194 	vcpu->stats = vmm_stat_alloc();
195 	return (vcpu);
196 }
197 
198 static void
199 vcpu_init(struct vcpu *vcpu)
200 {
201 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
202 	MPASS(vcpu->cookie != NULL);
203 	fpu_save_area_reset(vcpu->guestfpu);
204 	vmm_stat_init(vcpu->stats);
205 }
206 
207 struct vm_exit *
208 vm_exitinfo(struct vcpu *vcpu)
209 {
210 	return (&vcpu->exitinfo);
211 }
212 
213 static int
214 vmm_init(void)
215 {
216 
217 	vm_maxcpu = mp_ncpus;
218 
219 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
220 
221 	if (vm_maxcpu > VM_MAXCPU) {
222 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
223 		vm_maxcpu = VM_MAXCPU;
224 	}
225 
226 	if (vm_maxcpu == 0)
227 		vm_maxcpu = 1;
228 
229 	return (vmmops_modinit());
230 }
231 
232 static int
233 vmm_handler(module_t mod, int what, void *arg)
234 {
235 	int error;
236 
237 	switch (what) {
238 	case MOD_LOAD:
239 		error = vmmdev_init();
240 		if (error != 0)
241 			break;
242 		error = vmm_init();
243 		if (error == 0)
244 			vmm_initialized = true;
245 		else
246 			(void)vmmdev_cleanup();
247 		break;
248 	case MOD_UNLOAD:
249 		error = vmmdev_cleanup();
250 		if (error == 0 && vmm_initialized) {
251 			error = vmmops_modcleanup();
252 			if (error) {
253 				/*
254 				 * Something bad happened - prevent new
255 				 * VMs from being created
256 				 */
257 				vmm_initialized = false;
258 			}
259 		}
260 		break;
261 	default:
262 		error = 0;
263 		break;
264 	}
265 	return (error);
266 }
267 
/* Module description: module name, event handler, no private data. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - vmm device initialization requires an initialized devfs.
 *
 * Hence the module is registered one step after SI_SUB_DEVFS.
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
281 
282 static void
283 vm_init(struct vm *vm, bool create)
284 {
285 	int i;
286 
287 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
288 	MPASS(vm->cookie != NULL);
289 
290 	CPU_ZERO(&vm->active_cpus);
291 	CPU_ZERO(&vm->debug_cpus);
292 
293 	vm->suspend = 0;
294 	CPU_ZERO(&vm->suspended_cpus);
295 
296 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
297 
298 	if (!create) {
299 		for (i = 0; i < vm->maxcpus; i++) {
300 			if (vm->vcpu[i] != NULL)
301 				vcpu_init(vm->vcpu[i]);
302 		}
303 	}
304 }
305 
306 void
307 vm_disable_vcpu_creation(struct vm *vm)
308 {
309 	sx_xlock(&vm->vcpus_init_lock);
310 	vm->dying = true;
311 	sx_xunlock(&vm->vcpus_init_lock);
312 }
313 
/*
 * Return the vcpu with index 'vcpuid', creating it on first use.
 *
 * Returns NULL when 'vcpuid' is outside [0, vm_get_maxcpus(vm)), when it is
 * not below aplic_max_cpu_count(), or when the vcpu does not exist yet and
 * the VM has been marked dying by vm_disable_vcpu_creation().
 */
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Some interrupt controllers may have a CPU limit */
	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
		return (NULL);

	/* Fast path: the vcpu already exists, no lock needed. */
	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	/*
	 * Slow path: re-read the slot under the lock, since another thread
	 * may have created the vcpu in the meantime.
	 */
	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}
347 
348 void
349 vm_lock_vcpus(struct vm *vm)
350 {
351 	sx_xlock(&vm->vcpus_init_lock);
352 }
353 
354 void
355 vm_unlock_vcpus(struct vm *vm)
356 {
357 	sx_unlock(&vm->vcpus_init_lock);
358 }
359 
360 int
361 vm_create(const char *name, struct vm **retvm)
362 {
363 	struct vm *vm;
364 	int error;
365 
366 	/*
367 	 * If vmm.ko could not be successfully initialized then don't attempt
368 	 * to create the virtual machine.
369 	 */
370 	if (!vmm_initialized)
371 		return (ENXIO);
372 
373 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
374 		return (EINVAL);
375 
376 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
377 	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
378 	if (error != 0) {
379 		free(vm, M_VMM);
380 		return (error);
381 	}
382 	strcpy(vm->name, name);
383 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
384 
385 	vm->sockets = 1;
386 	vm->cores = 1;			/* XXX backwards compatibility */
387 	vm->threads = 1;		/* XXX backwards compatibility */
388 	vm->maxcpus = vm_maxcpu;
389 
390 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
391 	    M_WAITOK | M_ZERO);
392 
393 	vm_init(vm, true);
394 
395 	*retvm = vm;
396 	return (0);
397 }
398 
399 void
400 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
401     uint16_t *threads, uint16_t *maxcpus)
402 {
403 	*sockets = vm->sockets;
404 	*cores = vm->cores;
405 	*threads = vm->threads;
406 	*maxcpus = vm->maxcpus;
407 }
408 
409 uint16_t
410 vm_get_maxcpus(struct vm *vm)
411 {
412 	return (vm->maxcpus);
413 }
414 
415 int
416 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
417     uint16_t threads, uint16_t maxcpus)
418 {
419 	/* Ignore maxcpus. */
420 	if ((sockets * cores * threads) > vm->maxcpus)
421 		return (EINVAL);
422 	vm->sockets = sockets;
423 	vm->cores = cores;
424 	vm->threads = threads;
425 	return(0);
426 }
427 
/*
 * Release the resources of 'vm'.
 *
 * destroy == false: the caller must already hold the memory segment lock
 *   exclusively (asserted). The vcpu structures and the vcpu array are
 *   kept; only their backend state is released.
 * destroy == true: the memory segment lock is taken here, and in addition
 *   the vcpus, the guest memory state, the vcpu array and the vcpu
 *   creation lock are destroyed. The 'struct vm' itself is not freed here.
 *   NOTE(review): no matching unlock of the memseg lock is visible in this
 *   function; presumably vm_mem_destroy() disposes of it -- confirm.
 */
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	if (destroy)
		vm_xlock_memsegs(vm);
	else
		vm_assert_memseg_xlocked(vm);

	/* Detach the interrupt controller before the vcpus go away. */
	aplic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	/* Release the VM-wide backend state behind the cookie. */
	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}
455 
456 void
457 vm_destroy(struct vm *vm)
458 {
459 
460 	vm_cleanup(vm, true);
461 
462 	free(vm, M_VMM);
463 }
464 
465 int
466 vm_reinit(struct vm *vm)
467 {
468 	int error;
469 
470 	/*
471 	 * A virtual machine can be reset only if all vcpus are suspended.
472 	 */
473 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
474 		vm_cleanup(vm, false);
475 		vm_init(vm, false);
476 		error = 0;
477 	} else {
478 		error = EBUSY;
479 	}
480 
481 	return (error);
482 }
483 
484 const char *
485 vm_name(struct vm *vm)
486 {
487 	return (vm->name);
488 }
489 
490 int
491 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
492     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
493 {
494 	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
495 }
496 
497 void
498 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
499     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
500 {
501 	int i;
502 
503 	for (i = 0; i < nitems(vm->mmio_region); i++) {
504 		if (vm->mmio_region[i].start == 0 &&
505 		    vm->mmio_region[i].end == 0) {
506 			vm->mmio_region[i].start = start;
507 			vm->mmio_region[i].end = start + size;
508 			vm->mmio_region[i].read = mmio_read;
509 			vm->mmio_region[i].write = mmio_write;
510 			return;
511 		}
512 	}
513 
514 	panic("%s: No free MMIO region", __func__);
515 }
516 
517 void
518 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
519 {
520 	int i;
521 
522 	for (i = 0; i < nitems(vm->mmio_region); i++) {
523 		if (vm->mmio_region[i].start == start &&
524 		    vm->mmio_region[i].end == start + size) {
525 			memset(&vm->mmio_region[i], 0,
526 			    sizeof(vm->mmio_region[i]));
527 			return;
528 		}
529 	}
530 
531 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
532 	    start + size);
533 }
534 
535 static int
536 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
537 {
538 	struct vm *vm;
539 	struct vm_exit *vme;
540 	struct vie *vie;
541 	struct hyp *hyp;
542 	uint64_t fault_ipa;
543 	struct vm_guest_paging *paging;
544 	struct vmm_mmio_region *vmr;
545 	int error, i;
546 
547 	vm = vcpu->vm;
548 	hyp = vm->cookie;
549 	if (!hyp->aplic_attached)
550 		goto out_user;
551 
552 	vme = &vcpu->exitinfo;
553 	vie = &vme->u.inst_emul.vie;
554 	paging = &vme->u.inst_emul.paging;
555 
556 	fault_ipa = vme->u.inst_emul.gpa;
557 
558 	vmr = NULL;
559 	for (i = 0; i < nitems(vm->mmio_region); i++) {
560 		if (vm->mmio_region[i].start <= fault_ipa &&
561 		    vm->mmio_region[i].end > fault_ipa) {
562 			vmr = &vm->mmio_region[i];
563 			break;
564 		}
565 	}
566 	if (vmr == NULL)
567 		goto out_user;
568 
569 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
570 	    vmr->read, vmr->write, retu);
571 	return (error);
572 
573 out_user:
574 	*retu = true;
575 	return (0);
576 }
577 
578 int
579 vm_suspend(struct vm *vm, enum vm_suspend_how how)
580 {
581 	int i;
582 
583 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
584 		return (EINVAL);
585 
586 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
587 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
588 		    vm->suspend, how);
589 		return (EALREADY);
590 	}
591 
592 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
593 
594 	/*
595 	 * Notify all active vcpus that they are now suspended.
596 	 */
597 	for (i = 0; i < vm->maxcpus; i++) {
598 		if (CPU_ISSET(i, &vm->active_cpus))
599 			vcpu_notify_event(vm_vcpu(vm, i));
600 	}
601 
602 	return (0);
603 }
604 
605 void
606 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
607 {
608 	struct vm *vm = vcpu->vm;
609 	struct vm_exit *vmexit;
610 
611 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
612 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
613 
614 	vmexit = vm_exitinfo(vcpu);
615 	vmexit->pc = pc;
616 	vmexit->inst_length = 4;
617 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
618 	vmexit->u.suspended.how = vm->suspend;
619 }
620 
621 void
622 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
623 {
624 	struct vm_exit *vmexit;
625 
626 	vmexit = vm_exitinfo(vcpu);
627 	vmexit->pc = pc;
628 	vmexit->inst_length = 4;
629 	vmexit->exitcode = VM_EXITCODE_DEBUG;
630 }
631 
632 int
633 vm_activate_cpu(struct vcpu *vcpu)
634 {
635 	struct vm *vm = vcpu->vm;
636 
637 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
638 		return (EBUSY);
639 
640 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
641 	return (0);
642 
643 }
644 
645 int
646 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
647 {
648 	if (vcpu == NULL) {
649 		vm->debug_cpus = vm->active_cpus;
650 		for (int i = 0; i < vm->maxcpus; i++) {
651 			if (CPU_ISSET(i, &vm->active_cpus))
652 				vcpu_notify_event(vm_vcpu(vm, i));
653 		}
654 	} else {
655 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
656 			return (EINVAL);
657 
658 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
659 		vcpu_notify_event(vcpu);
660 	}
661 	return (0);
662 }
663 
664 int
665 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
666 {
667 
668 	if (vcpu == NULL) {
669 		CPU_ZERO(&vm->debug_cpus);
670 	} else {
671 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
672 			return (EINVAL);
673 
674 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
675 	}
676 	return (0);
677 }
678 
679 int
680 vcpu_debugged(struct vcpu *vcpu)
681 {
682 
683 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
684 }
685 
686 cpuset_t
687 vm_active_cpus(struct vm *vm)
688 {
689 
690 	return (vm->active_cpus);
691 }
692 
693 cpuset_t
694 vm_debug_cpus(struct vm *vm)
695 {
696 
697 	return (vm->debug_cpus);
698 }
699 
700 cpuset_t
701 vm_suspended_cpus(struct vm *vm)
702 {
703 
704 	return (vm->suspended_cpus);
705 }
706 
707 
708 void *
709 vcpu_stats(struct vcpu *vcpu)
710 {
711 
712 	return (vcpu->stats);
713 }
714 
715 /*
716  * This function is called to ensure that a vcpu "sees" a pending event
717  * as soon as possible:
718  * - If the vcpu thread is sleeping then it is woken up.
719  * - If the vcpu is running on a different host_cpu then an IPI will be directed
720  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
721  */
722 static void
723 vcpu_notify_event_locked(struct vcpu *vcpu)
724 {
725 	int hostcpu;
726 
727 	hostcpu = vcpu->hostcpu;
728 	if (vcpu->state == VCPU_RUNNING) {
729 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
730 		if (hostcpu != curcpu) {
731 			ipi_cpu(hostcpu, vmm_ipinum);
732 		} else {
733 			/*
734 			 * If the 'vcpu' is running on 'curcpu' then it must
735 			 * be sending a notification to itself (e.g. SELF_IPI).
736 			 * The pending event will be picked up when the vcpu
737 			 * transitions back to guest context.
738 			 */
739 		}
740 	} else {
741 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
742 		    "with hostcpu %d", vcpu->state, hostcpu));
743 		if (vcpu->state == VCPU_SLEEPING)
744 			wakeup_one(vcpu);
745 	}
746 }
747 
748 void
749 vcpu_notify_event(struct vcpu *vcpu)
750 {
751 	vcpu_lock(vcpu);
752 	vcpu_notify_event_locked(vcpu);
753 	vcpu_unlock(vcpu);
754 }
755 
756 struct vm_mem *
757 vm_mem(struct vm *vm)
758 {
759 	return (&vm->mem);
760 }
761 
/*
 * Switch the FPU from host to guest state: save the current thread's FPU
 * state, then load the guest state from vcpu->guestfpu. vm_run() calls
 * this inside a critical section, right before entering the guest.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the FPU state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}
782 
/*
 * Counterpart of restore_guest_fpustate(): store the FPU contents into
 * vcpu->guestfpu after the guest has stopped running.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	/*
	 * restore_guest_fpustate() cleared fpcurthread; it must still be
	 * clear, i.e. no host thread may have claimed the FPU meanwhile.
	 */
	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}
795 
/*
 * Move 'vcpu' to 'newstate'. The vcpu lock must be held by the caller.
 *
 * With 'from_idle' set, the function first waits until the vcpu is in the
 * VCPU_IDLE state, notifying the vcpu and sleeping for up to 'hz' ticks on
 * each iteration. Without it, the vcpu must not be idle (asserted).
 *
 * Returns 0 on success, or EBUSY if the requested transition is not one of
 * the allowed ones listed in the function body.
 */
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/* 'hostcpu' is valid exactly while the vcpu is running. */
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		/* These states may only move to FROZEN. */
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		/* FROZEN may move to any state except itself. */
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	/* Wake up threads sleeping in the from_idle loop above. */
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
861 
862 static void
863 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
864 {
865 	int error;
866 
867 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
868 		panic("Error %d setting state to %d\n", error, newstate);
869 }
870 
871 static void
872 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
873 {
874 	int error;
875 
876 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
877 		panic("Error %d setting state to %d", error, newstate);
878 }
879 
880 int
881 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
882 {
883 
884 	if (type < 0 || type >= VM_CAP_MAX)
885 		return (EINVAL);
886 
887 	return (vmmops_getcap(vcpu->cookie, type, retval));
888 }
889 
890 int
891 vm_set_capability(struct vcpu *vcpu, int type, int val)
892 {
893 
894 	if (type < 0 || type >= VM_CAP_MAX)
895 		return (EINVAL);
896 
897 	return (vmmops_setcap(vcpu->cookie, type, val));
898 }
899 
900 struct vm *
901 vcpu_vm(struct vcpu *vcpu)
902 {
903 
904 	return (vcpu->vm);
905 }
906 
907 int
908 vcpu_vcpuid(struct vcpu *vcpu)
909 {
910 
911 	return (vcpu->vcpuid);
912 }
913 
914 void *
915 vcpu_get_cookie(struct vcpu *vcpu)
916 {
917 
918 	return (vcpu->cookie);
919 }
920 
921 struct vcpu *
922 vm_vcpu(struct vm *vm, int vcpuid)
923 {
924 
925 	return (vm->vcpu[vcpuid]);
926 }
927 
928 int
929 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
930 {
931 	int error;
932 
933 	vcpu_lock(vcpu);
934 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
935 	vcpu_unlock(vcpu);
936 
937 	return (error);
938 }
939 
940 enum vcpu_state
941 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
942 {
943 	enum vcpu_state state;
944 
945 	vcpu_lock(vcpu);
946 	state = vcpu->state;
947 	if (hostcpu != NULL)
948 		*hostcpu = vcpu->hostcpu;
949 	vcpu_unlock(vcpu);
950 
951 	return (state);
952 }
953 
954 int
955 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
956 {
957 	if (reg < 0 || reg >= VM_REG_LAST)
958 		return (EINVAL);
959 
960 	return (vmmops_getreg(vcpu->cookie, reg, retval));
961 }
962 
963 int
964 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
965 {
966 	int error;
967 
968 	if (reg < 0 || reg >= VM_REG_LAST)
969 		return (EINVAL);
970 	error = vmmops_setreg(vcpu->cookie, reg, val);
971 	if (error || reg != VM_REG_GUEST_SEPC)
972 		return (error);
973 
974 	vcpu->nextpc = val;
975 
976 	return (0);
977 }
978 
979 void *
980 vm_get_cookie(struct vm *vm)
981 {
982 
983 	return (vm->cookie);
984 }
985 
986 int
987 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
988 {
989 
990 	return (vmmops_exception(vcpu->cookie, scause));
991 }
992 
993 int
994 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
995 {
996 
997 	return (aplic_attach_to_vm(vm->cookie, descr));
998 }
999 
1000 int
1001 vm_assert_irq(struct vm *vm, uint32_t irq)
1002 {
1003 
1004 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
1005 }
1006 
1007 int
1008 vm_deassert_irq(struct vm *vm, uint32_t irq)
1009 {
1010 
1011 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
1012 }
1013 
1014 int
1015 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1016     int func)
1017 {
1018 
1019 	return (aplic_inject_msi(vm->cookie, msg, addr));
1020 }
1021 
/*
 * Handle a VM_EXITCODE_WFI exit: put the vcpu thread to sleep until there
 * is a reason to resume the guest.
 *
 * The loop ends when the VM is being suspended, when the APLIC, IPI or
 * interrupt checks report something pending, or when the vcpu should
 * yield. Always returns 0 with '*retu' set to false. 'vme' is not used.
 */
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vm *vm;

	vm = vcpu->vm;
	vcpu_lock(vcpu);
	while (1) {
		if (vm->suspend)
			break;

		if (aplic_check_pending(vcpu->cookie))
			break;

		if (riscv_check_ipi(vcpu->cookie, false))
			break;

		if (riscv_check_interrupts_pending(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		/*
		 * Mark the vcpu SLEEPING so that vcpu_notify_event_locked()
		 * wakes it up, and return to FROZEN once awake.
		 */
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;

	return (0);
}
1059 
1060 static int
1061 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1062 {
1063 	struct vm *vm;
1064 	struct vm_exit *vme;
1065 	struct vm_map *map;
1066 	uint64_t addr;
1067 	pmap_t pmap;
1068 	int ftype, rv;
1069 
1070 	vm = vcpu->vm;
1071 	vme = &vcpu->exitinfo;
1072 
1073 	pmap = vmspace_pmap(vm_vmspace(vm));
1074 	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);
1075 
1076 	dprintf("%s: %lx\n", __func__, addr);
1077 
1078 	switch (vme->scause) {
1079 	case SCAUSE_STORE_GUEST_PAGE_FAULT:
1080 		ftype = VM_PROT_WRITE;
1081 		break;
1082 	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
1083 		ftype = VM_PROT_EXECUTE;
1084 		break;
1085 	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
1086 		ftype = VM_PROT_READ;
1087 		break;
1088 	default:
1089 		panic("unknown page trap: %lu", vme->scause);
1090 	}
1091 
1092 	/* The page exists, but the page table needs to be updated. */
1093 	if (pmap_fault(pmap, addr, ftype))
1094 		return (0);
1095 
1096 	map = &vm_vmspace(vm)->vm_map;
1097 	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
1098 	if (rv != KERN_SUCCESS) {
1099 		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
1100 		    __func__, addr, ftype, rv);
1101 		return (EFAULT);
1102 	}
1103 
1104 	return (0);
1105 }
1106 
/*
 * Handle a VM_EXITCODE_SUSPENDED exit: mark this vcpu suspended and wait
 * until every active vcpu has done the same, then notify the suspended
 * vcpus and return to userspace ('*retu' is set to true).
 *
 * Returns 0, or the error from thread_check_susp() if that ended the wait.
 */
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * The vcpu sleeps for at most 'hz' ticks at a time and re-checks the
	 * condition afterwards. A pending thread suspension request is
	 * honoured between sleeps, with the spin lock dropped.
	 *
	 * NOTE(review): the previous wording of this comment referred to
	 * calling a rendezvous handler while waiting; no such call exists in
	 * this loop.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}
1154 
/*
 * Run 'vcpu' until an exit has to be handled by userspace or an error
 * occurs.
 *
 * Returns EINVAL if the vcpu is not active or is already suspended.
 * Otherwise the guest is entered repeatedly: each exit is dispatched on
 * its exit code, and the loop restarts as long as the handler succeeded
 * and did not ask for a return to userspace. The return value is the last
 * error (0 when returning to userspace for a regular exit).
 */
int
vm_run(struct vcpu *vcpu)
{
	struct vm_eventinfo evinfo;
	struct vm_exit *vme;
	struct vm *vm;
	pmap_t pmap;
	int error;
	int vcpuid;
	bool retu;

	vm = vcpu->vm;

	dprintf("%s\n", __func__);

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm_vmspace(vm));
	vme = &vcpu->exitinfo;
	/* Only the suspend flag is passed to the backend. */
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = NULL;
restart:
	/*
	 * The FPU switch, the state transitions and the guest run happen
	 * inside one critical section.
	 */
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		/*
		 * Exits whose instruction has been dealt with resume after
		 * it (pc + inst_length); all others resume at the same pc.
		 */
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;
		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;
		case VM_EXITCODE_ECALL:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc + vme->inst_length;
			retu = true;
			break;
		case VM_EXITCODE_PAGING:
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;
		case VM_EXITCODE_BOGUS:
			vcpu->nextpc = vme->pc;
			retu = false;
			error = 0;
			break;
		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;
		default:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	/* 'retu' is only examined when it was set above (error == 0). */
	if (error == 0 && retu == false)
		goto restart;

	return (error);
}
1238