xref: /freebsd/sys/riscv/vmm/vmm.c (revision 5f62a964e9f8abc6a05d8338273fadd154f0a206)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/mutex.h>
42 #include <sys/pcpu.h>
43 #include <sys/proc.h>
44 #include <sys/queue.h>
45 #include <sys/rwlock.h>
46 #include <sys/sched.h>
47 #include <sys/smp.h>
48 #include <sys/sysctl.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_object.h>
52 #include <vm/vm_page.h>
53 #include <vm/pmap.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_param.h>
57 
58 #include <machine/riscvreg.h>
59 #include <machine/cpu.h>
60 #include <machine/fpe.h>
61 #include <machine/machdep.h>
62 #include <machine/pcb.h>
63 #include <machine/smp.h>
64 #include <machine/vm.h>
65 #include <machine/vmparam.h>
66 #include <machine/vmm.h>
67 #include <machine/vmm_instruction_emul.h>
68 
69 #include <dev/pci/pcireg.h>
70 
71 #include <dev/vmm/vmm_dev.h>
72 #include <dev/vmm/vmm_ktr.h>
73 #include <dev/vmm/vmm_mem.h>
74 
75 #include "vmm_stat.h"
76 #include "riscv.h"
77 
78 #include "vmm_aplic.h"
79 
/*
 * Per-vcpu software state.  Locking keys (o)/(i)/(x) are defined in the
 * "Initialization" comment above struct vm below.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* modified under the vcpu spin mtx */
	struct mtx	mtx;		/* protects state/hostcpu transitions */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;		/* index of this vcpu within the vm */
	void		*stats;		/* vmm_stat buffer */
	struct vm_exit	exitinfo;	/* details of the last vm exit */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};
93 
/*
 * The vcpu mutex is a spin lock (MTX_SPIN): it is taken from contexts
 * that must not sleep, and sleeping waiters use msleep_spin() on it.
 */
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
99 
/*
 * An in-kernel emulated MMIO range.  A slot is free when both start and
 * end are zero (see vm_register_inst_handler()).  The range is
 * [start, end): 'end' is exclusive.
 */
struct vmm_mmio_region {
	uint64_t start;
	uint64_t end;
	mem_region_read_t read;		/* guest load handler */
	mem_region_write_t write;	/* guest store handler */
};
#define	VM_MAX_MMIO_REGIONS	4
107 
/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug*/
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vm_mem	mem;			/* (i) [m+v] guest memory */
	char		name[VM_MAX_NAMELEN + 1]; /* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) serializes vcpu creation */
};
134 
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/* IPI vector used to kick a running vcpu out of guest context (read-only). */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
152 
153 static void
154 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
155 {
156 	vmmops_vcpu_cleanup(vcpu->cookie);
157 	vcpu->cookie = NULL;
158 	if (destroy) {
159 		vmm_stat_free(vcpu->stats);
160 		fpu_save_area_free(vcpu->guestfpu);
161 		vcpu_lock_destroy(vcpu);
162 		free(vcpu, M_VMM);
163 	}
164 }
165 
166 static struct vcpu *
167 vcpu_alloc(struct vm *vm, int vcpu_id)
168 {
169 	struct vcpu *vcpu;
170 
171 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
172 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
173 
174 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
175 	vcpu_lock_init(vcpu);
176 	vcpu->state = VCPU_IDLE;
177 	vcpu->hostcpu = NOCPU;
178 	vcpu->vcpuid = vcpu_id;
179 	vcpu->vm = vm;
180 	vcpu->guestfpu = fpu_save_area_alloc();
181 	vcpu->stats = vmm_stat_alloc();
182 	return (vcpu);
183 }
184 
185 static void
186 vcpu_init(struct vcpu *vcpu)
187 {
188 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
189 	MPASS(vcpu->cookie != NULL);
190 	fpu_save_area_reset(vcpu->guestfpu);
191 	vmm_stat_init(vcpu->stats);
192 }
193 
194 struct vm_exit *
195 vm_exitinfo(struct vcpu *vcpu)
196 {
197 	return (&vcpu->exitinfo);
198 }
199 
/* Module load hook: defer to the backend implementation. */
int
vmm_modinit(void)
{
	return (vmmops_modinit());
}
205 
/* Module unload hook: defer to the backend implementation. */
int
vmm_modcleanup(void)
{
	return (vmmops_modcleanup());
}
211 
212 static void
213 vm_init(struct vm *vm, bool create)
214 {
215 	int i;
216 
217 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
218 	MPASS(vm->cookie != NULL);
219 
220 	CPU_ZERO(&vm->active_cpus);
221 	CPU_ZERO(&vm->debug_cpus);
222 
223 	vm->suspend = 0;
224 	CPU_ZERO(&vm->suspended_cpus);
225 
226 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
227 
228 	if (!create) {
229 		for (i = 0; i < vm->maxcpus; i++) {
230 			if (vm->vcpu[i] != NULL)
231 				vcpu_init(vm->vcpu[i]);
232 		}
233 	}
234 }
235 
236 void
237 vm_disable_vcpu_creation(struct vm *vm)
238 {
239 	sx_xlock(&vm->vcpus_init_lock);
240 	vm->dying = true;
241 	sx_xunlock(&vm->vcpus_init_lock);
242 }
243 
/*
 * Look up (lazily creating) the vcpu with the given id.
 *
 * The fast path reads vm->vcpu[vcpuid] without the lock; the acquire load
 * pairs with the release store below so that a non-NULL pointer always
 * refers to a fully constructed vcpu.  Returns NULL for an out-of-range
 * id or when vcpu creation has been disabled (vm->dying).
 */
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Unlocked fast path. */
	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}
273 
274 void
275 vm_lock_vcpus(struct vm *vm)
276 {
277 	sx_xlock(&vm->vcpus_init_lock);
278 }
279 
280 void
281 vm_unlock_vcpus(struct vm *vm)
282 {
283 	sx_unlock(&vm->vcpus_init_lock);
284 }
285 
286 int
287 vm_create(const char *name, struct vm **retvm)
288 {
289 	struct vm *vm;
290 	int error;
291 
292 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
293 	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
294 	if (error != 0) {
295 		free(vm, M_VMM);
296 		return (error);
297 	}
298 	strcpy(vm->name, name);
299 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
300 
301 	vm->sockets = 1;
302 	vm->cores = 1;			/* XXX backwards compatibility */
303 	vm->threads = 1;		/* XXX backwards compatibility */
304 	vm->maxcpus = vm_maxcpu;
305 
306 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
307 	    M_WAITOK | M_ZERO);
308 
309 	vm_init(vm, true);
310 
311 	*retvm = vm;
312 	return (0);
313 }
314 
315 void
316 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
317     uint16_t *threads, uint16_t *maxcpus)
318 {
319 	*sockets = vm->sockets;
320 	*cores = vm->cores;
321 	*threads = vm->threads;
322 	*maxcpus = vm->maxcpus;
323 }
324 
325 uint16_t
326 vm_get_maxcpus(struct vm *vm)
327 {
328 	return (vm->maxcpus);
329 }
330 
331 int
332 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
333     uint16_t threads, uint16_t maxcpus)
334 {
335 	/* Ignore maxcpus. */
336 	if ((sockets * cores * threads) > vm->maxcpus)
337 		return (EINVAL);
338 	vm->sockets = sockets;
339 	vm->cores = cores;
340 	vm->threads = threads;
341 	return(0);
342 }
343 
/*
 * Tear down VM state.  With destroy == false only reinitializable state
 * is released (used by vm_reinit()); with destroy == true everything
 * allocated by vm_create() is freed as well.  When destroy is false the
 * caller must already hold the memsegs lock exclusively.
 */
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	if (destroy)
		vm_xlock_memsegs(vm);
	else
		vm_assert_memseg_xlocked(vm);

	/* Detach the interrupt controller before tearing down the vcpus. */
	aplic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	/* Backend VM state goes after the vcpus that reference it. */
	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}
371 
372 void
373 vm_destroy(struct vm *vm)
374 {
375 
376 	vm_cleanup(vm, true);
377 
378 	free(vm, M_VMM);
379 }
380 
381 int
382 vm_reinit(struct vm *vm)
383 {
384 	int error;
385 
386 	/*
387 	 * A virtual machine can be reset only if all vcpus are suspended.
388 	 */
389 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
390 		vm_cleanup(vm, false);
391 		vm_init(vm, false);
392 		error = 0;
393 	} else {
394 		error = EBUSY;
395 	}
396 
397 	return (error);
398 }
399 
400 const char *
401 vm_name(struct vm *vm)
402 {
403 	return (vm->name);
404 }
405 
406 int
407 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
408     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
409 {
410 	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
411 }
412 
413 void
414 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
415     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
416 {
417 	int i;
418 
419 	for (i = 0; i < nitems(vm->mmio_region); i++) {
420 		if (vm->mmio_region[i].start == 0 &&
421 		    vm->mmio_region[i].end == 0) {
422 			vm->mmio_region[i].start = start;
423 			vm->mmio_region[i].end = start + size;
424 			vm->mmio_region[i].read = mmio_read;
425 			vm->mmio_region[i].write = mmio_write;
426 			return;
427 		}
428 	}
429 
430 	panic("%s: No free MMIO region", __func__);
431 }
432 
433 void
434 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
435 {
436 	int i;
437 
438 	for (i = 0; i < nitems(vm->mmio_region); i++) {
439 		if (vm->mmio_region[i].start == start &&
440 		    vm->mmio_region[i].end == start + size) {
441 			memset(&vm->mmio_region[i], 0,
442 			    sizeof(vm->mmio_region[i]));
443 			return;
444 		}
445 	}
446 
447 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
448 	    start + size);
449 }
450 
/*
 * Handle an instruction-emulation exit (guest access that faulted into
 * the hypervisor).  If the faulting guest-physical address falls inside
 * a registered in-kernel MMIO region the access is emulated here;
 * otherwise (or when no in-kernel APLIC is attached) the exit is punted
 * to userspace by setting *retu = true.
 */
static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vie *vie;
	struct hyp *hyp;
	uint64_t fault_ipa;
	struct vm_guest_paging *paging;
	struct vmm_mmio_region *vmr;
	int error, i;

	vm = vcpu->vm;
	hyp = vm->cookie;
	/* Without an in-kernel APLIC, all emulation happens in userspace. */
	if (!hyp->aplic_attached)
		goto out_user;

	vme = &vcpu->exitinfo;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	fault_ipa = vme->u.inst_emul.gpa;

	/* Find the in-kernel MMIO region covering the faulting address. */
	vmr = NULL;
	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start <= fault_ipa &&
		    vm->mmio_region[i].end > fault_ipa) {
			vmr = &vm->mmio_region[i];
			break;
		}
	}
	if (vmr == NULL)
		goto out_user;

	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
	    vmr->read, vmr->write, retu);
	return (error);

out_user:
	*retu = true;
	return (0);
}
493 
/*
 * Initiate suspension of the VM for reason 'how'.  Only the first caller
 * wins the atomic 0 -> how transition of vm->suspend; later callers get
 * EALREADY.  Every active vcpu is then kicked so it exits guest context
 * and observes the pending suspend.
 */
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, i));
	}

	return (0);
}
520 
/*
 * Fill in exitinfo for a suspend exit at guest pc 'pc'.  The reported
 * instruction length is 4 bytes; the suspend reason is taken from the
 * vm-wide suspend word set by vm_suspend().
 */
void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}
536 
537 void
538 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
539 {
540 	struct vm_exit *vmexit;
541 
542 	vmexit = vm_exitinfo(vcpu);
543 	vmexit->pc = pc;
544 	vmexit->inst_length = 4;
545 	vmexit->exitcode = VM_EXITCODE_DEBUG;
546 }
547 
548 int
549 vm_activate_cpu(struct vcpu *vcpu)
550 {
551 	struct vm *vm = vcpu->vm;
552 
553 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
554 		return (EBUSY);
555 
556 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
557 	return (0);
558 
559 }
560 
561 int
562 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
563 {
564 	if (vcpu == NULL) {
565 		vm->debug_cpus = vm->active_cpus;
566 		for (int i = 0; i < vm->maxcpus; i++) {
567 			if (CPU_ISSET(i, &vm->active_cpus))
568 				vcpu_notify_event(vm_vcpu(vm, i));
569 		}
570 	} else {
571 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
572 			return (EINVAL);
573 
574 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
575 		vcpu_notify_event(vcpu);
576 	}
577 	return (0);
578 }
579 
580 int
581 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
582 {
583 
584 	if (vcpu == NULL) {
585 		CPU_ZERO(&vm->debug_cpus);
586 	} else {
587 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
588 			return (EINVAL);
589 
590 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
591 	}
592 	return (0);
593 }
594 
595 int
596 vcpu_debugged(struct vcpu *vcpu)
597 {
598 
599 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
600 }
601 
602 cpuset_t
603 vm_active_cpus(struct vm *vm)
604 {
605 
606 	return (vm->active_cpus);
607 }
608 
609 cpuset_t
610 vm_debug_cpus(struct vm *vm)
611 {
612 
613 	return (vm->debug_cpus);
614 }
615 
616 cpuset_t
617 vm_suspended_cpus(struct vm *vm)
618 {
619 
620 	return (vm->suspended_cpus);
621 }
622 
623 
624 void *
625 vcpu_stats(struct vcpu *vcpu)
626 {
627 
628 	return (vcpu->stats);
629 }
630 
631 /*
632  * This function is called to ensure that a vcpu "sees" a pending event
633  * as soon as possible:
634  * - If the vcpu thread is sleeping then it is woken up.
635  * - If the vcpu is running on a different host_cpu then an IPI will be directed
636  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
637  */
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			/* Force an exit from guest context via IPI. */
			ipi_cpu(hostcpu, vmm_ipinum);
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		/* A sleeping vcpu is parked in vm_handle_wfi()/suspend. */
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}
663 
/* Locked wrapper around vcpu_notify_event_locked(). */
void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}
671 
672 struct vm_mem *
673 vm_mem(struct vm *vm)
674 {
675 	return (&vm->mem);
676 }
677 
/*
 * Switch the FPU from host to guest state.  Must be called in a critical
 * section (see vm_run()); the ordering of the steps below matters.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the VFP state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}
698 
/*
 * Stash the guest FPU state after a vm exit; the counterpart of
 * restore_guest_fpustate().  Leaves the FPU disabled for the host.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}
711 
/*
 * Perform a vcpu state transition with the vcpu lock held, enforcing the
 * legal transition graph documented below.  When 'from_idle' is set the
 * caller is an ioctl path: it sleeps (kicking the vcpu each second) until
 * the vcpu returns to IDLE before claiming it.  Returns 0 on success or
 * EBUSY for a disallowed transition.
 */
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/* hostcpu must be valid exactly while the vcpu is RUNNING. */
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	/* Wake any ioctl thread waiting in the from_idle loop above. */
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
777 
778 static void
779 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
780 {
781 	int error;
782 
783 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
784 		panic("Error %d setting state to %d\n", error, newstate);
785 }
786 
787 static void
788 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
789 {
790 	int error;
791 
792 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
793 		panic("Error %d setting state to %d", error, newstate);
794 }
795 
796 int
797 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
798 {
799 
800 	if (type < 0 || type >= VM_CAP_MAX)
801 		return (EINVAL);
802 
803 	return (vmmops_getcap(vcpu->cookie, type, retval));
804 }
805 
806 int
807 vm_set_capability(struct vcpu *vcpu, int type, int val)
808 {
809 
810 	if (type < 0 || type >= VM_CAP_MAX)
811 		return (EINVAL);
812 
813 	return (vmmops_setcap(vcpu->cookie, type, val));
814 }
815 
816 struct vm *
817 vcpu_vm(struct vcpu *vcpu)
818 {
819 
820 	return (vcpu->vm);
821 }
822 
823 int
824 vcpu_vcpuid(struct vcpu *vcpu)
825 {
826 
827 	return (vcpu->vcpuid);
828 }
829 
830 void *
831 vcpu_get_cookie(struct vcpu *vcpu)
832 {
833 
834 	return (vcpu->cookie);
835 }
836 
837 struct vcpu *
838 vm_vcpu(struct vm *vm, int vcpuid)
839 {
840 
841 	return (vm->vcpu[vcpuid]);
842 }
843 
844 int
845 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
846 {
847 	int error;
848 
849 	vcpu_lock(vcpu);
850 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
851 	vcpu_unlock(vcpu);
852 
853 	return (error);
854 }
855 
856 enum vcpu_state
857 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
858 {
859 	enum vcpu_state state;
860 
861 	vcpu_lock(vcpu);
862 	state = vcpu->state;
863 	if (hostcpu != NULL)
864 		*hostcpu = vcpu->hostcpu;
865 	vcpu_unlock(vcpu);
866 
867 	return (state);
868 }
869 
870 int
871 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
872 {
873 	if (reg < 0 || reg >= VM_REG_LAST)
874 		return (EINVAL);
875 
876 	return (vmmops_getreg(vcpu->cookie, reg, retval));
877 }
878 
879 int
880 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
881 {
882 	int error;
883 
884 	if (reg < 0 || reg >= VM_REG_LAST)
885 		return (EINVAL);
886 	error = vmmops_setreg(vcpu->cookie, reg, val);
887 	if (error || reg != VM_REG_GUEST_SEPC)
888 		return (error);
889 
890 	vcpu->nextpc = val;
891 
892 	return (0);
893 }
894 
895 void *
896 vm_get_cookie(struct vm *vm)
897 {
898 
899 	return (vm->cookie);
900 }
901 
902 int
903 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
904 {
905 
906 	return (vmmops_exception(vcpu->cookie, scause));
907 }
908 
909 int
910 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
911 {
912 
913 	return (aplic_attach_to_vm(vm->cookie, descr));
914 }
915 
916 int
917 vm_assert_irq(struct vm *vm, uint32_t irq)
918 {
919 
920 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
921 }
922 
923 int
924 vm_deassert_irq(struct vm *vm, uint32_t irq)
925 {
926 
927 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
928 }
929 
930 int
931 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
932     int func)
933 {
934 
935 	return (aplic_inject_msi(vm->cookie, msg, addr));
936 }
937 
/*
 * Guest executed WFI: idle the vcpu until a wakeup condition holds — a
 * VM suspend request, a pending APLIC interrupt, a pending IPI, any other
 * pending interrupt, or a scheduler demand for the host cpu.  Always
 * resumes in-kernel (*retu = false).
 */
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vm *vm;

	vm = vcpu->vm;
	vcpu_lock(vcpu);
	while (1) {
		if (vm->suspend)
			break;

		if (aplic_check_pending(vcpu->cookie))
			break;

		if (riscv_check_ipi(vcpu->cookie, false))
			break;

		if (riscv_check_interrupts_pending(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;

	return (0);
}
975 
/*
 * Handle a guest (second-stage) page fault by faulting the page into the
 * VM's vmspace.  Returns 0 when the mapping is established (the guest
 * retries the access) or EFAULT if the address cannot be mapped.
 */
static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vm_map *map;
	uint64_t addr;
	pmap_t pmap;
	int ftype, rv;

	vm = vcpu->vm;
	vme = &vcpu->exitinfo;

	pmap = vmspace_pmap(vm_vmspace(vm));
	/* htval holds the faulting GPA >> 2; recover the page address. */
	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);

	dprintf("%s: %lx\n", __func__, addr);

	/* Map the trap cause to the access type being attempted. */
	switch (vme->scause) {
	case SCAUSE_STORE_GUEST_PAGE_FAULT:
		ftype = VM_PROT_WRITE;
		break;
	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
		ftype = VM_PROT_EXECUTE;
		break;
	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
		ftype = VM_PROT_READ;
		break;
	default:
		panic("unknown page trap: %lu", vme->scause);
	}

	/* The page exists, but the page table needs to be updated. */
	if (pmap_fault(pmap, addr, ftype))
		return (0);

	/* Fall back to a full VM fault to bring the page in. */
	map = &vm_vmspace(vm)->vm_map;
	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
	if (rv != KERN_SUCCESS) {
		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
		    __func__, addr, ftype, rv);
		return (EFAULT);
	}

	return (0);
}
1022 
/*
 * A vcpu observed a pending VM suspend: mark itself suspended, then wait
 * for every other active vcpu to do the same before returning to
 * userspace (*retu = true).  Thread-suspension ASTs are honored while
 * waiting so the host process can still be stopped.
 */
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		/* Drop the spin lock before servicing a suspension AST. */
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}
1070 
/*
 * Main vcpu run loop.  Enters the guest via the backend and dispatches
 * vm exits: exits that can be handled in-kernel (MMIO emulation, WFI,
 * stage-2 paging, suspend coordination) loop back into the guest;
 * anything else returns to userspace (retu == true) or propagates an
 * error.  The caller must have activated the vcpu first.
 */
int
vm_run(struct vcpu *vcpu)
{
	struct vm_eventinfo evinfo;
	struct vm_exit *vme;
	struct vm *vm;
	pmap_t pmap;
	int error;
	int vcpuid;
	bool retu;

	vm = vcpu->vm;

	dprintf("%s\n", __func__);

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm_vmspace(vm));
	vme = &vcpu->exitinfo;
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;	/* lets the backend notice suspends */
	evinfo.iptr = NULL;
restart:
	/* No preemption between FPU swap-in and swap-out. */
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			/* Resume after the emulated instruction. */
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;
		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;
		case VM_EXITCODE_ECALL:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc + vme->inst_length;
			retu = true;
			break;
		case VM_EXITCODE_PAGING:
			/* Retry the faulting instruction after the fix-up. */
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;
		case VM_EXITCODE_BOGUS:
			vcpu->nextpc = vme->pc;
			retu = false;
			error = 0;
			break;
		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;
		default:
			/* Handle in userland. */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	return (error);
}
1154