xref: /freebsd/sys/riscv/vmm/vmm.c (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/riscvreg.h>
60 #include <machine/cpu.h>
61 #include <machine/fpe.h>
62 #include <machine/machdep.h>
63 #include <machine/pcb.h>
64 #include <machine/smp.h>
65 #include <machine/vm.h>
66 #include <machine/vmparam.h>
67 #include <machine/vmm.h>
68 #include <machine/vmm_instruction_emul.h>
69 
70 #include <dev/pci/pcireg.h>
71 
72 #include <dev/vmm/vmm_dev.h>
73 #include <dev/vmm/vmm_ktr.h>
74 
75 #include "vmm_stat.h"
76 #include "riscv.h"
77 
78 #include "vmm_aplic.h"
79 
80 struct vcpu {
81 	int		flags;
82 	enum vcpu_state	state;
83 	struct mtx	mtx;
84 	int		hostcpu;	/* host cpuid this vcpu last ran on */
85 	int		vcpuid;
86 	void		*stats;
87 	struct vm_exit	exitinfo;
88 	uint64_t	nextpc;		/* (x) next instruction to execute */
89 	struct vm	*vm;		/* (o) */
90 	void		*cookie;	/* (i) cpu-specific data */
91 	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
92 };
93 
/*
 * Wrappers around the per-vcpu spin mutex; 'v' is a 'struct vcpu *'.
 */
#define	vcpu_lock_initialized(v) mtx_initialized(&(v)->mtx)
#define	vcpu_lock_init(v)	mtx_init(&(v)->mtx, "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&(v)->mtx)
#define	vcpu_lock(v)		mtx_lock_spin(&(v)->mtx)
#define	vcpu_unlock(v)		mtx_unlock_spin(&(v)->mtx)
#define	vcpu_assert_locked(v)	mtx_assert(&(v)->mtx, MA_OWNED)
100 
101 struct mem_seg {
102 	uint64_t	gpa;
103 	size_t		len;
104 	bool		wired;
105 	bool		sysmem;
106 	vm_object_t	object;
107 };
108 #define	VM_MAX_MEMSEGS	3
109 
110 struct mem_map {
111 	vm_paddr_t	gpa;
112 	size_t		len;
113 	vm_ooffset_t	segoff;
114 	int		segid;
115 	int		prot;
116 	int		flags;
117 };
118 #define	VM_MAX_MEMMAPS	4
119 
120 struct vmm_mmio_region {
121 	uint64_t start;
122 	uint64_t end;
123 	mem_region_read_t read;
124 	mem_region_write_t write;
125 };
126 #define	VM_MAX_MMIO_REGIONS	4
127 
128 /*
129  * Initialization:
130  * (o) initialized the first time the VM is created
131  * (i) initialized when VM is created and when it is reinitialized
132  * (x) initialized before use
133  */
134 struct vm {
135 	void		*cookie;		/* (i) cpu-specific data */
136 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
137 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug*/
138 	int		suspend;		/* (i) stop VM execution */
139 	bool		dying;			/* (o) is dying */
140 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
141 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
142 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
143 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
144 	struct vmspace	*vmspace;		/* (o) guest's address space */
145 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
146 	struct vcpu	**vcpu;			/* (i) guest vcpus */
147 	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
148 						/* (o) guest MMIO regions */
149 	/* The following describe the vm cpu topology */
150 	uint16_t	sockets;		/* (o) num of sockets */
151 	uint16_t	cores;			/* (o) num of cores/socket */
152 	uint16_t	threads;		/* (o) num of threads/core */
153 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
154 	struct sx	mem_segs_lock;		/* (o) */
155 	struct sx	vcpus_init_lock;	/* (o) */
156 };
157 
158 static bool vmm_initialized = false;
159 
160 static MALLOC_DEFINE(M_VMM, "vmm", "vmm");
161 
162 /* statistics */
163 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
164 
165 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
166 
167 static int vmm_ipinum;
168 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
169     "IPI vector used for vcpu notifications");
170 
171 u_int vm_maxcpu;
172 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
173     &vm_maxcpu, 0, "Maximum number of vCPUs");
174 
175 static void vm_free_memmap(struct vm *vm, int ident);
176 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
177 static void vcpu_notify_event_locked(struct vcpu *vcpu);
178 
179 /* global statistics */
180 VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
181 VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
182 VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
183 
184 /*
185  * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
186  * is a safe value for now.
187  */
188 #define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
189 
190 static void
191 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
192 {
193 	vmmops_vcpu_cleanup(vcpu->cookie);
194 	vcpu->cookie = NULL;
195 	if (destroy) {
196 		vmm_stat_free(vcpu->stats);
197 		fpu_save_area_free(vcpu->guestfpu);
198 		vcpu_lock_destroy(vcpu);
199 	}
200 }
201 
202 static struct vcpu *
203 vcpu_alloc(struct vm *vm, int vcpu_id)
204 {
205 	struct vcpu *vcpu;
206 
207 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
208 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
209 
210 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
211 	vcpu_lock_init(vcpu);
212 	vcpu->state = VCPU_IDLE;
213 	vcpu->hostcpu = NOCPU;
214 	vcpu->vcpuid = vcpu_id;
215 	vcpu->vm = vm;
216 	vcpu->guestfpu = fpu_save_area_alloc();
217 	vcpu->stats = vmm_stat_alloc();
218 	return (vcpu);
219 }
220 
221 static void
222 vcpu_init(struct vcpu *vcpu)
223 {
224 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
225 	MPASS(vcpu->cookie != NULL);
226 	fpu_save_area_reset(vcpu->guestfpu);
227 	vmm_stat_init(vcpu->stats);
228 }
229 
230 struct vm_exit *
231 vm_exitinfo(struct vcpu *vcpu)
232 {
233 	return (&vcpu->exitinfo);
234 }
235 
236 static int
237 vmm_init(void)
238 {
239 
240 	vm_maxcpu = mp_ncpus;
241 
242 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
243 
244 	if (vm_maxcpu > VM_MAXCPU) {
245 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
246 		vm_maxcpu = VM_MAXCPU;
247 	}
248 
249 	if (vm_maxcpu == 0)
250 		vm_maxcpu = 1;
251 
252 	return (vmmops_modinit());
253 }
254 
255 static int
256 vmm_handler(module_t mod, int what, void *arg)
257 {
258 	int error;
259 
260 	switch (what) {
261 	case MOD_LOAD:
262 		/* TODO: check if has_hyp here? */
263 		error = vmmdev_init();
264 		if (error != 0)
265 			break;
266 		error = vmm_init();
267 		if (error == 0)
268 			vmm_initialized = true;
269 		break;
270 	case MOD_UNLOAD:
271 		/* TODO: check if has_hyp here? */
272 		error = vmmdev_cleanup();
273 		if (error == 0 && vmm_initialized) {
274 			error = vmmops_modcleanup();
275 			if (error)
276 				vmm_initialized = false;
277 		}
278 		break;
279 	default:
280 		error = 0;
281 		break;
282 	}
283 	return (error);
284 }
285 
286 static moduledata_t vmm_kmod = {
287 	"vmm",
288 	vmm_handler,
289 	NULL
290 };
291 
292 /*
293  * vmm initialization has the following dependencies:
294  *
295  * - vmm device initialization requires an initialized devfs.
296  */
297 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY);
298 MODULE_VERSION(vmm, 1);
299 
300 static void
301 vm_init(struct vm *vm, bool create)
302 {
303 	int i;
304 
305 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
306 	MPASS(vm->cookie != NULL);
307 
308 	CPU_ZERO(&vm->active_cpus);
309 	CPU_ZERO(&vm->debug_cpus);
310 
311 	vm->suspend = 0;
312 	CPU_ZERO(&vm->suspended_cpus);
313 
314 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
315 
316 	if (!create) {
317 		for (i = 0; i < vm->maxcpus; i++) {
318 			if (vm->vcpu[i] != NULL)
319 				vcpu_init(vm->vcpu[i]);
320 		}
321 	}
322 }
323 
324 void
325 vm_disable_vcpu_creation(struct vm *vm)
326 {
327 	sx_xlock(&vm->vcpus_init_lock);
328 	vm->dying = true;
329 	sx_xunlock(&vm->vcpus_init_lock);
330 }
331 
332 struct vcpu *
333 vm_alloc_vcpu(struct vm *vm, int vcpuid)
334 {
335 	struct vcpu *vcpu;
336 
337 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
338 		return (NULL);
339 
340 	/* Some interrupt controllers may have a CPU limit */
341 	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
342 		return (NULL);
343 
344 	vcpu = (struct vcpu *)
345 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
346 	if (__predict_true(vcpu != NULL))
347 		return (vcpu);
348 
349 	sx_xlock(&vm->vcpus_init_lock);
350 	vcpu = vm->vcpu[vcpuid];
351 	if (vcpu == NULL && !vm->dying) {
352 		vcpu = vcpu_alloc(vm, vcpuid);
353 		vcpu_init(vcpu);
354 
355 		/*
356 		 * Ensure vCPU is fully created before updating pointer
357 		 * to permit unlocked reads above.
358 		 */
359 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
360 		    (uintptr_t)vcpu);
361 	}
362 	sx_xunlock(&vm->vcpus_init_lock);
363 	return (vcpu);
364 }
365 
366 void
367 vm_slock_vcpus(struct vm *vm)
368 {
369 	sx_slock(&vm->vcpus_init_lock);
370 }
371 
372 void
373 vm_unlock_vcpus(struct vm *vm)
374 {
375 	sx_unlock(&vm->vcpus_init_lock);
376 }
377 
378 int
379 vm_create(const char *name, struct vm **retvm)
380 {
381 	struct vm *vm;
382 	struct vmspace *vmspace;
383 
384 	/*
385 	 * If vmm.ko could not be successfully initialized then don't attempt
386 	 * to create the virtual machine.
387 	 */
388 	if (!vmm_initialized)
389 		return (ENXIO);
390 
391 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
392 		return (EINVAL);
393 
394 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
395 	if (vmspace == NULL)
396 		return (ENOMEM);
397 
398 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
399 	strcpy(vm->name, name);
400 	vm->vmspace = vmspace;
401 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
402 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
403 
404 	vm->sockets = 1;
405 	vm->cores = 1;			/* XXX backwards compatibility */
406 	vm->threads = 1;		/* XXX backwards compatibility */
407 	vm->maxcpus = vm_maxcpu;
408 
409 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
410 	    M_WAITOK | M_ZERO);
411 
412 	vm_init(vm, true);
413 
414 	*retvm = vm;
415 	return (0);
416 }
417 
418 void
419 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
420     uint16_t *threads, uint16_t *maxcpus)
421 {
422 	*sockets = vm->sockets;
423 	*cores = vm->cores;
424 	*threads = vm->threads;
425 	*maxcpus = vm->maxcpus;
426 }
427 
428 uint16_t
429 vm_get_maxcpus(struct vm *vm)
430 {
431 	return (vm->maxcpus);
432 }
433 
434 int
435 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
436     uint16_t threads, uint16_t maxcpus)
437 {
438 	/* Ignore maxcpus. */
439 	if ((sockets * cores * threads) > vm->maxcpus)
440 		return (EINVAL);
441 	vm->sockets = sockets;
442 	vm->cores = cores;
443 	vm->threads = threads;
444 	return(0);
445 }
446 
447 static void
448 vm_cleanup(struct vm *vm, bool destroy)
449 {
450 	struct mem_map *mm;
451 	int i;
452 
453 	aplic_detach_from_vm(vm->cookie);
454 
455 	for (i = 0; i < vm->maxcpus; i++) {
456 		if (vm->vcpu[i] != NULL)
457 			vcpu_cleanup(vm->vcpu[i], destroy);
458 	}
459 
460 	vmmops_cleanup(vm->cookie);
461 
462 	/*
463 	 * System memory is removed from the guest address space only when
464 	 * the VM is destroyed. This is because the mapping remains the same
465 	 * across VM reset.
466 	 *
467 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
468 	 * so those mappings are removed on a VM reset.
469 	 */
470 	if (!destroy) {
471 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
472 			mm = &vm->mem_maps[i];
473 			if (destroy || !sysmem_mapping(vm, mm))
474 				vm_free_memmap(vm, i);
475 		}
476 	}
477 
478 	if (destroy) {
479 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
480 			vm_free_memseg(vm, i);
481 
482 		vmmops_vmspace_free(vm->vmspace);
483 		vm->vmspace = NULL;
484 
485 		for (i = 0; i < vm->maxcpus; i++)
486 			free(vm->vcpu[i], M_VMM);
487 		free(vm->vcpu, M_VMM);
488 		sx_destroy(&vm->vcpus_init_lock);
489 		sx_destroy(&vm->mem_segs_lock);
490 	}
491 }
492 
493 void
494 vm_destroy(struct vm *vm)
495 {
496 
497 	vm_cleanup(vm, true);
498 
499 	free(vm, M_VMM);
500 }
501 
502 int
503 vm_reinit(struct vm *vm)
504 {
505 	int error;
506 
507 	/*
508 	 * A virtual machine can be reset only if all vcpus are suspended.
509 	 */
510 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
511 		vm_cleanup(vm, false);
512 		vm_init(vm, false);
513 		error = 0;
514 	} else {
515 		error = EBUSY;
516 	}
517 
518 	return (error);
519 }
520 
521 const char *
522 vm_name(struct vm *vm)
523 {
524 	return (vm->name);
525 }
526 
527 void
528 vm_slock_memsegs(struct vm *vm)
529 {
530 	sx_slock(&vm->mem_segs_lock);
531 }
532 
533 void
534 vm_xlock_memsegs(struct vm *vm)
535 {
536 	sx_xlock(&vm->mem_segs_lock);
537 }
538 
539 void
540 vm_unlock_memsegs(struct vm *vm)
541 {
542 	sx_unlock(&vm->mem_segs_lock);
543 }
544 
545 /*
546  * Return 'true' if 'gpa' is allocated in the guest address space.
547  *
548  * This function is called in the context of a running vcpu which acts as
549  * an implicit lock on 'vm->mem_maps[]'.
550  */
551 bool
552 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
553 {
554 	struct vm *vm = vcpu->vm;
555 	struct mem_map *mm;
556 	int i;
557 
558 #ifdef INVARIANTS
559 	int hostcpu, state;
560 	state = vcpu_get_state(vcpu, &hostcpu);
561 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
562 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
563 #endif
564 
565 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
566 		mm = &vm->mem_maps[i];
567 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
568 			return (true);		/* 'gpa' is sysmem or devmem */
569 	}
570 
571 	return (false);
572 }
573 
574 int
575 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
576 {
577 	struct mem_seg *seg;
578 	vm_object_t obj;
579 
580 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
581 
582 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
583 		return (EINVAL);
584 
585 	if (len == 0 || (len & PAGE_MASK))
586 		return (EINVAL);
587 
588 	seg = &vm->mem_segs[ident];
589 	if (seg->object != NULL) {
590 		if (seg->len == len && seg->sysmem == sysmem)
591 			return (EEXIST);
592 		else
593 			return (EINVAL);
594 	}
595 
596 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
597 	if (obj == NULL)
598 		return (ENOMEM);
599 
600 	seg->len = len;
601 	seg->object = obj;
602 	seg->sysmem = sysmem;
603 	return (0);
604 }
605 
606 int
607 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
608     vm_object_t *objptr)
609 {
610 	struct mem_seg *seg;
611 
612 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
613 
614 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
615 		return (EINVAL);
616 
617 	seg = &vm->mem_segs[ident];
618 	if (len)
619 		*len = seg->len;
620 	if (sysmem)
621 		*sysmem = seg->sysmem;
622 	if (objptr)
623 		*objptr = seg->object;
624 	return (0);
625 }
626 
627 void
628 vm_free_memseg(struct vm *vm, int ident)
629 {
630 	struct mem_seg *seg;
631 
632 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
633 	    ("%s: invalid memseg ident %d", __func__, ident));
634 
635 	seg = &vm->mem_segs[ident];
636 	if (seg->object != NULL) {
637 		vm_object_deallocate(seg->object);
638 		bzero(seg, sizeof(struct mem_seg));
639 	}
640 }
641 
642 int
643 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
644     size_t len, int prot, int flags)
645 {
646 	struct mem_seg *seg;
647 	struct mem_map *m, *map;
648 	vm_ooffset_t last;
649 	int i, error;
650 
651 	dprintf("%s: gpa %lx first %lx len %lx\n", __func__, gpa, first, len);
652 
653 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
654 		return (EINVAL);
655 
656 	if (flags & ~VM_MEMMAP_F_WIRED)
657 		return (EINVAL);
658 
659 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
660 		return (EINVAL);
661 
662 	seg = &vm->mem_segs[segid];
663 	if (seg->object == NULL)
664 		return (EINVAL);
665 
666 	last = first + len;
667 	if (first < 0 || first >= last || last > seg->len)
668 		return (EINVAL);
669 
670 	if ((gpa | first | last) & PAGE_MASK)
671 		return (EINVAL);
672 
673 	map = NULL;
674 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
675 		m = &vm->mem_maps[i];
676 		if (m->len == 0) {
677 			map = m;
678 			break;
679 		}
680 	}
681 
682 	if (map == NULL)
683 		return (ENOSPC);
684 
685 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
686 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
687 	if (error != KERN_SUCCESS)
688 		return (EFAULT);
689 
690 	vm_object_reference(seg->object);
691 
692 	if (flags & VM_MEMMAP_F_WIRED) {
693 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
694 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
695 		if (error != KERN_SUCCESS) {
696 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
697 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
698 			    EFAULT);
699 		}
700 	}
701 
702 	map->gpa = gpa;
703 	map->len = len;
704 	map->segoff = first;
705 	map->segid = segid;
706 	map->prot = prot;
707 	map->flags = flags;
708 	return (0);
709 }
710 
711 int
712 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
713 {
714 	struct mem_map *m;
715 	int i;
716 
717 	dprintf("%s: gpa %lx len %lx\n", __func__, gpa, len);
718 
719 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
720 		m = &vm->mem_maps[i];
721 		if (m->gpa == gpa && m->len == len) {
722 			vm_free_memmap(vm, i);
723 			return (0);
724 		}
725 	}
726 
727 	return (EINVAL);
728 }
729 
730 int
731 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
732     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
733 {
734 	struct mem_map *mm, *mmnext;
735 	int i;
736 
737 	mmnext = NULL;
738 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
739 		mm = &vm->mem_maps[i];
740 		if (mm->len == 0 || mm->gpa < *gpa)
741 			continue;
742 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
743 			mmnext = mm;
744 	}
745 
746 	if (mmnext != NULL) {
747 		*gpa = mmnext->gpa;
748 		if (segid)
749 			*segid = mmnext->segid;
750 		if (segoff)
751 			*segoff = mmnext->segoff;
752 		if (len)
753 			*len = mmnext->len;
754 		if (prot)
755 			*prot = mmnext->prot;
756 		if (flags)
757 			*flags = mmnext->flags;
758 		return (0);
759 	} else {
760 		return (ENOENT);
761 	}
762 }
763 
764 static void
765 vm_free_memmap(struct vm *vm, int ident)
766 {
767 	struct mem_map *mm;
768 	int error __diagused;
769 
770 	mm = &vm->mem_maps[ident];
771 	if (mm->len) {
772 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
773 		    mm->gpa + mm->len);
774 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
775 		    __func__, error));
776 		bzero(mm, sizeof(struct mem_map));
777 	}
778 }
779 
780 static __inline bool
781 sysmem_mapping(struct vm *vm, struct mem_map *mm)
782 {
783 
784 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
785 		return (true);
786 	else
787 		return (false);
788 }
789 
790 vm_paddr_t
791 vmm_sysmem_maxaddr(struct vm *vm)
792 {
793 	struct mem_map *mm;
794 	vm_paddr_t maxaddr;
795 	int i;
796 
797 	maxaddr = 0;
798 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
799 		mm = &vm->mem_maps[i];
800 		if (sysmem_mapping(vm, mm)) {
801 			if (maxaddr < mm->gpa + mm->len)
802 				maxaddr = mm->gpa + mm->len;
803 		}
804 	}
805 	return (maxaddr);
806 }
807 
808 int
809 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
810     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
811 {
812 	int error;
813 
814 	error = vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
815 
816 	return (error);
817 }
818 
819 void
820 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
821     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
822 {
823 	int i;
824 
825 	for (i = 0; i < nitems(vm->mmio_region); i++) {
826 		if (vm->mmio_region[i].start == 0 &&
827 		    vm->mmio_region[i].end == 0) {
828 			vm->mmio_region[i].start = start;
829 			vm->mmio_region[i].end = start + size;
830 			vm->mmio_region[i].read = mmio_read;
831 			vm->mmio_region[i].write = mmio_write;
832 			return;
833 		}
834 	}
835 
836 	panic("%s: No free MMIO region", __func__);
837 }
838 
839 void
840 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
841 {
842 	int i;
843 
844 	for (i = 0; i < nitems(vm->mmio_region); i++) {
845 		if (vm->mmio_region[i].start == start &&
846 		    vm->mmio_region[i].end == start + size) {
847 			memset(&vm->mmio_region[i], 0,
848 			    sizeof(vm->mmio_region[i]));
849 			return;
850 		}
851 	}
852 
853 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
854 	    start + size);
855 }
856 
857 static int
858 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
859 {
860 	struct vm *vm;
861 	struct vm_exit *vme;
862 	struct vie *vie;
863 	struct hyp *hyp;
864 	uint64_t fault_ipa;
865 	struct vm_guest_paging *paging;
866 	struct vmm_mmio_region *vmr;
867 	int error, i;
868 
869 	vm = vcpu->vm;
870 	hyp = vm->cookie;
871 	if (!hyp->aplic_attached)
872 		goto out_user;
873 
874 	vme = &vcpu->exitinfo;
875 	vie = &vme->u.inst_emul.vie;
876 	paging = &vme->u.inst_emul.paging;
877 
878 	fault_ipa = vme->u.inst_emul.gpa;
879 
880 	vmr = NULL;
881 	for (i = 0; i < nitems(vm->mmio_region); i++) {
882 		if (vm->mmio_region[i].start <= fault_ipa &&
883 		    vm->mmio_region[i].end > fault_ipa) {
884 			vmr = &vm->mmio_region[i];
885 			break;
886 		}
887 	}
888 	if (vmr == NULL)
889 		goto out_user;
890 
891 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
892 	    vmr->read, vmr->write, retu);
893 	return (error);
894 
895 out_user:
896 	*retu = true;
897 	return (0);
898 }
899 
900 int
901 vm_suspend(struct vm *vm, enum vm_suspend_how how)
902 {
903 	int i;
904 
905 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
906 		return (EINVAL);
907 
908 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
909 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
910 		    vm->suspend, how);
911 		return (EALREADY);
912 	}
913 
914 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
915 
916 	/*
917 	 * Notify all active vcpus that they are now suspended.
918 	 */
919 	for (i = 0; i < vm->maxcpus; i++) {
920 		if (CPU_ISSET(i, &vm->active_cpus))
921 			vcpu_notify_event(vm_vcpu(vm, i));
922 	}
923 
924 	return (0);
925 }
926 
927 void
928 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
929 {
930 	struct vm *vm = vcpu->vm;
931 	struct vm_exit *vmexit;
932 
933 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
934 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
935 
936 	vmexit = vm_exitinfo(vcpu);
937 	vmexit->pc = pc;
938 	vmexit->inst_length = 4;
939 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
940 	vmexit->u.suspended.how = vm->suspend;
941 }
942 
943 void
944 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
945 {
946 	struct vm_exit *vmexit;
947 
948 	vmexit = vm_exitinfo(vcpu);
949 	vmexit->pc = pc;
950 	vmexit->inst_length = 4;
951 	vmexit->exitcode = VM_EXITCODE_DEBUG;
952 }
953 
954 int
955 vm_activate_cpu(struct vcpu *vcpu)
956 {
957 	struct vm *vm = vcpu->vm;
958 
959 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
960 		return (EBUSY);
961 
962 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
963 	return (0);
964 
965 }
966 
967 int
968 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
969 {
970 	if (vcpu == NULL) {
971 		vm->debug_cpus = vm->active_cpus;
972 		for (int i = 0; i < vm->maxcpus; i++) {
973 			if (CPU_ISSET(i, &vm->active_cpus))
974 				vcpu_notify_event(vm_vcpu(vm, i));
975 		}
976 	} else {
977 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
978 			return (EINVAL);
979 
980 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
981 		vcpu_notify_event(vcpu);
982 	}
983 	return (0);
984 }
985 
986 int
987 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
988 {
989 
990 	if (vcpu == NULL) {
991 		CPU_ZERO(&vm->debug_cpus);
992 	} else {
993 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
994 			return (EINVAL);
995 
996 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
997 	}
998 	return (0);
999 }
1000 
1001 int
1002 vcpu_debugged(struct vcpu *vcpu)
1003 {
1004 
1005 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1006 }
1007 
1008 cpuset_t
1009 vm_active_cpus(struct vm *vm)
1010 {
1011 
1012 	return (vm->active_cpus);
1013 }
1014 
1015 cpuset_t
1016 vm_debug_cpus(struct vm *vm)
1017 {
1018 
1019 	return (vm->debug_cpus);
1020 }
1021 
1022 cpuset_t
1023 vm_suspended_cpus(struct vm *vm)
1024 {
1025 
1026 	return (vm->suspended_cpus);
1027 }
1028 
1029 
1030 void *
1031 vcpu_stats(struct vcpu *vcpu)
1032 {
1033 
1034 	return (vcpu->stats);
1035 }
1036 
1037 /*
1038  * This function is called to ensure that a vcpu "sees" a pending event
1039  * as soon as possible:
1040  * - If the vcpu thread is sleeping then it is woken up.
1041  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1042  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1043  */
1044 static void
1045 vcpu_notify_event_locked(struct vcpu *vcpu)
1046 {
1047 	int hostcpu;
1048 
1049 	hostcpu = vcpu->hostcpu;
1050 	if (vcpu->state == VCPU_RUNNING) {
1051 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1052 		if (hostcpu != curcpu) {
1053 			ipi_cpu(hostcpu, vmm_ipinum);
1054 		} else {
1055 			/*
1056 			 * If the 'vcpu' is running on 'curcpu' then it must
1057 			 * be sending a notification to itself (e.g. SELF_IPI).
1058 			 * The pending event will be picked up when the vcpu
1059 			 * transitions back to guest context.
1060 			 */
1061 		}
1062 	} else {
1063 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1064 		    "with hostcpu %d", vcpu->state, hostcpu));
1065 		if (vcpu->state == VCPU_SLEEPING)
1066 			wakeup_one(vcpu);
1067 	}
1068 }
1069 
/*
 * Locked wrapper around vcpu_notify_event_locked().
 */
void
vcpu_notify_event(struct vcpu *vcpu)
{

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}
1077 
1078 static void
1079 restore_guest_fpustate(struct vcpu *vcpu)
1080 {
1081 
1082 	/* Flush host state to the pcb. */
1083 	fpe_state_save(curthread);
1084 
1085 	/* Ensure the VFP state will be re-loaded when exiting the guest. */
1086 	PCPU_SET(fpcurthread, NULL);
1087 
1088 	/* restore guest FPU state */
1089 	fpe_enable();
1090 	fpe_restore(vcpu->guestfpu);
1091 
1092 	/*
1093 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1094 	 * to trap any access to the FPU by the host.
1095 	 */
1096 	fpe_disable();
1097 }
1098 
1099 static void
1100 save_guest_fpustate(struct vcpu *vcpu)
1101 {
1102 
1103 	/* Save guest FPE state. */
1104 	fpe_enable();
1105 	fpe_store(vcpu->guestfpu);
1106 	fpe_disable();
1107 
1108 	KASSERT(PCPU_GET(fpcurthread) == NULL,
1109 	    ("%s: fpcurthread set with guest registers", __func__));
1110 }
1111 
1112 static int
1113 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1114     bool from_idle)
1115 {
1116 	int error;
1117 
1118 	vcpu_assert_locked(vcpu);
1119 
1120 	/*
1121 	 * State transitions from the vmmdev_ioctl() must always begin from
1122 	 * the VCPU_IDLE state. This guarantees that there is only a single
1123 	 * ioctl() operating on a vcpu at any point.
1124 	 */
1125 	if (from_idle) {
1126 		while (vcpu->state != VCPU_IDLE) {
1127 			vcpu_notify_event_locked(vcpu);
1128 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat",
1129 			    hz / 1000);
1130 		}
1131 	} else {
1132 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1133 		    "vcpu idle state"));
1134 	}
1135 
1136 	if (vcpu->state == VCPU_RUNNING) {
1137 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1138 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1139 	} else {
1140 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1141 		    "vcpu that is not running", vcpu->hostcpu));
1142 	}
1143 
1144 	/*
1145 	 * The following state transitions are allowed:
1146 	 * IDLE -> FROZEN -> IDLE
1147 	 * FROZEN -> RUNNING -> FROZEN
1148 	 * FROZEN -> SLEEPING -> FROZEN
1149 	 */
1150 	switch (vcpu->state) {
1151 	case VCPU_IDLE:
1152 	case VCPU_RUNNING:
1153 	case VCPU_SLEEPING:
1154 		error = (newstate != VCPU_FROZEN);
1155 		break;
1156 	case VCPU_FROZEN:
1157 		error = (newstate == VCPU_FROZEN);
1158 		break;
1159 	default:
1160 		error = 1;
1161 		break;
1162 	}
1163 
1164 	if (error)
1165 		return (EBUSY);
1166 
1167 	vcpu->state = newstate;
1168 	if (newstate == VCPU_RUNNING)
1169 		vcpu->hostcpu = curcpu;
1170 	else
1171 		vcpu->hostcpu = NOCPU;
1172 
1173 	if (newstate == VCPU_IDLE)
1174 		wakeup(&vcpu->state);
1175 
1176 	return (0);
1177 }
1178 
1179 static void
1180 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1181 {
1182 	int error;
1183 
1184 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1185 		panic("Error %d setting state to %d\n", error, newstate);
1186 }
1187 
1188 static void
1189 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1190 {
1191 	int error;
1192 
1193 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1194 		panic("Error %d setting state to %d", error, newstate);
1195 }
1196 
1197 int
1198 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1199 {
1200 
1201 	if (type < 0 || type >= VM_CAP_MAX)
1202 		return (EINVAL);
1203 
1204 	return (vmmops_getcap(vcpu->cookie, type, retval));
1205 }
1206 
1207 int
1208 vm_set_capability(struct vcpu *vcpu, int type, int val)
1209 {
1210 
1211 	if (type < 0 || type >= VM_CAP_MAX)
1212 		return (EINVAL);
1213 
1214 	return (vmmops_setcap(vcpu->cookie, type, val));
1215 }
1216 
1217 struct vm *
1218 vcpu_vm(struct vcpu *vcpu)
1219 {
1220 
1221 	return (vcpu->vm);
1222 }
1223 
1224 int
1225 vcpu_vcpuid(struct vcpu *vcpu)
1226 {
1227 
1228 	return (vcpu->vcpuid);
1229 }
1230 
1231 void *
1232 vcpu_get_cookie(struct vcpu *vcpu)
1233 {
1234 
1235 	return (vcpu->cookie);
1236 }
1237 
1238 struct vcpu *
1239 vm_vcpu(struct vm *vm, int vcpuid)
1240 {
1241 
1242 	return (vm->vcpu[vcpuid]);
1243 }
1244 
1245 int
1246 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1247 {
1248 	int error;
1249 
1250 	vcpu_lock(vcpu);
1251 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1252 	vcpu_unlock(vcpu);
1253 
1254 	return (error);
1255 }
1256 
1257 enum vcpu_state
1258 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1259 {
1260 	enum vcpu_state state;
1261 
1262 	vcpu_lock(vcpu);
1263 	state = vcpu->state;
1264 	if (hostcpu != NULL)
1265 		*hostcpu = vcpu->hostcpu;
1266 	vcpu_unlock(vcpu);
1267 
1268 	return (state);
1269 }
1270 
1271 static void *
1272 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1273     void **cookie)
1274 {
1275 	int i, count, pageoff;
1276 	struct mem_map *mm;
1277 	vm_page_t m;
1278 
1279 	pageoff = gpa & PAGE_MASK;
1280 	if (len > PAGE_SIZE - pageoff)
1281 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1282 
1283 	count = 0;
1284 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1285 		mm = &vm->mem_maps[i];
1286 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1287 		    gpa < mm->gpa + mm->len) {
1288 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1289 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1290 			break;
1291 		}
1292 	}
1293 
1294 	if (count == 1) {
1295 		*cookie = m;
1296 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1297 	} else {
1298 		*cookie = NULL;
1299 		return (NULL);
1300 	}
1301 }
1302 
1303 void *
1304 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1305 	    void **cookie)
1306 {
1307 #ifdef INVARIANTS
1308 	/*
1309 	 * The current vcpu should be frozen to ensure 'vm_memmap[]'
1310 	 * stability.
1311 	 */
1312 	int state = vcpu_get_state(vcpu, NULL);
1313 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1314 	    __func__, state));
1315 #endif
1316 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1317 }
1318 
1319 void *
1320 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1321     void **cookie)
1322 {
1323 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1324 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1325 }
1326 
1327 void
1328 vm_gpa_release(void *cookie)
1329 {
1330 	vm_page_t m = cookie;
1331 
1332 	vm_page_unwire(m, PQ_ACTIVE);
1333 }
1334 
1335 int
1336 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1337 {
1338 
1339 	if (reg >= VM_REG_LAST)
1340 		return (EINVAL);
1341 
1342 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1343 }
1344 
1345 int
1346 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1347 {
1348 	int error;
1349 
1350 	if (reg >= VM_REG_LAST)
1351 		return (EINVAL);
1352 	error = vmmops_setreg(vcpu->cookie, reg, val);
1353 	if (error || reg != VM_REG_GUEST_SEPC)
1354 		return (error);
1355 
1356 	vcpu->nextpc = val;
1357 
1358 	return (0);
1359 }
1360 
1361 void *
1362 vm_get_cookie(struct vm *vm)
1363 {
1364 
1365 	return (vm->cookie);
1366 }
1367 
1368 int
1369 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
1370 {
1371 
1372 	return (vmmops_exception(vcpu->cookie, scause));
1373 }
1374 
1375 int
1376 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
1377 {
1378 
1379 	return (aplic_attach_to_vm(vm->cookie, descr));
1380 }
1381 
1382 int
1383 vm_assert_irq(struct vm *vm, uint32_t irq)
1384 {
1385 
1386 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
1387 }
1388 
1389 int
1390 vm_deassert_irq(struct vm *vm, uint32_t irq)
1391 {
1392 
1393 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
1394 }
1395 
1396 int
1397 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1398     int func)
1399 {
1400 
1401 	return (aplic_inject_msi(vm->cookie, msg, addr));
1402 }
1403 
1404 static int
1405 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1406 {
1407 
1408 	vcpu_lock(vcpu);
1409 
1410 	while (1) {
1411 		if (aplic_check_pending(vcpu->cookie))
1412 			break;
1413 
1414 		if (riscv_check_ipi(vcpu->cookie, false))
1415 			break;
1416 
1417 		if (vcpu_should_yield(vcpu))
1418 			break;
1419 
1420 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1421 		/*
1422 		 * XXX msleep_spin() cannot be interrupted by signals so
1423 		 * wake up periodically to check pending signals.
1424 		 */
1425 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz / 1000);
1426 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1427 	}
1428 	vcpu_unlock(vcpu);
1429 
1430 	*retu = false;
1431 
1432 	return (0);
1433 }
1434 
1435 static int
1436 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1437 {
1438 	struct vm *vm;
1439 	struct vm_exit *vme;
1440 	struct vm_map *map;
1441 	uint64_t addr;
1442 	pmap_t pmap;
1443 	int ftype, rv;
1444 
1445 	vm = vcpu->vm;
1446 	vme = &vcpu->exitinfo;
1447 
1448 	pmap = vmspace_pmap(vm->vmspace);
1449 	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);
1450 
1451 	dprintf("%s: %lx\n", __func__, addr);
1452 
1453 	switch (vme->scause) {
1454 	case SCAUSE_STORE_GUEST_PAGE_FAULT:
1455 		ftype = VM_PROT_WRITE;
1456 		break;
1457 	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
1458 		ftype = VM_PROT_EXECUTE;
1459 		break;
1460 	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
1461 		ftype = VM_PROT_READ;
1462 		break;
1463 	default:
1464 		panic("unknown page trap: %lu", vme->scause);
1465 	}
1466 
1467 	/* The page exists, but the page table needs to be updated. */
1468 	if (pmap_fault(pmap, addr, ftype))
1469 		return (0);
1470 
1471 	map = &vm->vmspace->vm_map;
1472 	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
1473 	if (rv != KERN_SUCCESS) {
1474 		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
1475 		    __func__, addr, ftype, rv);
1476 		return (EFAULT);
1477 	}
1478 
1479 	return (0);
1480 }
1481 
1482 static int
1483 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1484 {
1485 	struct vm *vm = vcpu->vm;
1486 	int error, i;
1487 	struct thread *td;
1488 
1489 	error = 0;
1490 	td = curthread;
1491 
1492 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1493 
1494 	/*
1495 	 * Wait until all 'active_cpus' have suspended themselves.
1496 	 *
1497 	 * Since a VM may be suspended at any time including when one or
1498 	 * more vcpus are doing a rendezvous we need to call the rendezvous
1499 	 * handler while we are waiting to prevent a deadlock.
1500 	 */
1501 	vcpu_lock(vcpu);
1502 	while (error == 0) {
1503 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1504 			break;
1505 
1506 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1507 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1508 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1509 		if (td_ast_pending(td, TDA_SUSPEND)) {
1510 			vcpu_unlock(vcpu);
1511 			error = thread_check_susp(td, false);
1512 			vcpu_lock(vcpu);
1513 		}
1514 	}
1515 	vcpu_unlock(vcpu);
1516 
1517 	/*
1518 	 * Wakeup the other sleeping vcpus and return to userspace.
1519 	 */
1520 	for (i = 0; i < vm->maxcpus; i++) {
1521 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1522 			vcpu_notify_event(vm_vcpu(vm, i));
1523 		}
1524 	}
1525 
1526 	*retu = true;
1527 	return (error);
1528 }
1529 
1530 int
1531 vm_run(struct vcpu *vcpu)
1532 {
1533 	struct vm_eventinfo evinfo;
1534 	struct vm_exit *vme;
1535 	struct vm *vm;
1536 	pmap_t pmap;
1537 	int error;
1538 	int vcpuid;
1539 	bool retu;
1540 
1541 	vm = vcpu->vm;
1542 
1543 	dprintf("%s\n", __func__);
1544 
1545 	vcpuid = vcpu->vcpuid;
1546 
1547 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1548 		return (EINVAL);
1549 
1550 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1551 		return (EINVAL);
1552 
1553 	pmap = vmspace_pmap(vm->vmspace);
1554 	vme = &vcpu->exitinfo;
1555 	evinfo.rptr = NULL;
1556 	evinfo.sptr = &vm->suspend;
1557 	evinfo.iptr = NULL;
1558 restart:
1559 	critical_enter();
1560 
1561 	restore_guest_fpustate(vcpu);
1562 
1563 	vcpu_require_state(vcpu, VCPU_RUNNING);
1564 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1565 	vcpu_require_state(vcpu, VCPU_FROZEN);
1566 
1567 	save_guest_fpustate(vcpu);
1568 
1569 	critical_exit();
1570 
1571 	if (error == 0) {
1572 		retu = false;
1573 		switch (vme->exitcode) {
1574 		case VM_EXITCODE_INST_EMUL:
1575 			vcpu->nextpc = vme->pc + vme->inst_length;
1576 			error = vm_handle_inst_emul(vcpu, &retu);
1577 			break;
1578 		case VM_EXITCODE_WFI:
1579 			vcpu->nextpc = vme->pc + vme->inst_length;
1580 			error = vm_handle_wfi(vcpu, vme, &retu);
1581 			break;
1582 		case VM_EXITCODE_ECALL:
1583 			/* Handle in userland. */
1584 			vcpu->nextpc = vme->pc + vme->inst_length;
1585 			retu = true;
1586 			break;
1587 		case VM_EXITCODE_PAGING:
1588 			vcpu->nextpc = vme->pc;
1589 			error = vm_handle_paging(vcpu, &retu);
1590 			break;
1591 		case VM_EXITCODE_BOGUS:
1592 			vcpu->nextpc = vme->pc;
1593 			retu = false;
1594 			error = 0;
1595 			break;
1596 		case VM_EXITCODE_SUSPENDED:
1597 			vcpu->nextpc = vme->pc;
1598 			error = vm_handle_suspend(vcpu, &retu);
1599 			break;
1600 		default:
1601 			/* Handle in userland. */
1602 			vcpu->nextpc = vme->pc;
1603 			retu = true;
1604 			break;
1605 		}
1606 	}
1607 
1608 	if (error == 0 && retu == false)
1609 		goto restart;
1610 
1611 	return (error);
1612 }
1613