xref: /freebsd/sys/riscv/vmm/vmm.c (revision 7ebc7d1ab76b9d06be9400d6c9fc74fcc43603a1)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/riscvreg.h>
60 #include <machine/cpu.h>
61 #include <machine/fpe.h>
62 #include <machine/machdep.h>
63 #include <machine/pcb.h>
64 #include <machine/smp.h>
65 #include <machine/vm.h>
66 #include <machine/vmparam.h>
67 #include <machine/vmm.h>
68 #include <machine/vmm_instruction_emul.h>
69 
70 #include <dev/pci/pcireg.h>
71 
72 #include <dev/vmm/vmm_dev.h>
73 #include <dev/vmm/vmm_ktr.h>
74 
75 #include "vmm_stat.h"
76 #include "riscv.h"
77 
78 #include "vmm_aplic.h"
79 
80 struct vcpu {
81 	int		flags;
82 	enum vcpu_state	state;
83 	struct mtx	mtx;
84 	int		hostcpu;	/* host cpuid this vcpu last ran on */
85 	int		vcpuid;
86 	void		*stats;
87 	struct vm_exit	exitinfo;
88 	uint64_t	nextpc;		/* (x) next instruction to execute */
89 	struct vm	*vm;		/* (o) */
90 	void		*cookie;	/* (i) cpu-specific data */
91 	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
92 };
93 
94 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
95 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
96 #define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
97 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
98 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
99 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
100 
101 struct mem_seg {
102 	uint64_t	gpa;
103 	size_t		len;
104 	bool		wired;
105 	bool		sysmem;
106 	vm_object_t	object;
107 };
108 #define	VM_MAX_MEMSEGS	3
109 
110 struct mem_map {
111 	vm_paddr_t	gpa;
112 	size_t		len;
113 	vm_ooffset_t	segoff;
114 	int		segid;
115 	int		prot;
116 	int		flags;
117 };
118 #define	VM_MAX_MEMMAPS	4
119 
120 struct vmm_mmio_region {
121 	uint64_t start;
122 	uint64_t end;
123 	mem_region_read_t read;
124 	mem_region_write_t write;
125 };
126 #define	VM_MAX_MMIO_REGIONS	4
127 
128 /*
129  * Initialization:
130  * (o) initialized the first time the VM is created
131  * (i) initialized when VM is created and when it is reinitialized
132  * (x) initialized before use
133  */
134 struct vm {
135 	void		*cookie;		/* (i) cpu-specific data */
136 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
137 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
138 	int		suspend;		/* (i) stop VM execution */
139 	bool		dying;			/* (o) is dying */
140 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
141 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
142 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
143 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
144 	struct vmspace	*vmspace;		/* (o) guest's address space */
145 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
146 	struct vcpu	**vcpu;			/* (i) guest vcpus */
147 	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
148 						/* (o) guest MMIO regions */
149 	/* The following describe the vm cpu topology */
150 	uint16_t	sockets;		/* (o) num of sockets */
151 	uint16_t	cores;			/* (o) num of cores/socket */
152 	uint16_t	threads;		/* (o) num of threads/core */
153 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
154 	struct sx	mem_segs_lock;		/* (o) */
155 	struct sx	vcpus_init_lock;	/* (o) */
156 };
157 
158 static bool vmm_initialized = false;
159 
160 static MALLOC_DEFINE(M_VMM, "vmm", "vmm");
161 
162 /* statistics */
163 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
164 
165 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
166 
167 static int vmm_ipinum;
168 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
169     "IPI vector used for vcpu notifications");
170 
171 u_int vm_maxcpu;
172 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
173     &vm_maxcpu, 0, "Maximum number of vCPUs");
174 
175 static void vm_free_memmap(struct vm *vm, int ident);
176 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
177 static void vcpu_notify_event_locked(struct vcpu *vcpu);
178 
179 /*
180  * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
181  * is a safe value for now.
182  */
183 #define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
184 
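/*
 * Release the backend state of a vcpu.  When 'destroy' is true the vcpu is
 * being torn down for good, so its statistics buffer, FPU save area and lock
 * are freed as well; otherwise only the per-vcpu backend cookie is released
 * so the vcpu can be reinitialized on a VM reset.
 */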
185 static void
186 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
187 {
188 	vmmops_vcpu_cleanup(vcpu->cookie);
189 	vcpu->cookie = NULL;
190 	if (destroy) {
191 		vmm_stat_free(vcpu->stats);
192 		fpu_save_area_free(vcpu->guestfpu);
193 		vcpu_lock_destroy(vcpu);
194 	}
195 }
196 
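/*
 * Allocate and minimally initialize a vcpu structure.  The backend-specific
 * state is set up separately by vcpu_init().
 */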
197 static struct vcpu *
198 vcpu_alloc(struct vm *vm, int vcpu_id)
199 {
200 	struct vcpu *vcpu;
201 
202 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
203 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
204 
205 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
206 	vcpu_lock_init(vcpu);
207 	vcpu->state = VCPU_IDLE;
208 	vcpu->hostcpu = NOCPU;
209 	vcpu->vcpuid = vcpu_id;
210 	vcpu->vm = vm;
211 	vcpu->guestfpu = fpu_save_area_alloc();
212 	vcpu->stats = vmm_stat_alloc();
213 	return (vcpu);
214 }
215 
216 static void
217 vcpu_init(struct vcpu *vcpu)
218 {
219 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
220 	MPASS(vcpu->cookie != NULL);
221 	fpu_save_area_reset(vcpu->guestfpu);
222 	vmm_stat_init(vcpu->stats);
223 }
224 
225 struct vm_exit *
226 vm_exitinfo(struct vcpu *vcpu)
227 {
228 	return (&vcpu->exitinfo);
229 }
230 
231 static int
232 vmm_init(void)
233 {
234 
235 	vm_maxcpu = mp_ncpus;
236 
237 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
238 
239 	if (vm_maxcpu > VM_MAXCPU) {
240 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
241 		vm_maxcpu = VM_MAXCPU;
242 	}
243 
244 	if (vm_maxcpu == 0)
245 		vm_maxcpu = 1;
246 
247 	return (vmmops_modinit());
248 }
249 
250 static int
251 vmm_handler(module_t mod, int what, void *arg)
252 {
253 	int error;
254 
255 	switch (what) {
256 	case MOD_LOAD:
257 		/* TODO: check if has_hyp here? */
258 		vmmdev_init();
259 		error = vmm_init();
260 		if (error == 0)
261 			vmm_initialized = true;
262 		break;
263 	case MOD_UNLOAD:
264 		/* TODO: check if has_hyp here? */
265 		error = vmmdev_cleanup();
266 		if (error == 0 && vmm_initialized) {
267 			error = vmmops_modcleanup();
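			/*
			 * If the backend cleanup failed the module cannot be
			 * unloaded, so clear vmm_initialized to prevent new
			 * VMs from being created in this state.
			 */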
268 			if (error)
269 				vmm_initialized = false;
270 		}
271 		break;
272 	default:
273 		error = 0;
274 		break;
275 	}
276 	return (error);
277 }
278 
279 static moduledata_t vmm_kmod = {
280 	"vmm",
281 	vmm_handler,
282 	NULL
283 };
284 
285 /*
286  * vmm initialization has the following dependencies:
287  *
288  * - HYP initialization requires smp_rendezvous() and therefore must happen
289  *   after SMP is fully functional (after SI_SUB_SMP).
290  */
291 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
292 MODULE_VERSION(vmm, 1);
293 
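/*
 * Initialize (or reinitialize) the backend state of a VM.  When 'create' is
 * false this is a VM reset and any vcpus that already exist are reinitialized
 * as well.
 */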
294 static void
295 vm_init(struct vm *vm, bool create)
296 {
297 	int i;
298 
299 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
300 	MPASS(vm->cookie != NULL);
301 
302 	CPU_ZERO(&vm->active_cpus);
303 	CPU_ZERO(&vm->debug_cpus);
304 
305 	vm->suspend = 0;
306 	CPU_ZERO(&vm->suspended_cpus);
307 
308 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
309 
310 	if (!create) {
311 		for (i = 0; i < vm->maxcpus; i++) {
312 			if (vm->vcpu[i] != NULL)
313 				vcpu_init(vm->vcpu[i]);
314 		}
315 	}
316 }
317 
318 void
319 vm_disable_vcpu_creation(struct vm *vm)
320 {
321 	sx_xlock(&vm->vcpus_init_lock);
322 	vm->dying = true;
323 	sx_xunlock(&vm->vcpus_init_lock);
324 }
325 
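/*
 * Return the vcpu with the given id, creating it on first use.  The common
 * case is a lockless read of the vcpu pointer; creation is serialized by
 * 'vcpus_init_lock' and is refused once the VM has been marked as dying.
 */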
326 struct vcpu *
327 vm_alloc_vcpu(struct vm *vm, int vcpuid)
328 {
329 	struct vcpu *vcpu;
330 
331 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
332 		return (NULL);
333 
334 	/* Some interrupt controllers may have a CPU limit */
335 	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
336 		return (NULL);
337 
338 	vcpu = (struct vcpu *)
339 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
340 	if (__predict_true(vcpu != NULL))
341 		return (vcpu);
342 
343 	sx_xlock(&vm->vcpus_init_lock);
344 	vcpu = vm->vcpu[vcpuid];
345 	if (vcpu == NULL && !vm->dying) {
346 		vcpu = vcpu_alloc(vm, vcpuid);
347 		vcpu_init(vcpu);
348 
349 		/*
350 		 * Ensure vCPU is fully created before updating pointer
351 		 * to permit unlocked reads above.
352 		 */
353 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
354 		    (uintptr_t)vcpu);
355 	}
356 	sx_xunlock(&vm->vcpus_init_lock);
357 	return (vcpu);
358 }
359 
360 void
361 vm_slock_vcpus(struct vm *vm)
362 {
363 	sx_slock(&vm->vcpus_init_lock);
364 }
365 
366 void
367 vm_unlock_vcpus(struct vm *vm)
368 {
369 	sx_unlock(&vm->vcpus_init_lock);
370 }
371 
372 int
373 vm_create(const char *name, struct vm **retvm)
374 {
375 	struct vm *vm;
376 	struct vmspace *vmspace;
377 
378 	/*
379 	 * If vmm.ko could not be successfully initialized then don't attempt
380 	 * to create the virtual machine.
381 	 */
382 	if (!vmm_initialized)
383 		return (ENXIO);
384 
385 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
386 		return (EINVAL);
387 
388 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
389 	if (vmspace == NULL)
390 		return (ENOMEM);
391 
392 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
393 	strcpy(vm->name, name);
394 	vm->vmspace = vmspace;
395 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
396 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
397 
398 	vm->sockets = 1;
399 	vm->cores = 1;			/* XXX backwards compatibility */
400 	vm->threads = 1;		/* XXX backwards compatibility */
401 	vm->maxcpus = vm_maxcpu;
402 
403 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
404 	    M_WAITOK | M_ZERO);
405 
406 	vm_init(vm, true);
407 
408 	*retvm = vm;
409 	return (0);
410 }
411 
412 void
413 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
414     uint16_t *threads, uint16_t *maxcpus)
415 {
416 	*sockets = vm->sockets;
417 	*cores = vm->cores;
418 	*threads = vm->threads;
419 	*maxcpus = vm->maxcpus;
420 }
421 
422 uint16_t
423 vm_get_maxcpus(struct vm *vm)
424 {
425 	return (vm->maxcpus);
426 }
427 
428 int
429 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
430     uint16_t threads, uint16_t maxcpus)
431 {
432 	/* Ignore maxcpus. */
433 	if ((sockets * cores * threads) > vm->maxcpus)
434 		return (EINVAL);
435 	vm->sockets = sockets;
436 	vm->cores = cores;
437 	vm->threads = threads;
438 	return (0);
439 }
440 
441 static void
442 vm_cleanup(struct vm *vm, bool destroy)
443 {
444 	struct mem_map *mm;
445 	int i;
446 
447 	aplic_detach_from_vm(vm->cookie);
448 
449 	for (i = 0; i < vm->maxcpus; i++) {
450 		if (vm->vcpu[i] != NULL)
451 			vcpu_cleanup(vm->vcpu[i], destroy);
452 	}
453 
454 	vmmops_cleanup(vm->cookie);
455 
456 	/*
457 	 * System memory is removed from the guest address space only when
458 	 * the VM is destroyed. This is because the mapping remains the same
459 	 * across VM reset.
460 	 *
461 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
462 	 * so those mappings are removed on a VM reset.
463 	 */
464 	if (!destroy) {
465 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
466 			mm = &vm->mem_maps[i];
467 			if (destroy || !sysmem_mapping(vm, mm))
468 				vm_free_memmap(vm, i);
469 		}
470 	}
471 
472 	if (destroy) {
473 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
474 			vm_free_memseg(vm, i);
475 
476 		vmmops_vmspace_free(vm->vmspace);
477 		vm->vmspace = NULL;
478 
479 		for (i = 0; i < vm->maxcpus; i++)
480 			free(vm->vcpu[i], M_VMM);
481 		free(vm->vcpu, M_VMM);
482 		sx_destroy(&vm->vcpus_init_lock);
483 		sx_destroy(&vm->mem_segs_lock);
484 	}
485 }
486 
487 void
488 vm_destroy(struct vm *vm)
489 {
490 
491 	vm_cleanup(vm, true);
492 
493 	free(vm, M_VMM);
494 }
495 
496 int
497 vm_reinit(struct vm *vm)
498 {
499 	int error;
500 
501 	/*
502 	 * A virtual machine can be reset only if all vcpus are suspended.
503 	 */
504 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
505 		vm_cleanup(vm, false);
506 		vm_init(vm, false);
507 		error = 0;
508 	} else {
509 		error = EBUSY;
510 	}
511 
512 	return (error);
513 }
514 
515 const char *
516 vm_name(struct vm *vm)
517 {
518 	return (vm->name);
519 }
520 
521 void
522 vm_slock_memsegs(struct vm *vm)
523 {
524 	sx_slock(&vm->mem_segs_lock);
525 }
526 
527 void
528 vm_xlock_memsegs(struct vm *vm)
529 {
530 	sx_xlock(&vm->mem_segs_lock);
531 }
532 
533 void
534 vm_unlock_memsegs(struct vm *vm)
535 {
536 	sx_unlock(&vm->mem_segs_lock);
537 }
538 
539 /*
540  * Return 'true' if 'gpa' is allocated in the guest address space.
541  *
542  * This function is called in the context of a running vcpu which acts as
543  * an implicit lock on 'vm->mem_maps[]'.
544  */
545 bool
546 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
547 {
548 	struct vm *vm = vcpu->vm;
549 	struct mem_map *mm;
550 	int i;
551 
552 #ifdef INVARIANTS
553 	int hostcpu, state;
554 	state = vcpu_get_state(vcpu, &hostcpu);
555 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
556 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
557 #endif
558 
559 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
560 		mm = &vm->mem_maps[i];
561 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
562 			return (true);		/* 'gpa' is sysmem or devmem */
563 	}
564 
565 	return (false);
566 }
567 
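/*
 * Allocate a VM object of 'len' bytes to back memory segment 'ident'.  The
 * segment is not visible to the guest until it is mapped into the guest
 * address space with vm_mmap_memseg().
 */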
568 int
569 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
570 {
571 	struct mem_seg *seg;
572 	vm_object_t obj;
573 
574 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
575 
576 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
577 		return (EINVAL);
578 
579 	if (len == 0 || (len & PAGE_MASK))
580 		return (EINVAL);
581 
582 	seg = &vm->mem_segs[ident];
583 	if (seg->object != NULL) {
584 		if (seg->len == len && seg->sysmem == sysmem)
585 			return (EEXIST);
586 		else
587 			return (EINVAL);
588 	}
589 
590 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
591 	if (obj == NULL)
592 		return (ENOMEM);
593 
594 	seg->len = len;
595 	seg->object = obj;
596 	seg->sysmem = sysmem;
597 	return (0);
598 }
599 
600 int
601 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
602     vm_object_t *objptr)
603 {
604 	struct mem_seg *seg;
605 
606 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
607 
608 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
609 		return (EINVAL);
610 
611 	seg = &vm->mem_segs[ident];
612 	if (len)
613 		*len = seg->len;
614 	if (sysmem)
615 		*sysmem = seg->sysmem;
616 	if (objptr)
617 		*objptr = seg->object;
618 	return (0);
619 }
620 
621 void
622 vm_free_memseg(struct vm *vm, int ident)
623 {
624 	struct mem_seg *seg;
625 
626 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
627 	    ("%s: invalid memseg ident %d", __func__, ident));
628 
629 	seg = &vm->mem_segs[ident];
630 	if (seg->object != NULL) {
631 		vm_object_deallocate(seg->object);
632 		bzero(seg, sizeof(struct mem_seg));
633 	}
634 }
635 
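/*
 * Map a page-aligned range of a previously allocated memory segment into the
 * guest address space at 'gpa', optionally wiring the backing pages.
 */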
636 int
637 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
638     size_t len, int prot, int flags)
639 {
640 	struct mem_seg *seg;
641 	struct mem_map *m, *map;
642 	vm_ooffset_t last;
643 	int i, error;
644 
645 	dprintf("%s: gpa %lx first %lx len %lx\n", __func__, gpa, first, len);
646 
647 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
648 		return (EINVAL);
649 
650 	if (flags & ~VM_MEMMAP_F_WIRED)
651 		return (EINVAL);
652 
653 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
654 		return (EINVAL);
655 
656 	seg = &vm->mem_segs[segid];
657 	if (seg->object == NULL)
658 		return (EINVAL);
659 
660 	last = first + len;
661 	if (first < 0 || first >= last || last > seg->len)
662 		return (EINVAL);
663 
664 	if ((gpa | first | last) & PAGE_MASK)
665 		return (EINVAL);
666 
667 	map = NULL;
668 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
669 		m = &vm->mem_maps[i];
670 		if (m->len == 0) {
671 			map = m;
672 			break;
673 		}
674 	}
675 
676 	if (map == NULL)
677 		return (ENOSPC);
678 
679 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
680 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
681 	if (error != KERN_SUCCESS)
682 		return (EFAULT);
683 
684 	vm_object_reference(seg->object);
685 
686 	if (flags & VM_MEMMAP_F_WIRED) {
687 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
688 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
689 		if (error != KERN_SUCCESS) {
690 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
691 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
692 			    EFAULT);
693 		}
694 	}
695 
696 	map->gpa = gpa;
697 	map->len = len;
698 	map->segoff = first;
699 	map->segid = segid;
700 	map->prot = prot;
701 	map->flags = flags;
702 	return (0);
703 }
704 
705 int
706 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
707 {
708 	struct mem_map *m;
709 	int i;
710 
711 	dprintf("%s: gpa %lx len %lx\n", __func__, gpa, len);
712 
713 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
714 		m = &vm->mem_maps[i];
715 		if (m->gpa == gpa && m->len == len) {
716 			vm_free_memmap(vm, i);
717 			return (0);
718 		}
719 	}
720 
721 	return (EINVAL);
722 }
723 
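/*
 * Return the attributes of the memory map with the lowest guest physical
 * address that is greater than or equal to '*gpa', or ENOENT if there is
 * no such mapping.
 */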
724 int
725 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
726     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
727 {
728 	struct mem_map *mm, *mmnext;
729 	int i;
730 
731 	mmnext = NULL;
732 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
733 		mm = &vm->mem_maps[i];
734 		if (mm->len == 0 || mm->gpa < *gpa)
735 			continue;
736 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
737 			mmnext = mm;
738 	}
739 
740 	if (mmnext != NULL) {
741 		*gpa = mmnext->gpa;
742 		if (segid)
743 			*segid = mmnext->segid;
744 		if (segoff)
745 			*segoff = mmnext->segoff;
746 		if (len)
747 			*len = mmnext->len;
748 		if (prot)
749 			*prot = mmnext->prot;
750 		if (flags)
751 			*flags = mmnext->flags;
752 		return (0);
753 	} else {
754 		return (ENOENT);
755 	}
756 }
757 
758 static void
759 vm_free_memmap(struct vm *vm, int ident)
760 {
761 	struct mem_map *mm;
762 	int error __diagused;
763 
764 	mm = &vm->mem_maps[ident];
765 	if (mm->len) {
766 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
767 		    mm->gpa + mm->len);
768 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
769 		    __func__, error));
770 		bzero(mm, sizeof(struct mem_map));
771 	}
772 }
773 
774 static __inline bool
775 sysmem_mapping(struct vm *vm, struct mem_map *mm)
776 {
777 
778 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
779 		return (true);
780 	else
781 		return (false);
782 }
783 
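/*
 * Return the highest guest physical address covered by a system memory
 * mapping, i.e. the upper bound of guest RAM.
 */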
784 vm_paddr_t
785 vmm_sysmem_maxaddr(struct vm *vm)
786 {
787 	struct mem_map *mm;
788 	vm_paddr_t maxaddr;
789 	int i;
790 
791 	maxaddr = 0;
792 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
793 		mm = &vm->mem_maps[i];
794 		if (sysmem_mapping(vm, mm)) {
795 			if (maxaddr < mm->gpa + mm->len)
796 				maxaddr = mm->gpa + mm->len;
797 		}
798 	}
799 	return (maxaddr);
800 }
801 
802 int
803 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
804     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
805 {
806 	int error;
807 
808 	error = vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
809 
810 	return (error);
811 }
812 
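/*
 * Register an in-kernel MMIO emulation handler for the guest physical range
 * [start, start + size).  Panics if no free slot is available.
 */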
813 void
814 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
815     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
816 {
817 	int i;
818 
819 	for (i = 0; i < nitems(vm->mmio_region); i++) {
820 		if (vm->mmio_region[i].start == 0 &&
821 		    vm->mmio_region[i].end == 0) {
822 			vm->mmio_region[i].start = start;
823 			vm->mmio_region[i].end = start + size;
824 			vm->mmio_region[i].read = mmio_read;
825 			vm->mmio_region[i].write = mmio_write;
826 			return;
827 		}
828 	}
829 
830 	panic("%s: No free MMIO region", __func__);
831 }
832 
833 void
834 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
835 {
836 	int i;
837 
838 	for (i = 0; i < nitems(vm->mmio_region); i++) {
839 		if (vm->mmio_region[i].start == start &&
840 		    vm->mmio_region[i].end == start + size) {
841 			memset(&vm->mmio_region[i], 0,
842 			    sizeof(vm->mmio_region[i]));
843 			return;
844 		}
845 	}
846 
847 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
848 	    start + size);
849 }
850 
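/*
 * Handle an instruction emulation exit.  If the faulting guest physical
 * address falls within a registered in-kernel MMIO region the access is
 * emulated here, otherwise the exit is forwarded to userspace.
 */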
851 static int
852 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
853 {
854 	struct vm *vm;
855 	struct vm_exit *vme;
856 	struct vie *vie;
857 	struct hyp *hyp;
858 	uint64_t fault_ipa;
859 	struct vm_guest_paging *paging;
860 	struct vmm_mmio_region *vmr;
861 	int error, i;
862 
863 	vm = vcpu->vm;
864 	hyp = vm->cookie;
865 	if (!hyp->aplic_attached)
866 		goto out_user;
867 
868 	vme = &vcpu->exitinfo;
869 	vie = &vme->u.inst_emul.vie;
870 	paging = &vme->u.inst_emul.paging;
871 
872 	fault_ipa = vme->u.inst_emul.gpa;
873 
874 	vmr = NULL;
875 	for (i = 0; i < nitems(vm->mmio_region); i++) {
876 		if (vm->mmio_region[i].start <= fault_ipa &&
877 		    vm->mmio_region[i].end > fault_ipa) {
878 			vmr = &vm->mmio_region[i];
879 			break;
880 		}
881 	}
882 	if (vmr == NULL)
883 		goto out_user;
884 
885 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
886 	    vmr->read, vmr->write, retu);
887 	return (error);
888 
889 out_user:
890 	*retu = true;
891 	return (0);
892 }
893 
894 int
895 vm_suspend(struct vm *vm, enum vm_suspend_how how)
896 {
897 	int i;
898 
899 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
900 		return (EINVAL);
901 
902 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
903 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
904 		    vm->suspend, how);
905 		return (EALREADY);
906 	}
907 
908 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
909 
910 	/*
911 	 * Notify all active vcpus that they are now suspended.
912 	 */
913 	for (i = 0; i < vm->maxcpus; i++) {
914 		if (CPU_ISSET(i, &vm->active_cpus))
915 			vcpu_notify_event(vm_vcpu(vm, i));
916 	}
917 
918 	return (0);
919 }
920 
921 void
922 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
923 {
924 	struct vm *vm = vcpu->vm;
925 	struct vm_exit *vmexit;
926 
927 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
928 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
929 
930 	vmexit = vm_exitinfo(vcpu);
931 	vmexit->pc = pc;
932 	vmexit->inst_length = 4;
933 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
934 	vmexit->u.suspended.how = vm->suspend;
935 }
936 
937 void
938 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
939 {
940 	struct vm_exit *vmexit;
941 
942 	vmexit = vm_exitinfo(vcpu);
943 	vmexit->pc = pc;
944 	vmexit->inst_length = 4;
945 	vmexit->exitcode = VM_EXITCODE_DEBUG;
946 }
947 
948 int
949 vm_activate_cpu(struct vcpu *vcpu)
950 {
951 	struct vm *vm = vcpu->vm;
952 
953 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
954 		return (EBUSY);
955 
956 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
957 	return (0);
958 
959 }
960 
961 int
962 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
963 {
964 	if (vcpu == NULL) {
965 		vm->debug_cpus = vm->active_cpus;
966 		for (int i = 0; i < vm->maxcpus; i++) {
967 			if (CPU_ISSET(i, &vm->active_cpus))
968 				vcpu_notify_event(vm_vcpu(vm, i));
969 		}
970 	} else {
971 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
972 			return (EINVAL);
973 
974 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
975 		vcpu_notify_event(vcpu);
976 	}
977 	return (0);
978 }
979 
980 int
981 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
982 {
983 
984 	if (vcpu == NULL) {
985 		CPU_ZERO(&vm->debug_cpus);
986 	} else {
987 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
988 			return (EINVAL);
989 
990 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
991 	}
992 	return (0);
993 }
994 
995 int
996 vcpu_debugged(struct vcpu *vcpu)
997 {
998 
999 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1000 }
1001 
1002 cpuset_t
1003 vm_active_cpus(struct vm *vm)
1004 {
1005 
1006 	return (vm->active_cpus);
1007 }
1008 
1009 cpuset_t
1010 vm_debug_cpus(struct vm *vm)
1011 {
1012 
1013 	return (vm->debug_cpus);
1014 }
1015 
1016 cpuset_t
1017 vm_suspended_cpus(struct vm *vm)
1018 {
1019 
1020 	return (vm->suspended_cpus);
1021 }
1022 
1023 
1024 void *
1025 vcpu_stats(struct vcpu *vcpu)
1026 {
1027 
1028 	return (vcpu->stats);
1029 }
1030 
1031 /*
1032  * This function is called to ensure that a vcpu "sees" a pending event
1033  * as soon as possible:
1034  * - If the vcpu thread is sleeping then it is woken up.
1035  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1036  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1037  */
1038 static void
1039 vcpu_notify_event_locked(struct vcpu *vcpu)
1040 {
1041 	int hostcpu;
1042 
1043 	hostcpu = vcpu->hostcpu;
1044 	if (vcpu->state == VCPU_RUNNING) {
1045 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1046 		if (hostcpu != curcpu) {
1047 			ipi_cpu(hostcpu, vmm_ipinum);
1048 		} else {
1049 			/*
1050 			 * If the 'vcpu' is running on 'curcpu' then it must
1051 			 * be sending a notification to itself (e.g. SELF_IPI).
1052 			 * The pending event will be picked up when the vcpu
1053 			 * transitions back to guest context.
1054 			 */
1055 		}
1056 	} else {
1057 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1058 		    "with hostcpu %d", vcpu->state, hostcpu));
1059 		if (vcpu->state == VCPU_SLEEPING)
1060 			wakeup_one(vcpu);
1061 	}
1062 }
1063 
1064 void
1065 vcpu_notify_event(struct vcpu *vcpu)
1066 {
1067 	vcpu_lock(vcpu);
1068 	vcpu_notify_event_locked(vcpu);
1069 	vcpu_unlock(vcpu);
1070 }
1071 
1072 static void
1073 restore_guest_fpustate(struct vcpu *vcpu)
1074 {
1075 
1076 	/* Flush host state to the pcb. */
1077 	fpe_state_save(curthread);
1078 
1079 	/* Ensure the FPU state will be re-loaded when exiting the guest. */
1080 	PCPU_SET(fpcurthread, NULL);
1081 
1082 	/* restore guest FPU state */
1083 	fpe_enable();
1084 	fpe_restore(vcpu->guestfpu);
1085 
1086 	/*
1087 	 * The FPU is now "dirty" with the guest's state, so disable the FPU
1088 	 * to trap any access to it by the host.
1089 	 */
1090 	fpe_disable();
1091 }
1092 
1093 static void
1094 save_guest_fpustate(struct vcpu *vcpu)
1095 {
1096 
1097 	/* Save guest FPE state. */
1098 	fpe_enable();
1099 	fpe_store(vcpu->guestfpu);
1100 	fpe_disable();
1101 
1102 	KASSERT(PCPU_GET(fpcurthread) == NULL,
1103 	    ("%s: fpcurthread set with guest registers", __func__));
1104 }
1105 
1106 static int
1107 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1108     bool from_idle)
1109 {
1110 	int error;
1111 
1112 	vcpu_assert_locked(vcpu);
1113 
1114 	/*
1115 	 * State transitions from the vmmdev_ioctl() must always begin from
1116 	 * the VCPU_IDLE state. This guarantees that there is only a single
1117 	 * ioctl() operating on a vcpu at any point.
1118 	 */
1119 	if (from_idle) {
1120 		while (vcpu->state != VCPU_IDLE) {
1121 			vcpu_notify_event_locked(vcpu);
1122 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat",
1123 			    hz / 1000);
1124 		}
1125 	} else {
1126 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1127 		    "vcpu idle state"));
1128 	}
1129 
1130 	if (vcpu->state == VCPU_RUNNING) {
1131 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1132 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1133 	} else {
1134 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1135 		    "vcpu that is not running", vcpu->hostcpu));
1136 	}
1137 
1138 	/*
1139 	 * The following state transitions are allowed:
1140 	 * IDLE -> FROZEN -> IDLE
1141 	 * FROZEN -> RUNNING -> FROZEN
1142 	 * FROZEN -> SLEEPING -> FROZEN
1143 	 */
1144 	switch (vcpu->state) {
1145 	case VCPU_IDLE:
1146 	case VCPU_RUNNING:
1147 	case VCPU_SLEEPING:
1148 		error = (newstate != VCPU_FROZEN);
1149 		break;
1150 	case VCPU_FROZEN:
1151 		error = (newstate == VCPU_FROZEN);
1152 		break;
1153 	default:
1154 		error = 1;
1155 		break;
1156 	}
1157 
1158 	if (error)
1159 		return (EBUSY);
1160 
1161 	vcpu->state = newstate;
1162 	if (newstate == VCPU_RUNNING)
1163 		vcpu->hostcpu = curcpu;
1164 	else
1165 		vcpu->hostcpu = NOCPU;
1166 
1167 	if (newstate == VCPU_IDLE)
1168 		wakeup(&vcpu->state);
1169 
1170 	return (0);
1171 }
1172 
1173 static void
1174 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1175 {
1176 	int error;
1177 
1178 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1179 		panic("Error %d setting state to %d\n", error, newstate);
1180 }
1181 
1182 static void
1183 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1184 {
1185 	int error;
1186 
1187 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1188 		panic("Error %d setting state to %d", error, newstate);
1189 }
1190 
1191 int
1192 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1193 {
1194 
1195 	if (type < 0 || type >= VM_CAP_MAX)
1196 		return (EINVAL);
1197 
1198 	return (vmmops_getcap(vcpu->cookie, type, retval));
1199 }
1200 
1201 int
1202 vm_set_capability(struct vcpu *vcpu, int type, int val)
1203 {
1204 
1205 	if (type < 0 || type >= VM_CAP_MAX)
1206 		return (EINVAL);
1207 
1208 	return (vmmops_setcap(vcpu->cookie, type, val));
1209 }
1210 
1211 struct vm *
1212 vcpu_vm(struct vcpu *vcpu)
1213 {
1214 
1215 	return (vcpu->vm);
1216 }
1217 
1218 int
1219 vcpu_vcpuid(struct vcpu *vcpu)
1220 {
1221 
1222 	return (vcpu->vcpuid);
1223 }
1224 
1225 void *
1226 vcpu_get_cookie(struct vcpu *vcpu)
1227 {
1228 
1229 	return (vcpu->cookie);
1230 }
1231 
1232 struct vcpu *
1233 vm_vcpu(struct vm *vm, int vcpuid)
1234 {
1235 
1236 	return (vm->vcpu[vcpuid]);
1237 }
1238 
1239 int
1240 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1241 {
1242 	int error;
1243 
1244 	vcpu_lock(vcpu);
1245 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1246 	vcpu_unlock(vcpu);
1247 
1248 	return (error);
1249 }
1250 
1251 enum vcpu_state
1252 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1253 {
1254 	enum vcpu_state state;
1255 
1256 	vcpu_lock(vcpu);
1257 	state = vcpu->state;
1258 	if (hostcpu != NULL)
1259 		*hostcpu = vcpu->hostcpu;
1260 	vcpu_unlock(vcpu);
1261 
1262 	return (state);
1263 }
1264 
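/*
 * Wire the host page backing guest physical address 'gpa' and return a
 * direct-map address for it.  The requested range must not cross a page
 * boundary.  The page is released with vm_gpa_release().
 */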
1265 static void *
1266 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1267     void **cookie)
1268 {
1269 	int i, count, pageoff;
1270 	struct mem_map *mm;
1271 	vm_page_t m;
1272 
1273 	pageoff = gpa & PAGE_MASK;
1274 	if (len > PAGE_SIZE - pageoff)
1275 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1276 
1277 	count = 0;
1278 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1279 		mm = &vm->mem_maps[i];
1280 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1281 		    gpa < mm->gpa + mm->len) {
1282 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1283 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1284 			break;
1285 		}
1286 	}
1287 
1288 	if (count == 1) {
1289 		*cookie = m;
1290 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1291 	} else {
1292 		*cookie = NULL;
1293 		return (NULL);
1294 	}
1295 }
1296 
1297 void *
1298 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1299 	    void **cookie)
1300 {
1301 #ifdef INVARIANTS
1302 	/*
1303 	 * The current vcpu should be frozen to ensure 'vm->mem_maps[]'
1304 	 * stability.
1305 	 */
1306 	int state = vcpu_get_state(vcpu, NULL);
1307 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1308 	    __func__, state));
1309 #endif
1310 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1311 }
1312 
1313 void *
1314 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1315     void **cookie)
1316 {
1317 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1318 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1319 }
1320 
1321 void
1322 vm_gpa_release(void *cookie)
1323 {
1324 	vm_page_t m = cookie;
1325 
1326 	vm_page_unwire(m, PQ_ACTIVE);
1327 }
1328 
1329 int
1330 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1331 {
1332 
1333 	if (reg >= VM_REG_LAST)
1334 		return (EINVAL);
1335 
1336 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1337 }
1338 
1339 int
1340 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1341 {
1342 	int error;
1343 
1344 	if (reg >= VM_REG_LAST)
1345 		return (EINVAL);
1346 	error = vmmops_setreg(vcpu->cookie, reg, val);
1347 	if (error || reg != VM_REG_GUEST_SEPC)
1348 		return (error);
1349 
1350 	vcpu->nextpc = val;
1351 
1352 	return (0);
1353 }
1354 
1355 void *
1356 vm_get_cookie(struct vm *vm)
1357 {
1358 
1359 	return (vm->cookie);
1360 }
1361 
1362 int
1363 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
1364 {
1365 
1366 	return (vmmops_exception(vcpu->cookie, scause));
1367 }
1368 
1369 int
1370 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
1371 {
1372 
1373 	return (aplic_attach_to_vm(vm->cookie, descr));
1374 }
1375 
1376 int
1377 vm_assert_irq(struct vm *vm, uint32_t irq)
1378 {
1379 
1380 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
1381 }
1382 
1383 int
1384 vm_deassert_irq(struct vm *vm, uint32_t irq)
1385 {
1386 
1387 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
1388 }
1389 
1390 int
1391 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1392     int func)
1393 {
1394 
1395 	return (aplic_inject_msi(vm->cookie, msg, addr));
1396 }
1397 
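/*
 * Handle a WFI exit: sleep until an interrupt or IPI is pending for the
 * vcpu, or until userspace attention is required.
 */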
1398 static int
1399 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1400 {
1401 
1402 	vcpu_lock(vcpu);
1403 
1404 	while (1) {
1405 		if (aplic_check_pending(vcpu->cookie))
1406 			break;
1407 
1408 		if (riscv_check_ipi(vcpu->cookie, false))
1409 			break;
1410 
1411 		if (vcpu_should_yield(vcpu))
1412 			break;
1413 
1414 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1415 		/*
1416 		 * XXX msleep_spin() cannot be interrupted by signals so
1417 		 * wake up periodically to check pending signals.
1418 		 */
1419 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz / 1000);
1420 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1421 	}
1422 	vcpu_unlock(vcpu);
1423 
1424 	*retu = false;
1425 
1426 	return (0);
1427 }
1428 
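/*
 * Handle a guest page fault exit by faulting the page into the guest
 * vmspace, or by simply updating the page tables if the page is already
 * resident.
 */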
1429 static int
1430 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1431 {
1432 	struct vm *vm;
1433 	struct vm_exit *vme;
1434 	struct vm_map *map;
1435 	uint64_t addr;
1436 	pmap_t pmap;
1437 	int ftype, rv;
1438 
1439 	vm = vcpu->vm;
1440 	vme = &vcpu->exitinfo;
1441 
1442 	pmap = vmspace_pmap(vm->vmspace);
1443 	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);
1444 
1445 	dprintf("%s: %lx\n", __func__, addr);
1446 
1447 	switch (vme->scause) {
1448 	case SCAUSE_STORE_GUEST_PAGE_FAULT:
1449 		ftype = VM_PROT_WRITE;
1450 		break;
1451 	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
1452 		ftype = VM_PROT_EXECUTE;
1453 		break;
1454 	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
1455 		ftype = VM_PROT_READ;
1456 		break;
1457 	default:
1458 		panic("unknown page trap: %lu", vme->scause);
1459 	}
1460 
1461 	/* The page exists, but the page table needs to be updated. */
1462 	if (pmap_fault(pmap, addr, ftype))
1463 		return (0);
1464 
1465 	map = &vm->vmspace->vm_map;
1466 	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
1467 	if (rv != KERN_SUCCESS) {
1468 		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
1469 		    __func__, addr, ftype, rv);
1470 		return (EFAULT);
1471 	}
1472 
1473 	return (0);
1474 }
1475 
1476 static int
1477 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1478 {
1479 	struct vm *vm = vcpu->vm;
1480 	int error, i;
1481 	struct thread *td;
1482 
1483 	error = 0;
1484 	td = curthread;
1485 
1486 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1487 
1488 	/*
1489 	 * Wait until all 'active_cpus' have suspended themselves.
1490 	 *
1491 	 * Check for thread suspension requests while waiting so that this
1492 	 * loop does not prevent the process hosting the VM from being
1493 	 * suspended.
1494 	 */
1495 	vcpu_lock(vcpu);
1496 	while (error == 0) {
1497 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1498 			break;
1499 
1500 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1501 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1502 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1503 		if (td_ast_pending(td, TDA_SUSPEND)) {
1504 			vcpu_unlock(vcpu);
1505 			error = thread_check_susp(td, false);
1506 			vcpu_lock(vcpu);
1507 		}
1508 	}
1509 	vcpu_unlock(vcpu);
1510 
1511 	/*
1512 	 * Wakeup the other sleeping vcpus and return to userspace.
1513 	 */
1514 	for (i = 0; i < vm->maxcpus; i++) {
1515 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1516 			vcpu_notify_event(vm_vcpu(vm, i));
1517 		}
1518 	}
1519 
1520 	*retu = true;
1521 	return (error);
1522 }
1523 
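/*
 * Run the vcpu until it exits the guest.  Exits that can be handled in the
 * kernel (MMIO emulation, WFI, paging, suspend) are processed here and the
 * guest is re-entered; all other exits are returned to userspace along with
 * the details recorded in the vcpu's exitinfo.
 */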
1524 int
1525 vm_run(struct vcpu *vcpu)
1526 {
1527 	struct vm_eventinfo evinfo;
1528 	struct vm_exit *vme;
1529 	struct vm *vm;
1530 	pmap_t pmap;
1531 	int error;
1532 	int vcpuid;
1533 	bool retu;
1534 
1535 	vm = vcpu->vm;
1536 
1537 	dprintf("%s\n", __func__);
1538 
1539 	vcpuid = vcpu->vcpuid;
1540 
1541 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1542 		return (EINVAL);
1543 
1544 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1545 		return (EINVAL);
1546 
1547 	pmap = vmspace_pmap(vm->vmspace);
1548 	vme = &vcpu->exitinfo;
1549 	evinfo.rptr = NULL;
1550 	evinfo.sptr = &vm->suspend;
1551 	evinfo.iptr = NULL;
1552 restart:
1553 	critical_enter();
1554 
1555 	restore_guest_fpustate(vcpu);
1556 
1557 	vcpu_require_state(vcpu, VCPU_RUNNING);
1558 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1559 	vcpu_require_state(vcpu, VCPU_FROZEN);
1560 
1561 	save_guest_fpustate(vcpu);
1562 
1563 	critical_exit();
1564 
1565 	if (error == 0) {
1566 		retu = false;
1567 		switch (vme->exitcode) {
1568 		case VM_EXITCODE_INST_EMUL:
1569 			vcpu->nextpc = vme->pc + vme->inst_length;
1570 			error = vm_handle_inst_emul(vcpu, &retu);
1571 			break;
1572 		case VM_EXITCODE_WFI:
1573 			vcpu->nextpc = vme->pc + vme->inst_length;
1574 			error = vm_handle_wfi(vcpu, vme, &retu);
1575 			break;
1576 		case VM_EXITCODE_ECALL:
1577 			/* Handle in userland. */
1578 			vcpu->nextpc = vme->pc + vme->inst_length;
1579 			retu = true;
1580 			break;
1581 		case VM_EXITCODE_PAGING:
1582 			vcpu->nextpc = vme->pc;
1583 			error = vm_handle_paging(vcpu, &retu);
1584 			break;
1585 		case VM_EXITCODE_BOGUS:
1586 			vcpu->nextpc = vme->pc;
1587 			retu = false;
1588 			error = 0;
1589 			break;
1590 		case VM_EXITCODE_SUSPENDED:
1591 			vcpu->nextpc = vme->pc;
1592 			error = vm_handle_suspend(vcpu, &retu);
1593 			break;
1594 		default:
1595 			/* Handle in userland. */
1596 			vcpu->nextpc = vme->pc;
1597 			retu = true;
1598 			break;
1599 		}
1600 	}
1601 
1602 	if (error == 0 && retu == false)
1603 		goto restart;
1604 
1605 	return (error);
1606 }
1607