xref: /freebsd/sys/riscv/vmm/vmm.c (revision 5036d9652a5701d00e9e40ea942c278e9f77d33d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/riscvreg.h>
60 #include <machine/cpu.h>
61 #include <machine/fpe.h>
62 #include <machine/machdep.h>
63 #include <machine/pcb.h>
64 #include <machine/smp.h>
65 #include <machine/vm.h>
66 #include <machine/vmparam.h>
67 #include <machine/vmm.h>
68 #include <machine/vmm_instruction_emul.h>
69 
70 #include <dev/pci/pcireg.h>
71 
72 #include <dev/vmm/vmm_dev.h>
73 #include <dev/vmm/vmm_ktr.h>
74 
75 #include "vmm_stat.h"
76 #include "riscv.h"
77 
78 #include "vmm_aplic.h"
79 
/*
 * State kept for each virtual CPU.
 *
 * The (o), (i) and (x) annotations use the initialization legend that
 * accompanies 'struct vm'.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* changed by vcpu_set_state_locked() */
	struct mtx	mtx;		/* spin mutex, see vcpu_lock() */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;		/* id passed to vcpu_alloc() */
	void		*stats;		/* from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* returned by vm_exitinfo() */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};
93 
/*
 * Helpers for the per-vcpu spin mutex 'mtx'.  Each macro takes a
 * 'struct vcpu *'.
 */
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
100 
/*
 * A guest memory segment: a VM object of 'len' bytes identified by its
 * index in 'vm->mem_segs[]'.  A slot is in use when 'object' is not NULL.
 */
struct mem_seg {
	uint64_t	gpa;
	size_t		len;		/* segment size in bytes */
	bool		wired;
	bool		sysmem;		/* system memory (vs. device memory) */
	vm_object_t	object;		/* backing VM object, NULL if unused */
};
#define	VM_MAX_MEMSEGS	3
109 
/*
 * A mapping of (part of) a memory segment into the guest physical
 * address space.  A slot in 'vm->mem_maps[]' is free when 'len' is 0.
 */
struct mem_map {
	vm_paddr_t	gpa;		/* guest physical start address */
	size_t		len;		/* mapping length, 0 if slot is free */
	vm_ooffset_t	segoff;		/* offset into the segment */
	int		segid;		/* index into 'vm->mem_segs[]' */
	int		prot;		/* VM_PROT_* bits */
	int		flags;		/* VM_MEMMAP_F_* bits */
};
#define	VM_MAX_MEMMAPS	4
119 
/*
 * A guest physical range [start, end) whose accesses are emulated by the
 * 'read' and 'write' callbacks.  A slot with start == end == 0 is free.
 */
struct vmm_mmio_region {
	uint64_t start;			/* first address of the region */
	uint64_t end;			/* one past the last address */
	mem_region_read_t read;
	mem_region_write_t write;
};
#define	VM_MAX_MMIO_REGIONS	4
127 
/*
 * State kept for one virtual machine.
 *
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug*/
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	/* Taken via vm_slock_memsegs()/vm_xlock_memsegs(). */
	struct sx	mem_segs_lock;		/* (o) */
	/* Serializes vcpu creation and updates of 'dying'. */
	struct sx	vcpus_init_lock;	/* (o) */
};
157 
/* Set by vmm_handler() after a successful load; checked by vm_create(). */
static bool vmm_initialized = false;

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/* IPI number handed to ipi_cpu() when a running vcpu must be notified. */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

/*
 * Number of vcpus a VM may have.  Computed in vmm_init() and copied into
 * 'vm->maxcpus' by vm_create().
 */
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

/* Forward declarations of helpers defined later in this file. */
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu);

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
184 
185 static void
186 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
187 {
188 	vmmops_vcpu_cleanup(vcpu->cookie);
189 	vcpu->cookie = NULL;
190 	if (destroy) {
191 		vmm_stat_free(vcpu->stats);
192 		fpu_save_area_free(vcpu->guestfpu);
193 		vcpu_lock_destroy(vcpu);
194 	}
195 }
196 
197 static struct vcpu *
198 vcpu_alloc(struct vm *vm, int vcpu_id)
199 {
200 	struct vcpu *vcpu;
201 
202 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
203 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
204 
205 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
206 	vcpu_lock_init(vcpu);
207 	vcpu->state = VCPU_IDLE;
208 	vcpu->hostcpu = NOCPU;
209 	vcpu->vcpuid = vcpu_id;
210 	vcpu->vm = vm;
211 	vcpu->guestfpu = fpu_save_area_alloc();
212 	vcpu->stats = vmm_stat_alloc();
213 	return (vcpu);
214 }
215 
216 static void
217 vcpu_init(struct vcpu *vcpu)
218 {
219 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
220 	MPASS(vcpu->cookie != NULL);
221 	fpu_save_area_reset(vcpu->guestfpu);
222 	vmm_stat_init(vcpu->stats);
223 }
224 
225 struct vm_exit *
226 vm_exitinfo(struct vcpu *vcpu)
227 {
228 	return (&vcpu->exitinfo);
229 }
230 
231 static int
232 vmm_init(void)
233 {
234 
235 	vm_maxcpu = mp_ncpus;
236 
237 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
238 
239 	if (vm_maxcpu > VM_MAXCPU) {
240 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
241 		vm_maxcpu = VM_MAXCPU;
242 	}
243 
244 	if (vm_maxcpu == 0)
245 		vm_maxcpu = 1;
246 
247 	return (vmmops_modinit());
248 }
249 
250 static int
251 vmm_handler(module_t mod, int what, void *arg)
252 {
253 	int error;
254 
255 	switch (what) {
256 	case MOD_LOAD:
257 		/* TODO: check if has_hyp here? */
258 		error = vmmdev_init();
259 		if (error != 0)
260 			break;
261 		error = vmm_init();
262 		if (error == 0)
263 			vmm_initialized = true;
264 		break;
265 	case MOD_UNLOAD:
266 		/* TODO: check if has_hyp here? */
267 		error = vmmdev_cleanup();
268 		if (error == 0 && vmm_initialized) {
269 			error = vmmops_modcleanup();
270 			if (error)
271 				vmm_initialized = false;
272 		}
273 		break;
274 	default:
275 		error = 0;
276 		break;
277 	}
278 	return (error);
279 }
280 
/* Module description: name, event handler and handler argument. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - HYP initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
295 
296 static void
297 vm_init(struct vm *vm, bool create)
298 {
299 	int i;
300 
301 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
302 	MPASS(vm->cookie != NULL);
303 
304 	CPU_ZERO(&vm->active_cpus);
305 	CPU_ZERO(&vm->debug_cpus);
306 
307 	vm->suspend = 0;
308 	CPU_ZERO(&vm->suspended_cpus);
309 
310 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
311 
312 	if (!create) {
313 		for (i = 0; i < vm->maxcpus; i++) {
314 			if (vm->vcpu[i] != NULL)
315 				vcpu_init(vm->vcpu[i]);
316 		}
317 	}
318 }
319 
320 void
321 vm_disable_vcpu_creation(struct vm *vm)
322 {
323 	sx_xlock(&vm->vcpus_init_lock);
324 	vm->dying = true;
325 	sx_xunlock(&vm->vcpus_init_lock);
326 }
327 
/*
 * Return the vcpu with id 'vcpuid', creating it on first use.
 *
 * Returns NULL if 'vcpuid' is out of range for the VM or for the
 * interrupt controller, or if the vcpu does not exist yet and the VM is
 * marked as dying.
 */
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Some interrupt controllers may have a CPU limit */
	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
		return (NULL);

	/* Fast path: the vcpu already exists, no lock is taken. */
	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	/*
	 * Slow path: re-check under the lock, since another thread may have
	 * created the vcpu in the meantime.
	 */
	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}
361 
362 void
363 vm_slock_vcpus(struct vm *vm)
364 {
365 	sx_slock(&vm->vcpus_init_lock);
366 }
367 
368 void
369 vm_unlock_vcpus(struct vm *vm)
370 {
371 	sx_unlock(&vm->vcpus_init_lock);
372 }
373 
374 int
375 vm_create(const char *name, struct vm **retvm)
376 {
377 	struct vm *vm;
378 	struct vmspace *vmspace;
379 
380 	/*
381 	 * If vmm.ko could not be successfully initialized then don't attempt
382 	 * to create the virtual machine.
383 	 */
384 	if (!vmm_initialized)
385 		return (ENXIO);
386 
387 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
388 		return (EINVAL);
389 
390 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
391 	if (vmspace == NULL)
392 		return (ENOMEM);
393 
394 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
395 	strcpy(vm->name, name);
396 	vm->vmspace = vmspace;
397 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
398 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
399 
400 	vm->sockets = 1;
401 	vm->cores = 1;			/* XXX backwards compatibility */
402 	vm->threads = 1;		/* XXX backwards compatibility */
403 	vm->maxcpus = vm_maxcpu;
404 
405 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
406 	    M_WAITOK | M_ZERO);
407 
408 	vm_init(vm, true);
409 
410 	*retvm = vm;
411 	return (0);
412 }
413 
414 void
415 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
416     uint16_t *threads, uint16_t *maxcpus)
417 {
418 	*sockets = vm->sockets;
419 	*cores = vm->cores;
420 	*threads = vm->threads;
421 	*maxcpus = vm->maxcpus;
422 }
423 
424 uint16_t
425 vm_get_maxcpus(struct vm *vm)
426 {
427 	return (vm->maxcpus);
428 }
429 
430 int
431 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
432     uint16_t threads, uint16_t maxcpus)
433 {
434 	/* Ignore maxcpus. */
435 	if ((sockets * cores * threads) > vm->maxcpus)
436 		return (EINVAL);
437 	vm->sockets = sockets;
438 	vm->cores = cores;
439 	vm->threads = threads;
440 	return(0);
441 }
442 
443 static void
444 vm_cleanup(struct vm *vm, bool destroy)
445 {
446 	struct mem_map *mm;
447 	int i;
448 
449 	aplic_detach_from_vm(vm->cookie);
450 
451 	for (i = 0; i < vm->maxcpus; i++) {
452 		if (vm->vcpu[i] != NULL)
453 			vcpu_cleanup(vm->vcpu[i], destroy);
454 	}
455 
456 	vmmops_cleanup(vm->cookie);
457 
458 	/*
459 	 * System memory is removed from the guest address space only when
460 	 * the VM is destroyed. This is because the mapping remains the same
461 	 * across VM reset.
462 	 *
463 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
464 	 * so those mappings are removed on a VM reset.
465 	 */
466 	if (!destroy) {
467 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
468 			mm = &vm->mem_maps[i];
469 			if (destroy || !sysmem_mapping(vm, mm))
470 				vm_free_memmap(vm, i);
471 		}
472 	}
473 
474 	if (destroy) {
475 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
476 			vm_free_memseg(vm, i);
477 
478 		vmmops_vmspace_free(vm->vmspace);
479 		vm->vmspace = NULL;
480 
481 		for (i = 0; i < vm->maxcpus; i++)
482 			free(vm->vcpu[i], M_VMM);
483 		free(vm->vcpu, M_VMM);
484 		sx_destroy(&vm->vcpus_init_lock);
485 		sx_destroy(&vm->mem_segs_lock);
486 	}
487 }
488 
489 void
490 vm_destroy(struct vm *vm)
491 {
492 
493 	vm_cleanup(vm, true);
494 
495 	free(vm, M_VMM);
496 }
497 
498 int
499 vm_reinit(struct vm *vm)
500 {
501 	int error;
502 
503 	/*
504 	 * A virtual machine can be reset only if all vcpus are suspended.
505 	 */
506 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
507 		vm_cleanup(vm, false);
508 		vm_init(vm, false);
509 		error = 0;
510 	} else {
511 		error = EBUSY;
512 	}
513 
514 	return (error);
515 }
516 
517 const char *
518 vm_name(struct vm *vm)
519 {
520 	return (vm->name);
521 }
522 
523 void
524 vm_slock_memsegs(struct vm *vm)
525 {
526 	sx_slock(&vm->mem_segs_lock);
527 }
528 
529 void
530 vm_xlock_memsegs(struct vm *vm)
531 {
532 	sx_xlock(&vm->mem_segs_lock);
533 }
534 
535 void
536 vm_unlock_memsegs(struct vm *vm)
537 {
538 	sx_unlock(&vm->mem_segs_lock);
539 }
540 
541 /*
542  * Return 'true' if 'gpa' is allocated in the guest address space.
543  *
544  * This function is called in the context of a running vcpu which acts as
545  * an implicit lock on 'vm->mem_maps[]'.
546  */
547 bool
548 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
549 {
550 	struct vm *vm = vcpu->vm;
551 	struct mem_map *mm;
552 	int i;
553 
554 #ifdef INVARIANTS
555 	int hostcpu, state;
556 	state = vcpu_get_state(vcpu, &hostcpu);
557 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
558 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
559 #endif
560 
561 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
562 		mm = &vm->mem_maps[i];
563 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
564 			return (true);		/* 'gpa' is sysmem or devmem */
565 	}
566 
567 	return (false);
568 }
569 
570 int
571 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
572 {
573 	struct mem_seg *seg;
574 	vm_object_t obj;
575 
576 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
577 
578 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
579 		return (EINVAL);
580 
581 	if (len == 0 || (len & PAGE_MASK))
582 		return (EINVAL);
583 
584 	seg = &vm->mem_segs[ident];
585 	if (seg->object != NULL) {
586 		if (seg->len == len && seg->sysmem == sysmem)
587 			return (EEXIST);
588 		else
589 			return (EINVAL);
590 	}
591 
592 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
593 	if (obj == NULL)
594 		return (ENOMEM);
595 
596 	seg->len = len;
597 	seg->object = obj;
598 	seg->sysmem = sysmem;
599 	return (0);
600 }
601 
602 int
603 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
604     vm_object_t *objptr)
605 {
606 	struct mem_seg *seg;
607 
608 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
609 
610 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
611 		return (EINVAL);
612 
613 	seg = &vm->mem_segs[ident];
614 	if (len)
615 		*len = seg->len;
616 	if (sysmem)
617 		*sysmem = seg->sysmem;
618 	if (objptr)
619 		*objptr = seg->object;
620 	return (0);
621 }
622 
623 void
624 vm_free_memseg(struct vm *vm, int ident)
625 {
626 	struct mem_seg *seg;
627 
628 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
629 	    ("%s: invalid memseg ident %d", __func__, ident));
630 
631 	seg = &vm->mem_segs[ident];
632 	if (seg->object != NULL) {
633 		vm_object_deallocate(seg->object);
634 		bzero(seg, sizeof(struct mem_seg));
635 	}
636 }
637 
638 int
639 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
640     size_t len, int prot, int flags)
641 {
642 	struct mem_seg *seg;
643 	struct mem_map *m, *map;
644 	vm_ooffset_t last;
645 	int i, error;
646 
647 	dprintf("%s: gpa %lx first %lx len %lx\n", __func__, gpa, first, len);
648 
649 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
650 		return (EINVAL);
651 
652 	if (flags & ~VM_MEMMAP_F_WIRED)
653 		return (EINVAL);
654 
655 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
656 		return (EINVAL);
657 
658 	seg = &vm->mem_segs[segid];
659 	if (seg->object == NULL)
660 		return (EINVAL);
661 
662 	last = first + len;
663 	if (first < 0 || first >= last || last > seg->len)
664 		return (EINVAL);
665 
666 	if ((gpa | first | last) & PAGE_MASK)
667 		return (EINVAL);
668 
669 	map = NULL;
670 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
671 		m = &vm->mem_maps[i];
672 		if (m->len == 0) {
673 			map = m;
674 			break;
675 		}
676 	}
677 
678 	if (map == NULL)
679 		return (ENOSPC);
680 
681 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
682 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
683 	if (error != KERN_SUCCESS)
684 		return (EFAULT);
685 
686 	vm_object_reference(seg->object);
687 
688 	if (flags & VM_MEMMAP_F_WIRED) {
689 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
690 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
691 		if (error != KERN_SUCCESS) {
692 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
693 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
694 			    EFAULT);
695 		}
696 	}
697 
698 	map->gpa = gpa;
699 	map->len = len;
700 	map->segoff = first;
701 	map->segid = segid;
702 	map->prot = prot;
703 	map->flags = flags;
704 	return (0);
705 }
706 
707 int
708 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
709 {
710 	struct mem_map *m;
711 	int i;
712 
713 	dprintf("%s: gpa %lx len %lx\n", __func__, gpa, len);
714 
715 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
716 		m = &vm->mem_maps[i];
717 		if (m->gpa == gpa && m->len == len) {
718 			vm_free_memmap(vm, i);
719 			return (0);
720 		}
721 	}
722 
723 	return (EINVAL);
724 }
725 
726 int
727 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
728     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
729 {
730 	struct mem_map *mm, *mmnext;
731 	int i;
732 
733 	mmnext = NULL;
734 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
735 		mm = &vm->mem_maps[i];
736 		if (mm->len == 0 || mm->gpa < *gpa)
737 			continue;
738 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
739 			mmnext = mm;
740 	}
741 
742 	if (mmnext != NULL) {
743 		*gpa = mmnext->gpa;
744 		if (segid)
745 			*segid = mmnext->segid;
746 		if (segoff)
747 			*segoff = mmnext->segoff;
748 		if (len)
749 			*len = mmnext->len;
750 		if (prot)
751 			*prot = mmnext->prot;
752 		if (flags)
753 			*flags = mmnext->flags;
754 		return (0);
755 	} else {
756 		return (ENOENT);
757 	}
758 }
759 
760 static void
761 vm_free_memmap(struct vm *vm, int ident)
762 {
763 	struct mem_map *mm;
764 	int error __diagused;
765 
766 	mm = &vm->mem_maps[ident];
767 	if (mm->len) {
768 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
769 		    mm->gpa + mm->len);
770 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
771 		    __func__, error));
772 		bzero(mm, sizeof(struct mem_map));
773 	}
774 }
775 
776 static __inline bool
777 sysmem_mapping(struct vm *vm, struct mem_map *mm)
778 {
779 
780 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
781 		return (true);
782 	else
783 		return (false);
784 }
785 
786 vm_paddr_t
787 vmm_sysmem_maxaddr(struct vm *vm)
788 {
789 	struct mem_map *mm;
790 	vm_paddr_t maxaddr;
791 	int i;
792 
793 	maxaddr = 0;
794 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
795 		mm = &vm->mem_maps[i];
796 		if (sysmem_mapping(vm, mm)) {
797 			if (maxaddr < mm->gpa + mm->len)
798 				maxaddr = mm->gpa + mm->len;
799 		}
800 	}
801 	return (maxaddr);
802 }
803 
804 int
805 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
806     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
807 {
808 	int error;
809 
810 	error = vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
811 
812 	return (error);
813 }
814 
815 void
816 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
817     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
818 {
819 	int i;
820 
821 	for (i = 0; i < nitems(vm->mmio_region); i++) {
822 		if (vm->mmio_region[i].start == 0 &&
823 		    vm->mmio_region[i].end == 0) {
824 			vm->mmio_region[i].start = start;
825 			vm->mmio_region[i].end = start + size;
826 			vm->mmio_region[i].read = mmio_read;
827 			vm->mmio_region[i].write = mmio_write;
828 			return;
829 		}
830 	}
831 
832 	panic("%s: No free MMIO region", __func__);
833 }
834 
835 void
836 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
837 {
838 	int i;
839 
840 	for (i = 0; i < nitems(vm->mmio_region); i++) {
841 		if (vm->mmio_region[i].start == start &&
842 		    vm->mmio_region[i].end == start + size) {
843 			memset(&vm->mmio_region[i], 0,
844 			    sizeof(vm->mmio_region[i]));
845 			return;
846 		}
847 	}
848 
849 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
850 	    start + size);
851 }
852 
853 static int
854 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
855 {
856 	struct vm *vm;
857 	struct vm_exit *vme;
858 	struct vie *vie;
859 	struct hyp *hyp;
860 	uint64_t fault_ipa;
861 	struct vm_guest_paging *paging;
862 	struct vmm_mmio_region *vmr;
863 	int error, i;
864 
865 	vm = vcpu->vm;
866 	hyp = vm->cookie;
867 	if (!hyp->aplic_attached)
868 		goto out_user;
869 
870 	vme = &vcpu->exitinfo;
871 	vie = &vme->u.inst_emul.vie;
872 	paging = &vme->u.inst_emul.paging;
873 
874 	fault_ipa = vme->u.inst_emul.gpa;
875 
876 	vmr = NULL;
877 	for (i = 0; i < nitems(vm->mmio_region); i++) {
878 		if (vm->mmio_region[i].start <= fault_ipa &&
879 		    vm->mmio_region[i].end > fault_ipa) {
880 			vmr = &vm->mmio_region[i];
881 			break;
882 		}
883 	}
884 	if (vmr == NULL)
885 		goto out_user;
886 
887 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
888 	    vmr->read, vmr->write, retu);
889 	return (error);
890 
891 out_user:
892 	*retu = true;
893 	return (0);
894 }
895 
896 int
897 vm_suspend(struct vm *vm, enum vm_suspend_how how)
898 {
899 	int i;
900 
901 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
902 		return (EINVAL);
903 
904 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
905 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
906 		    vm->suspend, how);
907 		return (EALREADY);
908 	}
909 
910 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
911 
912 	/*
913 	 * Notify all active vcpus that they are now suspended.
914 	 */
915 	for (i = 0; i < vm->maxcpus; i++) {
916 		if (CPU_ISSET(i, &vm->active_cpus))
917 			vcpu_notify_event(vm_vcpu(vm, i));
918 	}
919 
920 	return (0);
921 }
922 
923 void
924 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
925 {
926 	struct vm *vm = vcpu->vm;
927 	struct vm_exit *vmexit;
928 
929 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
930 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
931 
932 	vmexit = vm_exitinfo(vcpu);
933 	vmexit->pc = pc;
934 	vmexit->inst_length = 4;
935 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
936 	vmexit->u.suspended.how = vm->suspend;
937 }
938 
939 void
940 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
941 {
942 	struct vm_exit *vmexit;
943 
944 	vmexit = vm_exitinfo(vcpu);
945 	vmexit->pc = pc;
946 	vmexit->inst_length = 4;
947 	vmexit->exitcode = VM_EXITCODE_DEBUG;
948 }
949 
950 int
951 vm_activate_cpu(struct vcpu *vcpu)
952 {
953 	struct vm *vm = vcpu->vm;
954 
955 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
956 		return (EBUSY);
957 
958 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
959 	return (0);
960 
961 }
962 
963 int
964 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
965 {
966 	if (vcpu == NULL) {
967 		vm->debug_cpus = vm->active_cpus;
968 		for (int i = 0; i < vm->maxcpus; i++) {
969 			if (CPU_ISSET(i, &vm->active_cpus))
970 				vcpu_notify_event(vm_vcpu(vm, i));
971 		}
972 	} else {
973 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
974 			return (EINVAL);
975 
976 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
977 		vcpu_notify_event(vcpu);
978 	}
979 	return (0);
980 }
981 
982 int
983 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
984 {
985 
986 	if (vcpu == NULL) {
987 		CPU_ZERO(&vm->debug_cpus);
988 	} else {
989 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
990 			return (EINVAL);
991 
992 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
993 	}
994 	return (0);
995 }
996 
997 int
998 vcpu_debugged(struct vcpu *vcpu)
999 {
1000 
1001 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1002 }
1003 
1004 cpuset_t
1005 vm_active_cpus(struct vm *vm)
1006 {
1007 
1008 	return (vm->active_cpus);
1009 }
1010 
1011 cpuset_t
1012 vm_debug_cpus(struct vm *vm)
1013 {
1014 
1015 	return (vm->debug_cpus);
1016 }
1017 
1018 cpuset_t
1019 vm_suspended_cpus(struct vm *vm)
1020 {
1021 
1022 	return (vm->suspended_cpus);
1023 }
1024 
1025 
1026 void *
1027 vcpu_stats(struct vcpu *vcpu)
1028 {
1029 
1030 	return (vcpu->stats);
1031 }
1032 
1033 /*
1034  * This function is called to ensure that a vcpu "sees" a pending event
1035  * as soon as possible:
1036  * - If the vcpu thread is sleeping then it is woken up.
1037  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1038  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1039  */
1040 static void
1041 vcpu_notify_event_locked(struct vcpu *vcpu)
1042 {
1043 	int hostcpu;
1044 
1045 	hostcpu = vcpu->hostcpu;
1046 	if (vcpu->state == VCPU_RUNNING) {
1047 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1048 		if (hostcpu != curcpu) {
1049 			ipi_cpu(hostcpu, vmm_ipinum);
1050 		} else {
1051 			/*
1052 			 * If the 'vcpu' is running on 'curcpu' then it must
1053 			 * be sending a notification to itself (e.g. SELF_IPI).
1054 			 * The pending event will be picked up when the vcpu
1055 			 * transitions back to guest context.
1056 			 */
1057 		}
1058 	} else {
1059 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1060 		    "with hostcpu %d", vcpu->state, hostcpu));
1061 		if (vcpu->state == VCPU_SLEEPING)
1062 			wakeup_one(vcpu);
1063 	}
1064 }
1065 
/*
 * Notify 'vcpu' of a pending event.  Takes the vcpu lock around
 * vcpu_notify_event_locked().
 */
void
vcpu_notify_event(struct vcpu *vcpu)
{

	vcpu_lock(vcpu);
	{
		vcpu_notify_event_locked(vcpu);
	}
	vcpu_unlock(vcpu);
}
1073 
/*
 * Load the guest FPU state of 'vcpu' into the FPU, after saving the
 * host state of the current thread.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the FPU state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}
1094 
/*
 * Store the FPU contents into the guest FPU save area of 'vcpu'.  No
 * host thread may own the FPU at this point (asserted below).
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}
1107 
/*
 * Move 'vcpu' to 'newstate'.  The vcpu lock must be held.
 *
 * With 'from_idle' true the function first waits until the vcpu is idle,
 * notifying it before every sleep; the vcpu lock is released while
 * sleeping.  Returns EBUSY if the transition is not allowed, 0 otherwise.
 */
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			/* Sleep with a timeout of hz / 1000 ticks. */
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat",
			    hz / 1000);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/* 'hostcpu' is valid if and only if the vcpu is running. */
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	/* Wake up threads waiting in the 'from_idle' loop above. */
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
1174 
1175 static void
1176 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1177 {
1178 	int error;
1179 
1180 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1181 		panic("Error %d setting state to %d\n", error, newstate);
1182 }
1183 
1184 static void
1185 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1186 {
1187 	int error;
1188 
1189 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1190 		panic("Error %d setting state to %d", error, newstate);
1191 }
1192 
1193 int
1194 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1195 {
1196 
1197 	if (type < 0 || type >= VM_CAP_MAX)
1198 		return (EINVAL);
1199 
1200 	return (vmmops_getcap(vcpu->cookie, type, retval));
1201 }
1202 
1203 int
1204 vm_set_capability(struct vcpu *vcpu, int type, int val)
1205 {
1206 
1207 	if (type < 0 || type >= VM_CAP_MAX)
1208 		return (EINVAL);
1209 
1210 	return (vmmops_setcap(vcpu->cookie, type, val));
1211 }
1212 
1213 struct vm *
1214 vcpu_vm(struct vcpu *vcpu)
1215 {
1216 
1217 	return (vcpu->vm);
1218 }
1219 
1220 int
1221 vcpu_vcpuid(struct vcpu *vcpu)
1222 {
1223 
1224 	return (vcpu->vcpuid);
1225 }
1226 
1227 void *
1228 vcpu_get_cookie(struct vcpu *vcpu)
1229 {
1230 
1231 	return (vcpu->cookie);
1232 }
1233 
1234 struct vcpu *
1235 vm_vcpu(struct vm *vm, int vcpuid)
1236 {
1237 
1238 	return (vm->vcpu[vcpuid]);
1239 }
1240 
1241 int
1242 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1243 {
1244 	int error;
1245 
1246 	vcpu_lock(vcpu);
1247 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1248 	vcpu_unlock(vcpu);
1249 
1250 	return (error);
1251 }
1252 
1253 enum vcpu_state
1254 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1255 {
1256 	enum vcpu_state state;
1257 
1258 	vcpu_lock(vcpu);
1259 	state = vcpu->state;
1260 	if (hostcpu != NULL)
1261 		*hostcpu = vcpu->hostcpu;
1262 	vcpu_unlock(vcpu);
1263 
1264 	return (state);
1265 }
1266 
1267 static void *
1268 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1269     void **cookie)
1270 {
1271 	int i, count, pageoff;
1272 	struct mem_map *mm;
1273 	vm_page_t m;
1274 
1275 	pageoff = gpa & PAGE_MASK;
1276 	if (len > PAGE_SIZE - pageoff)
1277 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1278 
1279 	count = 0;
1280 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1281 		mm = &vm->mem_maps[i];
1282 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1283 		    gpa < mm->gpa + mm->len) {
1284 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1285 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1286 			break;
1287 		}
1288 	}
1289 
1290 	if (count == 1) {
1291 		*cookie = m;
1292 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1293 	} else {
1294 		*cookie = NULL;
1295 		return (NULL);
1296 	}
1297 }
1298 
1299 void *
1300 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1301 	    void **cookie)
1302 {
1303 #ifdef INVARIANTS
1304 	/*
1305 	 * The current vcpu should be frozen to ensure 'vm_memmap[]'
1306 	 * stability.
1307 	 */
1308 	int state = vcpu_get_state(vcpu, NULL);
1309 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1310 	    __func__, state));
1311 #endif
1312 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1313 }
1314 
1315 void *
1316 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1317     void **cookie)
1318 {
1319 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1320 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1321 }
1322 
1323 void
1324 vm_gpa_release(void *cookie)
1325 {
1326 	vm_page_t m = cookie;
1327 
1328 	vm_page_unwire(m, PQ_ACTIVE);
1329 }
1330 
1331 int
1332 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1333 {
1334 
1335 	if (reg >= VM_REG_LAST)
1336 		return (EINVAL);
1337 
1338 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1339 }
1340 
1341 int
1342 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1343 {
1344 	int error;
1345 
1346 	if (reg >= VM_REG_LAST)
1347 		return (EINVAL);
1348 	error = vmmops_setreg(vcpu->cookie, reg, val);
1349 	if (error || reg != VM_REG_GUEST_SEPC)
1350 		return (error);
1351 
1352 	vcpu->nextpc = val;
1353 
1354 	return (0);
1355 }
1356 
1357 void *
1358 vm_get_cookie(struct vm *vm)
1359 {
1360 
1361 	return (vm->cookie);
1362 }
1363 
1364 int
1365 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
1366 {
1367 
1368 	return (vmmops_exception(vcpu->cookie, scause));
1369 }
1370 
1371 int
1372 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
1373 {
1374 
1375 	return (aplic_attach_to_vm(vm->cookie, descr));
1376 }
1377 
1378 int
1379 vm_assert_irq(struct vm *vm, uint32_t irq)
1380 {
1381 
1382 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
1383 }
1384 
1385 int
1386 vm_deassert_irq(struct vm *vm, uint32_t irq)
1387 {
1388 
1389 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
1390 }
1391 
1392 int
1393 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1394     int func)
1395 {
1396 
1397 	return (aplic_inject_msi(vm->cookie, msg, addr));
1398 }
1399 
1400 static int
1401 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1402 {
1403 
1404 	vcpu_lock(vcpu);
1405 
1406 	while (1) {
1407 		if (aplic_check_pending(vcpu->cookie))
1408 			break;
1409 
1410 		if (riscv_check_ipi(vcpu->cookie, false))
1411 			break;
1412 
1413 		if (vcpu_should_yield(vcpu))
1414 			break;
1415 
1416 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1417 		/*
1418 		 * XXX msleep_spin() cannot be interrupted by signals so
1419 		 * wake up periodically to check pending signals.
1420 		 */
1421 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz / 1000);
1422 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1423 	}
1424 	vcpu_unlock(vcpu);
1425 
1426 	*retu = false;
1427 
1428 	return (0);
1429 }
1430 
1431 static int
1432 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1433 {
1434 	struct vm *vm;
1435 	struct vm_exit *vme;
1436 	struct vm_map *map;
1437 	uint64_t addr;
1438 	pmap_t pmap;
1439 	int ftype, rv;
1440 
1441 	vm = vcpu->vm;
1442 	vme = &vcpu->exitinfo;
1443 
1444 	pmap = vmspace_pmap(vm->vmspace);
1445 	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);
1446 
1447 	dprintf("%s: %lx\n", __func__, addr);
1448 
1449 	switch (vme->scause) {
1450 	case SCAUSE_STORE_GUEST_PAGE_FAULT:
1451 		ftype = VM_PROT_WRITE;
1452 		break;
1453 	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
1454 		ftype = VM_PROT_EXECUTE;
1455 		break;
1456 	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
1457 		ftype = VM_PROT_READ;
1458 		break;
1459 	default:
1460 		panic("unknown page trap: %lu", vme->scause);
1461 	}
1462 
1463 	/* The page exists, but the page table needs to be updated. */
1464 	if (pmap_fault(pmap, addr, ftype))
1465 		return (0);
1466 
1467 	map = &vm->vmspace->vm_map;
1468 	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
1469 	if (rv != KERN_SUCCESS) {
1470 		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
1471 		    __func__, addr, ftype, rv);
1472 		return (EFAULT);
1473 	}
1474 
1475 	return (0);
1476 }
1477 
1478 static int
1479 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1480 {
1481 	struct vm *vm = vcpu->vm;
1482 	int error, i;
1483 	struct thread *td;
1484 
1485 	error = 0;
1486 	td = curthread;
1487 
1488 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1489 
1490 	/*
1491 	 * Wait until all 'active_cpus' have suspended themselves.
1492 	 *
1493 	 * Since a VM may be suspended at any time including when one or
1494 	 * more vcpus are doing a rendezvous we need to call the rendezvous
1495 	 * handler while we are waiting to prevent a deadlock.
1496 	 */
1497 	vcpu_lock(vcpu);
1498 	while (error == 0) {
1499 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1500 			break;
1501 
1502 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1503 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1504 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1505 		if (td_ast_pending(td, TDA_SUSPEND)) {
1506 			vcpu_unlock(vcpu);
1507 			error = thread_check_susp(td, false);
1508 			vcpu_lock(vcpu);
1509 		}
1510 	}
1511 	vcpu_unlock(vcpu);
1512 
1513 	/*
1514 	 * Wakeup the other sleeping vcpus and return to userspace.
1515 	 */
1516 	for (i = 0; i < vm->maxcpus; i++) {
1517 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1518 			vcpu_notify_event(vm_vcpu(vm, i));
1519 		}
1520 	}
1521 
1522 	*retu = true;
1523 	return (error);
1524 }
1525 
1526 int
1527 vm_run(struct vcpu *vcpu)
1528 {
1529 	struct vm_eventinfo evinfo;
1530 	struct vm_exit *vme;
1531 	struct vm *vm;
1532 	pmap_t pmap;
1533 	int error;
1534 	int vcpuid;
1535 	bool retu;
1536 
1537 	vm = vcpu->vm;
1538 
1539 	dprintf("%s\n", __func__);
1540 
1541 	vcpuid = vcpu->vcpuid;
1542 
1543 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1544 		return (EINVAL);
1545 
1546 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1547 		return (EINVAL);
1548 
1549 	pmap = vmspace_pmap(vm->vmspace);
1550 	vme = &vcpu->exitinfo;
1551 	evinfo.rptr = NULL;
1552 	evinfo.sptr = &vm->suspend;
1553 	evinfo.iptr = NULL;
1554 restart:
1555 	critical_enter();
1556 
1557 	restore_guest_fpustate(vcpu);
1558 
1559 	vcpu_require_state(vcpu, VCPU_RUNNING);
1560 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1561 	vcpu_require_state(vcpu, VCPU_FROZEN);
1562 
1563 	save_guest_fpustate(vcpu);
1564 
1565 	critical_exit();
1566 
1567 	if (error == 0) {
1568 		retu = false;
1569 		switch (vme->exitcode) {
1570 		case VM_EXITCODE_INST_EMUL:
1571 			vcpu->nextpc = vme->pc + vme->inst_length;
1572 			error = vm_handle_inst_emul(vcpu, &retu);
1573 			break;
1574 		case VM_EXITCODE_WFI:
1575 			vcpu->nextpc = vme->pc + vme->inst_length;
1576 			error = vm_handle_wfi(vcpu, vme, &retu);
1577 			break;
1578 		case VM_EXITCODE_ECALL:
1579 			/* Handle in userland. */
1580 			vcpu->nextpc = vme->pc + vme->inst_length;
1581 			retu = true;
1582 			break;
1583 		case VM_EXITCODE_PAGING:
1584 			vcpu->nextpc = vme->pc;
1585 			error = vm_handle_paging(vcpu, &retu);
1586 			break;
1587 		case VM_EXITCODE_BOGUS:
1588 			vcpu->nextpc = vme->pc;
1589 			retu = false;
1590 			error = 0;
1591 			break;
1592 		case VM_EXITCODE_SUSPENDED:
1593 			vcpu->nextpc = vme->pc;
1594 			error = vm_handle_suspend(vcpu, &retu);
1595 			break;
1596 		default:
1597 			/* Handle in userland. */
1598 			vcpu->nextpc = vme->pc;
1599 			retu = true;
1600 			break;
1601 		}
1602 	}
1603 
1604 	if (error == 0 && retu == false)
1605 		goto restart;
1606 
1607 	return (error);
1608 }
1609