xref: /freebsd/sys/riscv/vmm/vmm.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * Copyright (c) 2024 Ruslan Bukin <br@bsdpad.com>
6  *
7  * This software was developed by the University of Cambridge Computer
8  * Laboratory (Department of Computer Science and Technology) under Innovate
9  * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
10  * Prototype".
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/cpuset.h>
37 #include <sys/kernel.h>
38 #include <sys/linker.h>
39 #include <sys/lock.h>
40 #include <sys/malloc.h>
41 #include <sys/module.h>
42 #include <sys/mutex.h>
43 #include <sys/pcpu.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_extern.h>
57 #include <vm/vm_param.h>
58 
59 #include <machine/riscvreg.h>
60 #include <machine/cpu.h>
61 #include <machine/fpe.h>
62 #include <machine/machdep.h>
63 #include <machine/pcb.h>
64 #include <machine/smp.h>
65 #include <machine/vm.h>
66 #include <machine/vmparam.h>
67 #include <machine/vmm.h>
68 #include <machine/vmm_instruction_emul.h>
69 
70 #include <dev/pci/pcireg.h>
71 
72 #include <dev/vmm/vmm_dev.h>
73 #include <dev/vmm/vmm_ktr.h>
74 
75 #include "vmm_stat.h"
76 #include "riscv.h"
77 
78 #include "vmm_aplic.h"
79 
/*
 * State of a single guest vcpu.
 *
 * The letters in parentheses follow the initialization legend given in the
 * comment above 'struct vm'.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* changed by vcpu_set_state_locked() */
	struct mtx	mtx;		/* spin mutex, see vcpu_lock*() */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
					/* (NOCPU unless state is RUNNING) */
	int		vcpuid;		/* index into vm->vcpu[] */
	void		*stats;		/* from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* returned by vm_exitinfo() */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct fpreg	*guestfpu;	/* (a,i) guest fpu state */
};
93 
/*
 * Wrappers around the per-vcpu mutex.  The mutex is created with MTX_SPIN,
 * so it is taken and dropped with the spin variants of the mutex calls.
 */
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
100 
/*
 * A guest memory segment: a VM object that can be mapped into the guest
 * address space through vm_mmap_memseg().  A slot whose 'object' is NULL is
 * unused.
 */
struct mem_seg {
	uint64_t	gpa;
	size_t		len;		/* bytes, a multiple of the page size */
	bool		wired;
	bool		sysmem;		/* segment backs guest system memory */
	vm_object_t	object;		/* backing object, NULL if unused */
};
/* Number of entries in vm->mem_segs[]. */
#define	VM_MAX_MEMSEGS	3
109 
/*
 * One mapping of (part of) a memory segment into the guest physical address
 * space.  A slot with 'len' equal to zero is unused.
 */
struct mem_map {
	vm_paddr_t	gpa;		/* guest physical start address */
	size_t		len;		/* length in bytes, 0 if slot is free */
	vm_ooffset_t	segoff;		/* offset of the mapping in the segment */
	int		segid;		/* index into vm->mem_segs[] */
	int		prot;		/* VM_PROT_* bits of the mapping */
	int		flags;		/* VM_MEMMAP_F_* flags */
};
/* Number of entries in vm->mem_maps[]. */
#define	VM_MAX_MEMMAPS	4
119 
/*
 * An in-kernel emulated MMIO range.  The range covers [start, end); a slot
 * with both 'start' and 'end' equal to zero is treated as free.
 */
struct vmm_mmio_region {
	uint64_t start;			/* first guest physical address */
	uint64_t end;			/* one past the last address */
	mem_region_read_t read;		/* handler for emulated reads */
	mem_region_write_t write;	/* handler for emulated writes */
};
/* Number of entries in vm->mmio_region[]. */
#define	VM_MAX_MMIO_REGIONS	4
127 
/*
 * Per virtual machine state.
 *
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug*/
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	mem_segs_lock;		/* (o) */
	struct sx	vcpus_init_lock;	/* (o) */
};
157 
/* Set once MOD_LOAD has completed successfully; checked by vm_create(). */
static bool vmm_initialized = false;

/* malloc(9) type used for all allocations made in this file. */
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/* IPI number handed to ipi_cpu() when kicking a running vcpu. */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

/* Per-VM vcpu limit; computed in vmm_init() and copied into vm->maxcpus. */
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

/* Forward declarations of helpers that are used before their definition. */
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
189 
190 static void
191 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
192 {
193 	vmmops_vcpu_cleanup(vcpu->cookie);
194 	vcpu->cookie = NULL;
195 	if (destroy) {
196 		vmm_stat_free(vcpu->stats);
197 		fpu_save_area_free(vcpu->guestfpu);
198 		vcpu_lock_destroy(vcpu);
199 	}
200 }
201 
202 static struct vcpu *
203 vcpu_alloc(struct vm *vm, int vcpu_id)
204 {
205 	struct vcpu *vcpu;
206 
207 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
208 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
209 
210 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
211 	vcpu_lock_init(vcpu);
212 	vcpu->state = VCPU_IDLE;
213 	vcpu->hostcpu = NOCPU;
214 	vcpu->vcpuid = vcpu_id;
215 	vcpu->vm = vm;
216 	vcpu->guestfpu = fpu_save_area_alloc();
217 	vcpu->stats = vmm_stat_alloc();
218 	return (vcpu);
219 }
220 
221 static void
222 vcpu_init(struct vcpu *vcpu)
223 {
224 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
225 	MPASS(vcpu->cookie != NULL);
226 	fpu_save_area_reset(vcpu->guestfpu);
227 	vmm_stat_init(vcpu->stats);
228 }
229 
230 struct vm_exit *
231 vm_exitinfo(struct vcpu *vcpu)
232 {
233 	return (&vcpu->exitinfo);
234 }
235 
236 static int
237 vmm_init(void)
238 {
239 
240 	vm_maxcpu = mp_ncpus;
241 
242 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
243 
244 	if (vm_maxcpu > VM_MAXCPU) {
245 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
246 		vm_maxcpu = VM_MAXCPU;
247 	}
248 
249 	if (vm_maxcpu == 0)
250 		vm_maxcpu = 1;
251 
252 	return (vmmops_modinit());
253 }
254 
255 static int
256 vmm_handler(module_t mod, int what, void *arg)
257 {
258 	int error;
259 
260 	switch (what) {
261 	case MOD_LOAD:
262 		error = vmmdev_init();
263 		if (error != 0)
264 			break;
265 		error = vmm_init();
266 		if (error == 0)
267 			vmm_initialized = true;
268 		else
269 			(void)vmmdev_cleanup();
270 		break;
271 	case MOD_UNLOAD:
272 		error = vmmdev_cleanup();
273 		if (error == 0 && vmm_initialized) {
274 			error = vmmops_modcleanup();
275 			if (error) {
276 				/*
277 				 * Something bad happened - prevent new
278 				 * VMs from being created
279 				 */
280 				vmm_initialized = false;
281 			}
282 		}
283 		break;
284 	default:
285 		error = 0;
286 		break;
287 	}
288 	return (error);
289 }
290 
/* Module description: module name, event handler and handler argument. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_DEVFS + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
304 
305 static void
306 vm_init(struct vm *vm, bool create)
307 {
308 	int i;
309 
310 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
311 	MPASS(vm->cookie != NULL);
312 
313 	CPU_ZERO(&vm->active_cpus);
314 	CPU_ZERO(&vm->debug_cpus);
315 
316 	vm->suspend = 0;
317 	CPU_ZERO(&vm->suspended_cpus);
318 
319 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
320 
321 	if (!create) {
322 		for (i = 0; i < vm->maxcpus; i++) {
323 			if (vm->vcpu[i] != NULL)
324 				vcpu_init(vm->vcpu[i]);
325 		}
326 	}
327 }
328 
329 void
330 vm_disable_vcpu_creation(struct vm *vm)
331 {
332 	sx_xlock(&vm->vcpus_init_lock);
333 	vm->dying = true;
334 	sx_xunlock(&vm->vcpus_init_lock);
335 }
336 
337 struct vcpu *
338 vm_alloc_vcpu(struct vm *vm, int vcpuid)
339 {
340 	struct vcpu *vcpu;
341 
342 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
343 		return (NULL);
344 
345 	/* Some interrupt controllers may have a CPU limit */
346 	if (vcpuid >= aplic_max_cpu_count(vm->cookie))
347 		return (NULL);
348 
349 	vcpu = (struct vcpu *)
350 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
351 	if (__predict_true(vcpu != NULL))
352 		return (vcpu);
353 
354 	sx_xlock(&vm->vcpus_init_lock);
355 	vcpu = vm->vcpu[vcpuid];
356 	if (vcpu == NULL && !vm->dying) {
357 		vcpu = vcpu_alloc(vm, vcpuid);
358 		vcpu_init(vcpu);
359 
360 		/*
361 		 * Ensure vCPU is fully created before updating pointer
362 		 * to permit unlocked reads above.
363 		 */
364 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
365 		    (uintptr_t)vcpu);
366 	}
367 	sx_xunlock(&vm->vcpus_init_lock);
368 	return (vcpu);
369 }
370 
371 void
372 vm_slock_vcpus(struct vm *vm)
373 {
374 	sx_slock(&vm->vcpus_init_lock);
375 }
376 
377 void
378 vm_unlock_vcpus(struct vm *vm)
379 {
380 	sx_unlock(&vm->vcpus_init_lock);
381 }
382 
383 int
384 vm_create(const char *name, struct vm **retvm)
385 {
386 	struct vm *vm;
387 	struct vmspace *vmspace;
388 
389 	/*
390 	 * If vmm.ko could not be successfully initialized then don't attempt
391 	 * to create the virtual machine.
392 	 */
393 	if (!vmm_initialized)
394 		return (ENXIO);
395 
396 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
397 		return (EINVAL);
398 
399 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
400 	if (vmspace == NULL)
401 		return (ENOMEM);
402 
403 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
404 	strcpy(vm->name, name);
405 	vm->vmspace = vmspace;
406 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
407 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
408 
409 	vm->sockets = 1;
410 	vm->cores = 1;			/* XXX backwards compatibility */
411 	vm->threads = 1;		/* XXX backwards compatibility */
412 	vm->maxcpus = vm_maxcpu;
413 
414 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
415 	    M_WAITOK | M_ZERO);
416 
417 	vm_init(vm, true);
418 
419 	*retvm = vm;
420 	return (0);
421 }
422 
423 void
424 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
425     uint16_t *threads, uint16_t *maxcpus)
426 {
427 	*sockets = vm->sockets;
428 	*cores = vm->cores;
429 	*threads = vm->threads;
430 	*maxcpus = vm->maxcpus;
431 }
432 
433 uint16_t
434 vm_get_maxcpus(struct vm *vm)
435 {
436 	return (vm->maxcpus);
437 }
438 
439 int
440 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
441     uint16_t threads, uint16_t maxcpus)
442 {
443 	/* Ignore maxcpus. */
444 	if ((sockets * cores * threads) > vm->maxcpus)
445 		return (EINVAL);
446 	vm->sockets = sockets;
447 	vm->cores = cores;
448 	vm->threads = threads;
449 	return(0);
450 }
451 
452 static void
453 vm_cleanup(struct vm *vm, bool destroy)
454 {
455 	struct mem_map *mm;
456 	int i;
457 
458 	aplic_detach_from_vm(vm->cookie);
459 
460 	for (i = 0; i < vm->maxcpus; i++) {
461 		if (vm->vcpu[i] != NULL)
462 			vcpu_cleanup(vm->vcpu[i], destroy);
463 	}
464 
465 	vmmops_cleanup(vm->cookie);
466 
467 	/*
468 	 * System memory is removed from the guest address space only when
469 	 * the VM is destroyed. This is because the mapping remains the same
470 	 * across VM reset.
471 	 *
472 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
473 	 * so those mappings are removed on a VM reset.
474 	 */
475 	if (!destroy) {
476 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
477 			mm = &vm->mem_maps[i];
478 			if (destroy || !sysmem_mapping(vm, mm))
479 				vm_free_memmap(vm, i);
480 		}
481 	}
482 
483 	if (destroy) {
484 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
485 			vm_free_memseg(vm, i);
486 
487 		vmmops_vmspace_free(vm->vmspace);
488 		vm->vmspace = NULL;
489 
490 		for (i = 0; i < vm->maxcpus; i++)
491 			free(vm->vcpu[i], M_VMM);
492 		free(vm->vcpu, M_VMM);
493 		sx_destroy(&vm->vcpus_init_lock);
494 		sx_destroy(&vm->mem_segs_lock);
495 	}
496 }
497 
498 void
499 vm_destroy(struct vm *vm)
500 {
501 
502 	vm_cleanup(vm, true);
503 
504 	free(vm, M_VMM);
505 }
506 
507 int
508 vm_reinit(struct vm *vm)
509 {
510 	int error;
511 
512 	/*
513 	 * A virtual machine can be reset only if all vcpus are suspended.
514 	 */
515 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
516 		vm_cleanup(vm, false);
517 		vm_init(vm, false);
518 		error = 0;
519 	} else {
520 		error = EBUSY;
521 	}
522 
523 	return (error);
524 }
525 
526 const char *
527 vm_name(struct vm *vm)
528 {
529 	return (vm->name);
530 }
531 
532 void
533 vm_slock_memsegs(struct vm *vm)
534 {
535 	sx_slock(&vm->mem_segs_lock);
536 }
537 
538 void
539 vm_xlock_memsegs(struct vm *vm)
540 {
541 	sx_xlock(&vm->mem_segs_lock);
542 }
543 
544 void
545 vm_unlock_memsegs(struct vm *vm)
546 {
547 	sx_unlock(&vm->mem_segs_lock);
548 }
549 
550 /*
551  * Return 'true' if 'gpa' is allocated in the guest address space.
552  *
553  * This function is called in the context of a running vcpu which acts as
554  * an implicit lock on 'vm->mem_maps[]'.
555  */
556 bool
557 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
558 {
559 	struct vm *vm = vcpu->vm;
560 	struct mem_map *mm;
561 	int i;
562 
563 #ifdef INVARIANTS
564 	int hostcpu, state;
565 	state = vcpu_get_state(vcpu, &hostcpu);
566 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
567 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
568 #endif
569 
570 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
571 		mm = &vm->mem_maps[i];
572 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
573 			return (true);		/* 'gpa' is sysmem or devmem */
574 	}
575 
576 	return (false);
577 }
578 
579 int
580 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
581 {
582 	struct mem_seg *seg;
583 	vm_object_t obj;
584 
585 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
586 
587 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
588 		return (EINVAL);
589 
590 	if (len == 0 || (len & PAGE_MASK))
591 		return (EINVAL);
592 
593 	seg = &vm->mem_segs[ident];
594 	if (seg->object != NULL) {
595 		if (seg->len == len && seg->sysmem == sysmem)
596 			return (EEXIST);
597 		else
598 			return (EINVAL);
599 	}
600 
601 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
602 	if (obj == NULL)
603 		return (ENOMEM);
604 
605 	seg->len = len;
606 	seg->object = obj;
607 	seg->sysmem = sysmem;
608 	return (0);
609 }
610 
611 int
612 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
613     vm_object_t *objptr)
614 {
615 	struct mem_seg *seg;
616 
617 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
618 
619 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
620 		return (EINVAL);
621 
622 	seg = &vm->mem_segs[ident];
623 	if (len)
624 		*len = seg->len;
625 	if (sysmem)
626 		*sysmem = seg->sysmem;
627 	if (objptr)
628 		*objptr = seg->object;
629 	return (0);
630 }
631 
632 void
633 vm_free_memseg(struct vm *vm, int ident)
634 {
635 	struct mem_seg *seg;
636 
637 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
638 	    ("%s: invalid memseg ident %d", __func__, ident));
639 
640 	seg = &vm->mem_segs[ident];
641 	if (seg->object != NULL) {
642 		vm_object_deallocate(seg->object);
643 		bzero(seg, sizeof(struct mem_seg));
644 	}
645 }
646 
647 int
648 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
649     size_t len, int prot, int flags)
650 {
651 	struct mem_seg *seg;
652 	struct mem_map *m, *map;
653 	vm_ooffset_t last;
654 	int i, error;
655 
656 	dprintf("%s: gpa %lx first %lx len %lx\n", __func__, gpa, first, len);
657 
658 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
659 		return (EINVAL);
660 
661 	if (flags & ~VM_MEMMAP_F_WIRED)
662 		return (EINVAL);
663 
664 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
665 		return (EINVAL);
666 
667 	seg = &vm->mem_segs[segid];
668 	if (seg->object == NULL)
669 		return (EINVAL);
670 
671 	last = first + len;
672 	if (first < 0 || first >= last || last > seg->len)
673 		return (EINVAL);
674 
675 	if ((gpa | first | last) & PAGE_MASK)
676 		return (EINVAL);
677 
678 	map = NULL;
679 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
680 		m = &vm->mem_maps[i];
681 		if (m->len == 0) {
682 			map = m;
683 			break;
684 		}
685 	}
686 
687 	if (map == NULL)
688 		return (ENOSPC);
689 
690 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
691 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
692 	if (error != KERN_SUCCESS)
693 		return (EFAULT);
694 
695 	vm_object_reference(seg->object);
696 
697 	if (flags & VM_MEMMAP_F_WIRED) {
698 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
699 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
700 		if (error != KERN_SUCCESS) {
701 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
702 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
703 			    EFAULT);
704 		}
705 	}
706 
707 	map->gpa = gpa;
708 	map->len = len;
709 	map->segoff = first;
710 	map->segid = segid;
711 	map->prot = prot;
712 	map->flags = flags;
713 	return (0);
714 }
715 
716 int
717 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
718 {
719 	struct mem_map *m;
720 	int i;
721 
722 	dprintf("%s: gpa %lx len %lx\n", __func__, gpa, len);
723 
724 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
725 		m = &vm->mem_maps[i];
726 		if (m->gpa == gpa && m->len == len) {
727 			vm_free_memmap(vm, i);
728 			return (0);
729 		}
730 	}
731 
732 	return (EINVAL);
733 }
734 
735 int
736 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
737     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
738 {
739 	struct mem_map *mm, *mmnext;
740 	int i;
741 
742 	mmnext = NULL;
743 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
744 		mm = &vm->mem_maps[i];
745 		if (mm->len == 0 || mm->gpa < *gpa)
746 			continue;
747 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
748 			mmnext = mm;
749 	}
750 
751 	if (mmnext != NULL) {
752 		*gpa = mmnext->gpa;
753 		if (segid)
754 			*segid = mmnext->segid;
755 		if (segoff)
756 			*segoff = mmnext->segoff;
757 		if (len)
758 			*len = mmnext->len;
759 		if (prot)
760 			*prot = mmnext->prot;
761 		if (flags)
762 			*flags = mmnext->flags;
763 		return (0);
764 	} else {
765 		return (ENOENT);
766 	}
767 }
768 
769 static void
770 vm_free_memmap(struct vm *vm, int ident)
771 {
772 	struct mem_map *mm;
773 	int error __diagused;
774 
775 	mm = &vm->mem_maps[ident];
776 	if (mm->len) {
777 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
778 		    mm->gpa + mm->len);
779 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
780 		    __func__, error));
781 		bzero(mm, sizeof(struct mem_map));
782 	}
783 }
784 
785 static __inline bool
786 sysmem_mapping(struct vm *vm, struct mem_map *mm)
787 {
788 
789 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
790 		return (true);
791 	else
792 		return (false);
793 }
794 
795 vm_paddr_t
796 vmm_sysmem_maxaddr(struct vm *vm)
797 {
798 	struct mem_map *mm;
799 	vm_paddr_t maxaddr;
800 	int i;
801 
802 	maxaddr = 0;
803 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
804 		mm = &vm->mem_maps[i];
805 		if (sysmem_mapping(vm, mm)) {
806 			if (maxaddr < mm->gpa + mm->len)
807 				maxaddr = mm->gpa + mm->len;
808 		}
809 	}
810 	return (maxaddr);
811 }
812 
813 int
814 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
815     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
816 {
817 	int error;
818 
819 	error = vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
820 
821 	return (error);
822 }
823 
824 void
825 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
826     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
827 {
828 	int i;
829 
830 	for (i = 0; i < nitems(vm->mmio_region); i++) {
831 		if (vm->mmio_region[i].start == 0 &&
832 		    vm->mmio_region[i].end == 0) {
833 			vm->mmio_region[i].start = start;
834 			vm->mmio_region[i].end = start + size;
835 			vm->mmio_region[i].read = mmio_read;
836 			vm->mmio_region[i].write = mmio_write;
837 			return;
838 		}
839 	}
840 
841 	panic("%s: No free MMIO region", __func__);
842 }
843 
844 void
845 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
846 {
847 	int i;
848 
849 	for (i = 0; i < nitems(vm->mmio_region); i++) {
850 		if (vm->mmio_region[i].start == start &&
851 		    vm->mmio_region[i].end == start + size) {
852 			memset(&vm->mmio_region[i], 0,
853 			    sizeof(vm->mmio_region[i]));
854 			return;
855 		}
856 	}
857 
858 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
859 	    start + size);
860 }
861 
862 static int
863 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
864 {
865 	struct vm *vm;
866 	struct vm_exit *vme;
867 	struct vie *vie;
868 	struct hyp *hyp;
869 	uint64_t fault_ipa;
870 	struct vm_guest_paging *paging;
871 	struct vmm_mmio_region *vmr;
872 	int error, i;
873 
874 	vm = vcpu->vm;
875 	hyp = vm->cookie;
876 	if (!hyp->aplic_attached)
877 		goto out_user;
878 
879 	vme = &vcpu->exitinfo;
880 	vie = &vme->u.inst_emul.vie;
881 	paging = &vme->u.inst_emul.paging;
882 
883 	fault_ipa = vme->u.inst_emul.gpa;
884 
885 	vmr = NULL;
886 	for (i = 0; i < nitems(vm->mmio_region); i++) {
887 		if (vm->mmio_region[i].start <= fault_ipa &&
888 		    vm->mmio_region[i].end > fault_ipa) {
889 			vmr = &vm->mmio_region[i];
890 			break;
891 		}
892 	}
893 	if (vmr == NULL)
894 		goto out_user;
895 
896 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
897 	    vmr->read, vmr->write, retu);
898 	return (error);
899 
900 out_user:
901 	*retu = true;
902 	return (0);
903 }
904 
905 int
906 vm_suspend(struct vm *vm, enum vm_suspend_how how)
907 {
908 	int i;
909 
910 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
911 		return (EINVAL);
912 
913 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
914 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
915 		    vm->suspend, how);
916 		return (EALREADY);
917 	}
918 
919 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
920 
921 	/*
922 	 * Notify all active vcpus that they are now suspended.
923 	 */
924 	for (i = 0; i < vm->maxcpus; i++) {
925 		if (CPU_ISSET(i, &vm->active_cpus))
926 			vcpu_notify_event(vm_vcpu(vm, i));
927 	}
928 
929 	return (0);
930 }
931 
932 void
933 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
934 {
935 	struct vm *vm = vcpu->vm;
936 	struct vm_exit *vmexit;
937 
938 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
939 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
940 
941 	vmexit = vm_exitinfo(vcpu);
942 	vmexit->pc = pc;
943 	vmexit->inst_length = 4;
944 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
945 	vmexit->u.suspended.how = vm->suspend;
946 }
947 
948 void
949 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
950 {
951 	struct vm_exit *vmexit;
952 
953 	vmexit = vm_exitinfo(vcpu);
954 	vmexit->pc = pc;
955 	vmexit->inst_length = 4;
956 	vmexit->exitcode = VM_EXITCODE_DEBUG;
957 }
958 
959 int
960 vm_activate_cpu(struct vcpu *vcpu)
961 {
962 	struct vm *vm = vcpu->vm;
963 
964 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
965 		return (EBUSY);
966 
967 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
968 	return (0);
969 
970 }
971 
972 int
973 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
974 {
975 	if (vcpu == NULL) {
976 		vm->debug_cpus = vm->active_cpus;
977 		for (int i = 0; i < vm->maxcpus; i++) {
978 			if (CPU_ISSET(i, &vm->active_cpus))
979 				vcpu_notify_event(vm_vcpu(vm, i));
980 		}
981 	} else {
982 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
983 			return (EINVAL);
984 
985 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
986 		vcpu_notify_event(vcpu);
987 	}
988 	return (0);
989 }
990 
991 int
992 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
993 {
994 
995 	if (vcpu == NULL) {
996 		CPU_ZERO(&vm->debug_cpus);
997 	} else {
998 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
999 			return (EINVAL);
1000 
1001 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1002 	}
1003 	return (0);
1004 }
1005 
1006 int
1007 vcpu_debugged(struct vcpu *vcpu)
1008 {
1009 
1010 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1011 }
1012 
1013 cpuset_t
1014 vm_active_cpus(struct vm *vm)
1015 {
1016 
1017 	return (vm->active_cpus);
1018 }
1019 
1020 cpuset_t
1021 vm_debug_cpus(struct vm *vm)
1022 {
1023 
1024 	return (vm->debug_cpus);
1025 }
1026 
1027 cpuset_t
1028 vm_suspended_cpus(struct vm *vm)
1029 {
1030 
1031 	return (vm->suspended_cpus);
1032 }
1033 
1034 
1035 void *
1036 vcpu_stats(struct vcpu *vcpu)
1037 {
1038 
1039 	return (vcpu->stats);
1040 }
1041 
1042 /*
1043  * This function is called to ensure that a vcpu "sees" a pending event
1044  * as soon as possible:
1045  * - If the vcpu thread is sleeping then it is woken up.
1046  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1047  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1048  */
1049 static void
1050 vcpu_notify_event_locked(struct vcpu *vcpu)
1051 {
1052 	int hostcpu;
1053 
1054 	hostcpu = vcpu->hostcpu;
1055 	if (vcpu->state == VCPU_RUNNING) {
1056 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1057 		if (hostcpu != curcpu) {
1058 			ipi_cpu(hostcpu, vmm_ipinum);
1059 		} else {
1060 			/*
1061 			 * If the 'vcpu' is running on 'curcpu' then it must
1062 			 * be sending a notification to itself (e.g. SELF_IPI).
1063 			 * The pending event will be picked up when the vcpu
1064 			 * transitions back to guest context.
1065 			 */
1066 		}
1067 	} else {
1068 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1069 		    "with hostcpu %d", vcpu->state, hostcpu));
1070 		if (vcpu->state == VCPU_SLEEPING)
1071 			wakeup_one(vcpu);
1072 	}
1073 }
1074 
/* Notify 'vcpu' of a pending event, taking the vcpu lock around the call. */
void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	{
		vcpu_notify_event_locked(vcpu);
	}
	vcpu_unlock(vcpu);
}
1082 
/*
 * Switch the FPU from host to guest state: the host state of the current
 * thread is saved first, then the registers are loaded from the guest save
 * area of 'vcpu'.  The order of the steps below matters.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* Flush host state to the pcb. */
	fpe_state_save(curthread);

	/* Ensure the FPU state will be re-loaded when exiting the guest. */
	PCPU_SET(fpcurthread, NULL);

	/* Restore guest FPU state; the FPU is enabled only for the load. */
	fpe_enable();
	fpe_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpe_disable();
}
1103 
/*
 * Store the FPU registers into the guest save area of 'vcpu'.  The FPU is
 * enabled only for the duration of the store.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	/* Save guest FPE state. */
	fpe_enable();
	fpe_store(vcpu->guestfpu);
	fpe_disable();

	/* No host thread may own the FPU while it holds guest registers. */
	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}
1116 
1117 static int
1118 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1119     bool from_idle)
1120 {
1121 	int error;
1122 
1123 	vcpu_assert_locked(vcpu);
1124 
1125 	/*
1126 	 * State transitions from the vmmdev_ioctl() must always begin from
1127 	 * the VCPU_IDLE state. This guarantees that there is only a single
1128 	 * ioctl() operating on a vcpu at any point.
1129 	 */
1130 	if (from_idle) {
1131 		while (vcpu->state != VCPU_IDLE) {
1132 			vcpu_notify_event_locked(vcpu);
1133 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1134 		}
1135 	} else {
1136 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1137 		    "vcpu idle state"));
1138 	}
1139 
1140 	if (vcpu->state == VCPU_RUNNING) {
1141 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1142 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1143 	} else {
1144 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1145 		    "vcpu that is not running", vcpu->hostcpu));
1146 	}
1147 
1148 	/*
1149 	 * The following state transitions are allowed:
1150 	 * IDLE -> FROZEN -> IDLE
1151 	 * FROZEN -> RUNNING -> FROZEN
1152 	 * FROZEN -> SLEEPING -> FROZEN
1153 	 */
1154 	switch (vcpu->state) {
1155 	case VCPU_IDLE:
1156 	case VCPU_RUNNING:
1157 	case VCPU_SLEEPING:
1158 		error = (newstate != VCPU_FROZEN);
1159 		break;
1160 	case VCPU_FROZEN:
1161 		error = (newstate == VCPU_FROZEN);
1162 		break;
1163 	default:
1164 		error = 1;
1165 		break;
1166 	}
1167 
1168 	if (error)
1169 		return (EBUSY);
1170 
1171 	vcpu->state = newstate;
1172 	if (newstate == VCPU_RUNNING)
1173 		vcpu->hostcpu = curcpu;
1174 	else
1175 		vcpu->hostcpu = NOCPU;
1176 
1177 	if (newstate == VCPU_IDLE)
1178 		wakeup(&vcpu->state);
1179 
1180 	return (0);
1181 }
1182 
1183 static void
1184 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1185 {
1186 	int error;
1187 
1188 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1189 		panic("Error %d setting state to %d\n", error, newstate);
1190 }
1191 
1192 static void
1193 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1194 {
1195 	int error;
1196 
1197 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1198 		panic("Error %d setting state to %d", error, newstate);
1199 }
1200 
1201 int
1202 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1203 {
1204 
1205 	if (type < 0 || type >= VM_CAP_MAX)
1206 		return (EINVAL);
1207 
1208 	return (vmmops_getcap(vcpu->cookie, type, retval));
1209 }
1210 
1211 int
1212 vm_set_capability(struct vcpu *vcpu, int type, int val)
1213 {
1214 
1215 	if (type < 0 || type >= VM_CAP_MAX)
1216 		return (EINVAL);
1217 
1218 	return (vmmops_setcap(vcpu->cookie, type, val));
1219 }
1220 
1221 struct vm *
1222 vcpu_vm(struct vcpu *vcpu)
1223 {
1224 
1225 	return (vcpu->vm);
1226 }
1227 
1228 int
1229 vcpu_vcpuid(struct vcpu *vcpu)
1230 {
1231 
1232 	return (vcpu->vcpuid);
1233 }
1234 
1235 void *
1236 vcpu_get_cookie(struct vcpu *vcpu)
1237 {
1238 
1239 	return (vcpu->cookie);
1240 }
1241 
1242 struct vcpu *
1243 vm_vcpu(struct vm *vm, int vcpuid)
1244 {
1245 
1246 	return (vm->vcpu[vcpuid]);
1247 }
1248 
1249 int
1250 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1251 {
1252 	int error;
1253 
1254 	vcpu_lock(vcpu);
1255 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1256 	vcpu_unlock(vcpu);
1257 
1258 	return (error);
1259 }
1260 
1261 enum vcpu_state
1262 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1263 {
1264 	enum vcpu_state state;
1265 
1266 	vcpu_lock(vcpu);
1267 	state = vcpu->state;
1268 	if (hostcpu != NULL)
1269 		*hostcpu = vcpu->hostcpu;
1270 	vcpu_unlock(vcpu);
1271 
1272 	return (state);
1273 }
1274 
1275 static void *
1276 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1277     void **cookie)
1278 {
1279 	int i, count, pageoff;
1280 	struct mem_map *mm;
1281 	vm_page_t m;
1282 
1283 	pageoff = gpa & PAGE_MASK;
1284 	if (len > PAGE_SIZE - pageoff)
1285 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1286 
1287 	count = 0;
1288 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1289 		mm = &vm->mem_maps[i];
1290 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1291 		    gpa < mm->gpa + mm->len) {
1292 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1293 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1294 			break;
1295 		}
1296 	}
1297 
1298 	if (count == 1) {
1299 		*cookie = m;
1300 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1301 	} else {
1302 		*cookie = NULL;
1303 		return (NULL);
1304 	}
1305 }
1306 
1307 void *
1308 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1309 	    void **cookie)
1310 {
1311 #ifdef INVARIANTS
1312 	/*
1313 	 * The current vcpu should be frozen to ensure 'vm_memmap[]'
1314 	 * stability.
1315 	 */
1316 	int state = vcpu_get_state(vcpu, NULL);
1317 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1318 	    __func__, state));
1319 #endif
1320 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1321 }
1322 
1323 void *
1324 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1325     void **cookie)
1326 {
1327 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1328 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1329 }
1330 
1331 void
1332 vm_gpa_release(void *cookie)
1333 {
1334 	vm_page_t m = cookie;
1335 
1336 	vm_page_unwire(m, PQ_ACTIVE);
1337 }
1338 
1339 int
1340 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1341 {
1342 
1343 	if (reg >= VM_REG_LAST)
1344 		return (EINVAL);
1345 
1346 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1347 }
1348 
1349 int
1350 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1351 {
1352 	int error;
1353 
1354 	if (reg >= VM_REG_LAST)
1355 		return (EINVAL);
1356 	error = vmmops_setreg(vcpu->cookie, reg, val);
1357 	if (error || reg != VM_REG_GUEST_SEPC)
1358 		return (error);
1359 
1360 	vcpu->nextpc = val;
1361 
1362 	return (0);
1363 }
1364 
1365 void *
1366 vm_get_cookie(struct vm *vm)
1367 {
1368 
1369 	return (vm->cookie);
1370 }
1371 
1372 int
1373 vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
1374 {
1375 
1376 	return (vmmops_exception(vcpu->cookie, scause));
1377 }
1378 
1379 int
1380 vm_attach_aplic(struct vm *vm, struct vm_aplic_descr *descr)
1381 {
1382 
1383 	return (aplic_attach_to_vm(vm->cookie, descr));
1384 }
1385 
1386 int
1387 vm_assert_irq(struct vm *vm, uint32_t irq)
1388 {
1389 
1390 	return (aplic_inject_irq(vm->cookie, -1, irq, true));
1391 }
1392 
1393 int
1394 vm_deassert_irq(struct vm *vm, uint32_t irq)
1395 {
1396 
1397 	return (aplic_inject_irq(vm->cookie, -1, irq, false));
1398 }
1399 
1400 int
1401 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1402     int func)
1403 {
1404 
1405 	return (aplic_inject_msi(vm->cookie, msg, addr));
1406 }
1407 
1408 static int
1409 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1410 {
1411 
1412 	vcpu_lock(vcpu);
1413 
1414 	while (1) {
1415 		if (aplic_check_pending(vcpu->cookie))
1416 			break;
1417 
1418 		if (riscv_check_ipi(vcpu->cookie, false))
1419 			break;
1420 
1421 		if (riscv_check_interrupts_pending(vcpu->cookie))
1422 			break;
1423 
1424 		if (vcpu_should_yield(vcpu))
1425 			break;
1426 
1427 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1428 		/*
1429 		 * XXX msleep_spin() cannot be interrupted by signals so
1430 		 * wake up periodically to check pending signals.
1431 		 */
1432 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
1433 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1434 	}
1435 	vcpu_unlock(vcpu);
1436 
1437 	*retu = false;
1438 
1439 	return (0);
1440 }
1441 
1442 static int
1443 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1444 {
1445 	struct vm *vm;
1446 	struct vm_exit *vme;
1447 	struct vm_map *map;
1448 	uint64_t addr;
1449 	pmap_t pmap;
1450 	int ftype, rv;
1451 
1452 	vm = vcpu->vm;
1453 	vme = &vcpu->exitinfo;
1454 
1455 	pmap = vmspace_pmap(vm->vmspace);
1456 	addr = (vme->htval << 2) & ~(PAGE_SIZE - 1);
1457 
1458 	dprintf("%s: %lx\n", __func__, addr);
1459 
1460 	switch (vme->scause) {
1461 	case SCAUSE_STORE_GUEST_PAGE_FAULT:
1462 		ftype = VM_PROT_WRITE;
1463 		break;
1464 	case SCAUSE_FETCH_GUEST_PAGE_FAULT:
1465 		ftype = VM_PROT_EXECUTE;
1466 		break;
1467 	case SCAUSE_LOAD_GUEST_PAGE_FAULT:
1468 		ftype = VM_PROT_READ;
1469 		break;
1470 	default:
1471 		panic("unknown page trap: %lu", vme->scause);
1472 	}
1473 
1474 	/* The page exists, but the page table needs to be updated. */
1475 	if (pmap_fault(pmap, addr, ftype))
1476 		return (0);
1477 
1478 	map = &vm->vmspace->vm_map;
1479 	rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
1480 	if (rv != KERN_SUCCESS) {
1481 		printf("%s: vm_fault failed, addr %lx, ftype %d, err %d\n",
1482 		    __func__, addr, ftype, rv);
1483 		return (EFAULT);
1484 	}
1485 
1486 	return (0);
1487 }
1488 
1489 static int
1490 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1491 {
1492 	struct vm *vm = vcpu->vm;
1493 	int error, i;
1494 	struct thread *td;
1495 
1496 	error = 0;
1497 	td = curthread;
1498 
1499 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1500 
1501 	/*
1502 	 * Wait until all 'active_cpus' have suspended themselves.
1503 	 *
1504 	 * Since a VM may be suspended at any time including when one or
1505 	 * more vcpus are doing a rendezvous we need to call the rendezvous
1506 	 * handler while we are waiting to prevent a deadlock.
1507 	 */
1508 	vcpu_lock(vcpu);
1509 	while (error == 0) {
1510 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1511 			break;
1512 
1513 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1514 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1515 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1516 		if (td_ast_pending(td, TDA_SUSPEND)) {
1517 			vcpu_unlock(vcpu);
1518 			error = thread_check_susp(td, false);
1519 			vcpu_lock(vcpu);
1520 		}
1521 	}
1522 	vcpu_unlock(vcpu);
1523 
1524 	/*
1525 	 * Wakeup the other sleeping vcpus and return to userspace.
1526 	 */
1527 	for (i = 0; i < vm->maxcpus; i++) {
1528 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1529 			vcpu_notify_event(vm_vcpu(vm, i));
1530 		}
1531 	}
1532 
1533 	*retu = true;
1534 	return (error);
1535 }
1536 
1537 int
1538 vm_run(struct vcpu *vcpu)
1539 {
1540 	struct vm_eventinfo evinfo;
1541 	struct vm_exit *vme;
1542 	struct vm *vm;
1543 	pmap_t pmap;
1544 	int error;
1545 	int vcpuid;
1546 	bool retu;
1547 
1548 	vm = vcpu->vm;
1549 
1550 	dprintf("%s\n", __func__);
1551 
1552 	vcpuid = vcpu->vcpuid;
1553 
1554 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1555 		return (EINVAL);
1556 
1557 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1558 		return (EINVAL);
1559 
1560 	pmap = vmspace_pmap(vm->vmspace);
1561 	vme = &vcpu->exitinfo;
1562 	evinfo.rptr = NULL;
1563 	evinfo.sptr = &vm->suspend;
1564 	evinfo.iptr = NULL;
1565 restart:
1566 	critical_enter();
1567 
1568 	restore_guest_fpustate(vcpu);
1569 
1570 	vcpu_require_state(vcpu, VCPU_RUNNING);
1571 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1572 	vcpu_require_state(vcpu, VCPU_FROZEN);
1573 
1574 	save_guest_fpustate(vcpu);
1575 
1576 	critical_exit();
1577 
1578 	if (error == 0) {
1579 		retu = false;
1580 		switch (vme->exitcode) {
1581 		case VM_EXITCODE_INST_EMUL:
1582 			vcpu->nextpc = vme->pc + vme->inst_length;
1583 			error = vm_handle_inst_emul(vcpu, &retu);
1584 			break;
1585 		case VM_EXITCODE_WFI:
1586 			vcpu->nextpc = vme->pc + vme->inst_length;
1587 			error = vm_handle_wfi(vcpu, vme, &retu);
1588 			break;
1589 		case VM_EXITCODE_ECALL:
1590 			/* Handle in userland. */
1591 			vcpu->nextpc = vme->pc + vme->inst_length;
1592 			retu = true;
1593 			break;
1594 		case VM_EXITCODE_PAGING:
1595 			vcpu->nextpc = vme->pc;
1596 			error = vm_handle_paging(vcpu, &retu);
1597 			break;
1598 		case VM_EXITCODE_BOGUS:
1599 			vcpu->nextpc = vme->pc;
1600 			retu = false;
1601 			error = 0;
1602 			break;
1603 		case VM_EXITCODE_SUSPENDED:
1604 			vcpu->nextpc = vme->pc;
1605 			error = vm_handle_suspend(vcpu, &retu);
1606 			break;
1607 		default:
1608 			/* Handle in userland. */
1609 			vcpu->nextpc = vme->pc;
1610 			retu = true;
1611 			break;
1612 		}
1613 	}
1614 
1615 	if (error == 0 && retu == false)
1616 		goto restart;
1617 
1618 	return (error);
1619 }
1620