xref: /freebsd/sys/arm64/vmm/vmm.c (revision df21a004be237a1dccd03c7b47254625eea62fa9)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/fpu.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <dev/pci/pcireg.h>
#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>

#include "arm64.h"
#include "mmu.h"

#include "io/vgic.h"
#include "io/vtimer.h"

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;
	void		*stats;
	struct vm_exit	exitinfo;
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct vfpstate	*guestfpu;	/* (a,i) guest fpu state */
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct vmm_mmio_region {
	uint64_t start;
	uint64_t end;
	mem_region_read_t read;
	mem_region_write_t write;
};
#define	VM_MAX_MMIO_REGIONS	4

struct vmm_special_reg {
	uint32_t	esr_iss;
	uint32_t	esr_mask;
	reg_read_t	reg_read;
	reg_write_t	reg_write;
	void		*arg;
};
#define	VM_MAX_SPECIAL_REGS	16

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vm_mem	mem;			/* (i) guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};

static bool vmm_initialized = false;

static int vm_handle_wfi(struct vcpu *vcpu,
			 struct vm_exit *vme, bool *retu);

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

struct vmm_regs {
	uint64_t	id_aa64afr0;
	uint64_t	id_aa64afr1;
	uint64_t	id_aa64dfr0;
	uint64_t	id_aa64dfr1;
	uint64_t	id_aa64isar0;
	uint64_t	id_aa64isar1;
	uint64_t	id_aa64isar2;
	uint64_t	id_aa64mmfr0;
	uint64_t	id_aa64mmfr1;
	uint64_t	id_aa64mmfr2;
	uint64_t	id_aa64pfr0;
	uint64_t	id_aa64pfr1;
};

static const struct vmm_regs vmm_arch_regs_masks = {
	.id_aa64dfr0 =
	    ID_AA64DFR0_CTX_CMPs_MASK |
	    ID_AA64DFR0_WRPs_MASK |
	    ID_AA64DFR0_BRPs_MASK |
	    ID_AA64DFR0_PMUVer_3 |
	    ID_AA64DFR0_DebugVer_8,
	.id_aa64isar0 =
	    ID_AA64ISAR0_TLB_TLBIOSR |
	    ID_AA64ISAR0_SHA3_IMPL |
	    ID_AA64ISAR0_RDM_IMPL |
	    ID_AA64ISAR0_Atomic_IMPL |
	    ID_AA64ISAR0_CRC32_BASE |
	    ID_AA64ISAR0_SHA2_512 |
	    ID_AA64ISAR0_SHA1_BASE |
	    ID_AA64ISAR0_AES_PMULL,
	.id_aa64mmfr0 =
	    ID_AA64MMFR0_TGran4_IMPL |
	    ID_AA64MMFR0_TGran64_IMPL |
	    ID_AA64MMFR0_TGran16_IMPL |
	    ID_AA64MMFR0_ASIDBits_16 |
	    ID_AA64MMFR0_PARange_4P,
	.id_aa64mmfr1 =
	    ID_AA64MMFR1_SpecSEI_IMPL |
	    ID_AA64MMFR1_PAN_ATS1E1 |
	    ID_AA64MMFR1_HAFDBS_AF,
	.id_aa64pfr0 =
	    ID_AA64PFR0_GIC_CPUIF_NONE |
	    ID_AA64PFR0_AdvSIMD_HP |
	    ID_AA64PFR0_FP_HP |
	    ID_AA64PFR0_EL3_64 |
	    ID_AA64PFR0_EL2_64 |
	    ID_AA64PFR0_EL1_64 |
	    ID_AA64PFR0_EL0_64,
};

/* Host registers masked by vmm_arch_regs_masks. */
static struct vmm_regs vmm_arch_regs;

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an fiq");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)

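/*
 * Fetch the host's ID registers, masked by vmm_arch_regs_masks, into
 * 'regs'. These are the values a guest sees when it reads the
 * corresponding ID_AA64*_EL1 registers (see the ID_SPECIAL_REG table
 * below).
 */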
static int
vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
{
#define	_FETCH_KERN_REG(reg, field) do {				\
	regs->field = vmm_arch_regs_masks.field;			\
	if (!get_kernel_reg_iss_masked(reg ## _ISS, &regs->field,	\
	    masks->field))						\
		regs->field = 0;					\
} while (0)
	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
#undef _FETCH_KERN_REG
	return (0);
}

static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
	vmmops_vcpu_cleanup(vcpu->cookie);
	vcpu->cookie = NULL;
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
		vcpu_lock_destroy(vcpu);
		free(vcpu, M_VMM);
	}
}

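/*
 * Allocate the per-vcpu state that lives for the lifetime of the VM;
 * state that must also be reset on vm_reinit() is (re)set by
 * vcpu_init().
 */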
static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));

	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
	vcpu_lock_init(vcpu);
	vcpu->state = VCPU_IDLE;
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vm = vm;
	vcpu->guestfpu = fpu_save_area_alloc();
	vcpu->stats = vmm_stat_alloc();
	return (vcpu);
}

static void
vcpu_init(struct vcpu *vcpu)
{
	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
	MPASS(vcpu->cookie != NULL);
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo);
}

static int
vmm_unsupported_quirk(void)
{
	/*
	 * Known to not load on Ampere eMAG
	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051
	 */
	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
	    CPU_PART_EMAG8180, 0, 0))
		return (ENXIO);

	return (0);
}

static int
vmm_init(void)
{
	int error;

	vm_maxcpu = mp_ncpus;
	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);

	if (vm_maxcpu > VM_MAXCPU) {
		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
		vm_maxcpu = VM_MAXCPU;
	}
	if (vm_maxcpu == 0)
		vm_maxcpu = 1;

	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
	if (error != 0)
		return (error);

	return (vmmops_modinit(0));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmm_unsupported_quirk();
		if (error != 0)
			break;
		error = vmmdev_init();
		if (error != 0)
			break;
		error = vmm_init();
		if (error == 0)
			vmm_initialized = true;
		else
			(void)vmmdev_cleanup();
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmmops_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - HYP initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
	MPASS(vm->cookie != NULL);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
	memset(vm->special_reg, 0, sizeof(vm->special_reg));

	if (!create) {
		for (i = 0; i < vm->maxcpus; i++) {
			if (vm->vcpu[i] != NULL)
				vcpu_init(vm->vcpu[i]);
		}
	}
}

void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

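/*
 * Look up, and allocate on first use, the vcpu with the given ID.
 * The lock-free fast path relies on the acquire load below pairing
 * with the release store that publishes a newly created vcpu.
 */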
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Some interrupt controllers may have a CPU limit */
	if (vcpuid >= vgic_max_cpu_count(vm->cookie))
		return (NULL);

	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}

void
vm_lock_vcpus(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
}

void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	int error;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
	if (error != 0) {
		free(vm, M_VMM);
		return (error);
	}
	strcpy(vm->name, name);
	sx_init(&vm->vcpus_init_lock, "vm vcpus");

	vm->sockets = 1;
	vm->cores = 1;			/* XXX backwards compatibility */
	vm->threads = 1;		/* XXX backwards compatibility */
	vm->maxcpus = vm_maxcpu;

	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
	    M_WAITOK | M_ZERO);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	/* Ignore maxcpus. */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	return (0);
}

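/*
 * Release VM state. If 'destroy' is true the VM is being torn down
 * completely, otherwise it is being reinitialized by vm_reinit() and
 * only the state set up by vm_init() is released.
 */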
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	pmap_t pmap __diagused;
	int i;

	if (destroy) {
		vm_xlock_memsegs(vm);
		pmap = vmspace_pmap(vm_vmspace(vm));
		sched_pin();
		PCPU_SET(curvmpmap, NULL);
		sched_unpin();
		CPU_FOREACH(i) {
			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
		}
	} else
		vm_assert_memseg_xlocked(vm);

	vgic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VMM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}

static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	*rval = 0;
	return (0);
}

static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	*rval = *(uint64_t *)arg;
	return (0);
}

static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	return (0);
}

static int
vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	struct hypctx *hypctx;

	hypctx = vcpu_get_cookie(vcpu);
	/* All other fields are RES0 and are otherwise ignored here. */
	/* TODO: Disable access to other debug state when locked */
	hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
	return (0);
}

static int
vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	struct hypctx *hypctx;
	uint64_t val;

	hypctx = vcpu_get_cookie(vcpu);
	val = OSLSR_OSLM_1;
	if (hypctx->dbg_oslock)
		val |= OSLSR_OSLK;
	*rval = val;

	return (0);
}

static const struct vmm_special_reg vmm_special_regs[] = {
#define	SPECIAL_REG(_reg, _read, _write)				\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = (_read),					\
		.reg_write = (_write),					\
		.arg = NULL,						\
	}
#define	ID_SPECIAL_REG(_reg, _name)					\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = vmm_reg_read_arg,				\
		.reg_write = vmm_reg_wi,				\
		.arg = &(vmm_arch_regs._name),				\
	}

	/* ID registers */
	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),

	/*
	 * All other ID registers are read as zero.
	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
	 */
	{
		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
		    (0 << ISS_MSR_OP1_SHIFT) |
		    (0 << ISS_MSR_CRn_SHIFT) |
		    (0 << ISS_MSR_CRm_SHIFT),
		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
		.reg_read = vmm_reg_raz,
		.reg_write = vmm_reg_wi,
		.arg = NULL,
	},

	/* Counter physical registers */
	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
	    vtimer_phys_cval_write),
	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
	    vtimer_phys_tval_write),
	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),

	/* Debug registers */
	SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
	SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
	/* TODO: Exceptions on invalid access */
	SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
	SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
#undef SPECIAL_REG
};

void
vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
    reg_read_t reg_read, reg_write_t reg_write, void *arg)
{
	int i;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == 0 &&
		    vm->special_reg[i].esr_mask == 0) {
			vm->special_reg[i].esr_iss = iss;
			vm->special_reg[i].esr_mask = mask;
			vm->special_reg[i].reg_read = reg_read;
			vm->special_reg[i].reg_write = reg_write;
			vm->special_reg[i].arg = arg;
			return;
		}
	}

	panic("%s: No free special register slot", __func__);
}

void
vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
{
	int i;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == iss &&
		    vm->special_reg[i].esr_mask == mask) {
			memset(&vm->special_reg[i], 0,
			    sizeof(vm->special_reg[i]));
			return;
		}
	}

	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
	    mask);
}

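/*
 * Handle a trapped MSR/MRS access by searching the VM-registered
 * handlers and then the global vmm_special_regs table for a matching
 * register. If nothing matches the exit is forwarded to userspace.
 */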
static int
vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vre *vre;
	int i, rv;

	vm = vcpu->vm;
	vme = &vcpu->exitinfo;
	vre = &vme->u.reg_emul.vre;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == 0 &&
		    vm->special_reg[i].esr_mask == 0)
			continue;

		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
		    vm->special_reg[i].esr_iss) {
			rv = vmm_emulate_register(vcpu, vre,
			    vm->special_reg[i].reg_read,
			    vm->special_reg[i].reg_write,
			    vm->special_reg[i].arg);
			if (rv == 0) {
				*retu = false;
			}
			return (rv);
		}
	}
	for (i = 0; i < nitems(vmm_special_regs); i++) {
		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
		    vmm_special_regs[i].esr_iss) {
			rv = vmm_emulate_register(vcpu, vre,
			    vmm_special_regs[i].reg_read,
			    vmm_special_regs[i].reg_write,
			    vmm_special_regs[i].arg);
			if (rv == 0) {
				*retu = false;
			}
			return (rv);
		}
	}

	*retu = true;
	return (0);
}

void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
    mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == 0 &&
		    vm->mmio_region[i].end == 0) {
			vm->mmio_region[i].start = start;
			vm->mmio_region[i].end = start + size;
			vm->mmio_region[i].read = mmio_read;
			vm->mmio_region[i].write = mmio_write;
			return;
		}
	}

	panic("%s: No free MMIO region", __func__);
}

void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == start &&
		    vm->mmio_region[i].end == start + size) {
			memset(&vm->mmio_region[i], 0,
			    sizeof(vm->mmio_region[i]));
			return;
		}
	}

	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
	    start + size);
}

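/*
 * Emulate an access to a virtual device by looking up the MMIO region
 * containing the faulting IPA. Faults that miss every region, or that
 * arrive before a vgic is attached, are forwarded to userspace.
 */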
static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vie *vie;
	struct hyp *hyp;
	uint64_t fault_ipa;
	struct vm_guest_paging *paging;
	struct vmm_mmio_region *vmr;
	int error, i;

	vm = vcpu->vm;
	hyp = vm->cookie;
	if (!hyp->vgic_attached)
		goto out_user;

	vme = &vcpu->exitinfo;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	fault_ipa = vme->u.inst_emul.gpa;

	vmr = NULL;
	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start <= fault_ipa &&
		    vm->mmio_region[i].end > fault_ipa) {
			vmr = &vm->mmio_region[i];
			break;
		}
	}
	if (vmr == NULL)
		goto out_user;

	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
	    vmr->read, vmr->write, retu);
	return (error);

out_user:
	*retu = true;
	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, i));
	}

	return (0);
}

void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_DEBUG;
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;

	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
		return (EBUSY);

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
	return (0);
}

int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
	if (vcpu == NULL) {
		vm->debug_cpus = vm->active_cpus;
		for (int i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm_vcpu(vm, i));
		}
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vcpu);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{

	if (vcpu == NULL) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
	}
	return (0);
}

int
vcpu_debugged(struct vcpu *vcpu)
{

	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vcpu *vcpu)
{

	return (vcpu->stats);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			ipi_cpu(hostcpu, vmm_ipinum);
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}

struct vm_mem *
vm_mem(struct vm *vm)
{
	return (&vm->mem);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	vfp_save_state(curthread, curthread->td_pcb);
	/* Ensure the VFP state will be re-loaded when exiting the guest */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	vfp_enable();
	vfp_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	vfp_disable();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{
	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
	    CPACR_FPEN_TRAP_ALL1)
		panic("VFP not enabled in host!");

	/* save guest FPU state */
	vfp_enable();
	vfp_store(vcpu->guestfpu);
	vfp_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}
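
/*
 * Move 'vcpu' to 'newstate', enforcing the state transitions
 * documented below. Transitions initiated by an ioctl must start from
 * VCPU_IDLE and the caller will sleep here until that is the case.
 */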
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{
	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_getcap(vcpu->cookie, type, retval));
}

int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{
	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_setcap(vcpu->cookie, type, val));
}

struct vm *
vcpu_vm(struct vcpu *vcpu)
{
	return (vcpu->vm);
}

int
vcpu_vcpuid(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

void *
vcpu_get_cookie(struct vcpu *vcpu)
{
	return (vcpu->cookie);
}

struct vcpu *
vm_vcpu(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid]);
}

int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);

	return (vmmops_getreg(vcpu->cookie, reg, retval));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);
	error = vmmops_setreg(vcpu->cookie, reg, val);
	if (error || reg != VM_REG_GUEST_PC)
		return (error);

	vcpu->nextpc = val;

	return (0);
}

void *
vm_get_cookie(struct vm *vm)
{
	return (vm->cookie);
}

int
vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
{
	return (vmmops_exception(vcpu->cookie, esr, far));
}

int
vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
{
	return (vgic_attach_to_vm(vm->cookie, descr));
}

int
vm_assert_irq(struct vm *vm, uint32_t irq)
{
	return (vgic_inject_irq(vm->cookie, -1, irq, true));
}

int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{
	return (vgic_inject_irq(vm->cookie, -1, irq, false));
}

int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
    int func)
{
	/* TODO: Should we raise an SError? */
	return (vgic_inject_msi(vm->cookie, msg, addr));
}

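/*
 * The immediate of the trapping HVC instruction is reported in the ISS
 * field of the ESR and an SMCCC call requires HVC #0, so reject any
 * other immediate. Otherwise export the function ID and arguments for
 * userspace to handle the call.
 */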
static int
vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct hypctx *hypctx;
	int i;

	hypctx = vcpu_get_cookie(vcpu);

	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
		return (1);

	vme->exitcode = VM_EXITCODE_SMCCC;
	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];

	*retu = true;
	return (0);
}

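/*
 * Block the vcpu, as the guest's WFI instruction would, until an
 * interrupt is pending for it, the VM is suspended, or the vcpu
 * should yield.
 */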
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vm *vm;

	vm = vcpu->vm;
	vcpu_lock(vcpu);
	while (1) {
		if (vm->suspend)
			break;

		if (vgic_has_pending_irq(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;
	return (0);
}

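/*
 * Handle a stage 2 page fault: let pmap_fault() resolve faults on
 * mappings that already exist, otherwise fault the page in through
 * vm_fault() on the guest's vmspace.
 */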
static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vme;
	struct vm_map *map;
	uint64_t addr, esr;
	pmap_t pmap;
	int ftype, rv;

	vme = &vcpu->exitinfo;

	pmap = vmspace_pmap(vm_vmspace(vcpu->vm));
	addr = vme->u.paging.gpa;
	esr = vme->u.paging.esr;

	/* The page exists, but the page table needs to be updated. */
	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
		return (0);

	switch (ESR_ELx_EXCEPTION(esr)) {
	case EXCP_INSN_ABORT_L:
	case EXCP_DATA_ABORT_L:
		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
		break;
	default:
		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
	}

	map = &vm_vmspace(vm)->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
	if (rv != KERN_SUCCESS)
		return (EFAULT);

	return (0);
}

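/*
 * Mark this vcpu as suspended, then wait for the remaining active
 * vcpus to suspend themselves before waking them and returning to
 * userspace.
 */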
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}

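/*
 * Run the vcpu: enter the guest via vmmops_run() and service any exit
 * that can be handled in the kernel, restarting the guest afterwards.
 * All other exits are returned to userspace with the details in the
 * vcpu's vm_exit structure.
 */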
int
vm_run(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;
	struct vm_eventinfo evinfo;
	int error, vcpuid;
	struct vm_exit *vme;
	bool retu;
	pmap_t pmap;

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm_vmspace(vm));
	vme = &vcpu->exitinfo;
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = NULL;
restart:
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;

		case VM_EXITCODE_REG_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_reg_emul(vcpu, &retu);
			break;

		case VM_EXITCODE_HVC:
			/*
			 * The HVC instruction saves the address for the
			 * next instruction as the return address.
			 */
			vcpu->nextpc = vme->pc;
			/*
			 * The PSCI call can change the exit information in the
			 * case of suspend/reset/poweroff/cpu off/cpu on.
			 */
			error = vm_handle_smccc_call(vcpu, vme, &retu);
			break;

		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;

		case VM_EXITCODE_PAGING:
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;

		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;

		default:
			/* Handle in userland */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	return (error);
}
1567