xref: /freebsd/sys/arm64/vmm/vmm.c (revision d4033e6d37747f5213bb245c8e605406703a8766)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/kernel.h>
33 #include <sys/linker.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/pcpu.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/rwlock.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 
46 #include <vm/vm.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_page.h>
49 #include <vm/pmap.h>
50 #include <vm/vm_map.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_param.h>
53 
54 #include <machine/armreg.h>
55 #include <machine/cpu.h>
56 #include <machine/fpu.h>
57 #include <machine/machdep.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
60 #include <machine/vm.h>
61 #include <machine/vmparam.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_instruction_emul.h>
64 
65 #include <dev/pci/pcireg.h>
66 #include <dev/vmm/vmm_dev.h>
67 #include <dev/vmm/vmm_ktr.h>
68 #include <dev/vmm/vmm_mem.h>
69 #include <dev/vmm/vmm_stat.h>
70 
71 #include "arm64.h"
72 #include "mmu.h"
73 
74 #include "io/vgic.h"
75 #include "io/vtimer.h"
76 
/*
 * Per-vCPU bookkeeping.  Instances are allocated by vcpu_alloc() and
 * (re)initialized by vcpu_init().  The letters in parentheses are
 * initialization annotations; (o), (i) and (x) are explained in the comment
 * above struct vm.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* changed by vcpu_set_state_locked() */
	struct mtx	mtx;		/* spin mutex, see the vcpu_lock*() macros */
	int		hostcpu;	/* host cpuid while running, else NOCPU */
	int		vcpuid;		/* index of this vcpu in vm->vcpu[] */
	void		*stats;		/* from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* returned by vm_exitinfo() */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct vfpstate	*guestfpu;	/* (a,i) guest fpu state */
};
90 
/*
 * Thin wrappers around the per-vCPU mutex.  The mutex is created with
 * MTX_SPIN, so it must be taken and released with the spin variants.
 */
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
97 
/*
 * An in-kernel MMIO emulation window covering guest physical addresses
 * [start, end).  A slot with start == 0 and end == 0 is treated as free by
 * vm_register_inst_handler().
 */
struct vmm_mmio_region {
	uint64_t start;			/* first address in the window */
	uint64_t end;			/* start + size, exclusive */
	mem_region_read_t read;		/* handler passed to the emulator */
	mem_region_write_t write;	/* handler passed to the emulator */
};
/* Number of MMIO windows that can be registered per VM. */
#define	VM_MAX_MMIO_REGIONS	4
105 
/*
 * Describes how to emulate a trapped system register access.  An entry
 * matches when (inst_syndrome & esr_mask) == esr_iss; see
 * vm_handle_reg_emul().  In the per-VM table an entry with both esr_iss and
 * esr_mask equal to zero is an unused slot.
 */
struct vmm_special_reg {
	uint32_t	esr_iss;	/* expected syndrome bits */
	uint32_t	esr_mask;	/* syndrome bits that are compared */
	reg_read_t	reg_read;	/* read handler */
	reg_write_t	reg_write;	/* write handler */
	void		*arg;		/* opaque argument for the handlers */
};
/* Number of per-VM special register handlers. */
#define	VM_MAX_SPECIAL_REGS	16
114 
/*
 * Machine-independent state of one virtual machine.
 *
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vm_mem	mem;			/* (i) guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* Per-VM register handlers; cleared by vm_init(). */
	struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	/* Serializes vCPU creation; see vm_alloc_vcpu(). */
	struct sx	vcpus_init_lock;	/* (o) */
};
143 
/*
 * Set by the module event handler once vmm_init() has succeeded;
 * vm_create() returns ENXIO while it is false.
 */
static bool vmm_initialized = false;

/* Forward declaration. */
static int vm_handle_wfi(struct vcpu *vcpu,
			 struct vm_exit *vme, bool *retu);

/* malloc(9) type used for the allocations made in this file. */
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

/* Root of the hw.vmm sysctl tree. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/* IPI number handed to ipi_cpu() when kicking a running vcpu. */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");
159 
/*
 * One 64-bit slot per AArch64 ID register handled by vmm_regs_init().
 * The same layout is used both for the mask table and for the resulting
 * values.
 */
struct vmm_regs {
	uint64_t	id_aa64afr0;
	uint64_t	id_aa64afr1;
	uint64_t	id_aa64dfr0;
	uint64_t	id_aa64dfr1;
	uint64_t	id_aa64isar0;
	uint64_t	id_aa64isar1;
	uint64_t	id_aa64isar2;
	uint64_t	id_aa64mmfr0;
	uint64_t	id_aa64mmfr1;
	uint64_t	id_aa64mmfr2;
	uint64_t	id_aa64pfr0;
	uint64_t	id_aa64pfr1;
};
174 
/*
 * Per-register values passed as the mask argument of
 * get_kernel_reg_iss_masked() by vmm_regs_init().  Registers that are not
 * listed here are left at zero by the designated initializer.
 * NOTE(review): presumably each entry is the highest feature level exposed
 * to guests for that field -- confirm against get_kernel_reg_iss_masked().
 */
static const struct vmm_regs vmm_arch_regs_masks = {
	.id_aa64dfr0 =
	    ID_AA64DFR0_CTX_CMPs_MASK |
	    ID_AA64DFR0_WRPs_MASK |
	    ID_AA64DFR0_BRPs_MASK |
	    ID_AA64DFR0_PMUVer_3 |
	    ID_AA64DFR0_DebugVer_8,
	.id_aa64isar0 =
	    ID_AA64ISAR0_TLB_TLBIOSR |
	    ID_AA64ISAR0_SHA3_IMPL |
	    ID_AA64ISAR0_RDM_IMPL |
	    ID_AA64ISAR0_Atomic_IMPL |
	    ID_AA64ISAR0_CRC32_BASE |
	    ID_AA64ISAR0_SHA2_512 |
	    ID_AA64ISAR0_SHA1_BASE |
	    ID_AA64ISAR0_AES_PMULL,
	.id_aa64mmfr0 =
	    ID_AA64MMFR0_TGran4_IMPL |
	    ID_AA64MMFR0_TGran64_IMPL |
	    ID_AA64MMFR0_TGran16_IMPL |
	    ID_AA64MMFR0_ASIDBits_16 |
	    ID_AA64MMFR0_PARange_4P,
	.id_aa64mmfr1 =
	    ID_AA64MMFR1_SpecSEI_IMPL |
	    ID_AA64MMFR1_PAN_ATS1E1 |
	    ID_AA64MMFR1_HAFDBS_AF,
	.id_aa64pfr0 =
	    ID_AA64PFR0_GIC_CPUIF_NONE |
	    ID_AA64PFR0_AdvSIMD_HP |
	    ID_AA64PFR0_FP_HP |
	    ID_AA64PFR0_EL3_64 |
	    ID_AA64PFR0_EL2_64 |
	    ID_AA64PFR0_EL1_64 |
	    ID_AA64PFR0_EL0_64,
};
210 
/*
 * Host registers masked by vmm_arch_regs_masks.  Filled in by
 * vmm_regs_init() and read through the ID_SPECIAL_REG() table entries.
 */
static struct vmm_regs vmm_arch_regs;

/*
 * Number of vCPU slots given to each new VM (see vm_create()).  Set in
 * vmm_init() from mp_ncpus, optionally overridden by the hw.vmm.maxcpu
 * tunable and clamped to VM_MAXCPU.
 */
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

/* Forward declaration; used by vcpu_set_state_locked(). */
static void vcpu_notify_event_locked(struct vcpu *vcpu);
219 
/*
 * global statistics
 *
 * NOTE(review): the VMEXIT_FIQ description reads "an interrupt", which is
 * hard to tell apart from VMEXIT_IRQ; left untouched because the text is
 * visible to consumers of the statistics.
 */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
236 
/*
 * Upper limit on vm_maxcpu.  We could increase this to 28 bits, but this
 * is a safe value for now.  The limit is additionally bounded by
 * CPU_SETSIZE, and it keeps vm_maxcpu representable in the uint16_t
 * 'maxcpus' member of struct vm.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
242 
243 static int
244 vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
245 {
246 #define	_FETCH_KERN_REG(reg, field) do {				\
247 	regs->field = vmm_arch_regs_masks.field;			\
248 	if (!get_kernel_reg_iss_masked(reg ## _ISS, &regs->field,	\
249 	    masks->field))						\
250 		regs->field = 0;					\
251 } while (0)
252 	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
253 	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
254 	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
255 	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
256 	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
257 	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
258 	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
259 	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
260 	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
261 	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
262 	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
263 	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
264 #undef _FETCH_KERN_REG
265 	return (0);
266 }
267 
268 static void
269 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
270 {
271 	vmmops_vcpu_cleanup(vcpu->cookie);
272 	vcpu->cookie = NULL;
273 	if (destroy) {
274 		vmm_stat_free(vcpu->stats);
275 		fpu_save_area_free(vcpu->guestfpu);
276 		vcpu_lock_destroy(vcpu);
277 	}
278 }
279 
280 static struct vcpu *
281 vcpu_alloc(struct vm *vm, int vcpu_id)
282 {
283 	struct vcpu *vcpu;
284 
285 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
286 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
287 
288 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
289 	vcpu_lock_init(vcpu);
290 	vcpu->state = VCPU_IDLE;
291 	vcpu->hostcpu = NOCPU;
292 	vcpu->vcpuid = vcpu_id;
293 	vcpu->vm = vm;
294 	vcpu->guestfpu = fpu_save_area_alloc();
295 	vcpu->stats = vmm_stat_alloc();
296 	return (vcpu);
297 }
298 
299 static void
300 vcpu_init(struct vcpu *vcpu)
301 {
302 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
303 	MPASS(vcpu->cookie != NULL);
304 	fpu_save_area_reset(vcpu->guestfpu);
305 	vmm_stat_init(vcpu->stats);
306 }
307 
308 struct vm_exit *
309 vm_exitinfo(struct vcpu *vcpu)
310 {
311 	return (&vcpu->exitinfo);
312 }
313 
314 static int
315 vmm_unsupported_quirk(void)
316 {
317 	/*
318 	 * Known to not load on Ampere eMAG
319 	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051
320 	 */
321 	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
322 	    CPU_PART_EMAG8180, 0, 0))
323 		return (ENXIO);
324 
325 	return (0);
326 }
327 
328 static int
329 vmm_init(void)
330 {
331 	int error;
332 
333 	vm_maxcpu = mp_ncpus;
334 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
335 
336 	if (vm_maxcpu > VM_MAXCPU) {
337 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
338 		vm_maxcpu = VM_MAXCPU;
339 	}
340 	if (vm_maxcpu == 0)
341 		vm_maxcpu = 1;
342 
343 	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
344 	if (error != 0)
345 		return (error);
346 
347 	return (vmmops_modinit(0));
348 }
349 
350 static int
351 vmm_handler(module_t mod, int what, void *arg)
352 {
353 	int error;
354 
355 	switch (what) {
356 	case MOD_LOAD:
357 		error = vmm_unsupported_quirk();
358 		if (error != 0)
359 			break;
360 		error = vmmdev_init();
361 		if (error != 0)
362 			break;
363 		error = vmm_init();
364 		if (error == 0)
365 			vmm_initialized = true;
366 		else
367 			(void)vmmdev_cleanup();
368 		break;
369 	case MOD_UNLOAD:
370 		error = vmmdev_cleanup();
371 		if (error == 0 && vmm_initialized) {
372 			error = vmmops_modcleanup();
373 			if (error) {
374 				/*
375 				 * Something bad happened - prevent new
376 				 * VMs from being created
377 				 */
378 				vmm_initialized = false;
379 			}
380 		}
381 		break;
382 	default:
383 		error = 0;
384 		break;
385 	}
386 	return (error);
387 }
388 
389 static moduledata_t vmm_kmod = {
390 	"vmm",
391 	vmm_handler,
392 	NULL
393 };
394 
/*
 * vmm initialization has the following dependencies:
 *
 * - HYP initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 *
 * Hence the module is ordered one step after whichever of SI_SUB_SMP and
 * SI_SUB_DEVFS comes later.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
404 
405 static void
406 vm_init(struct vm *vm, bool create)
407 {
408 	int i;
409 
410 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
411 	MPASS(vm->cookie != NULL);
412 
413 	CPU_ZERO(&vm->active_cpus);
414 	CPU_ZERO(&vm->debug_cpus);
415 
416 	vm->suspend = 0;
417 	CPU_ZERO(&vm->suspended_cpus);
418 
419 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
420 	memset(vm->special_reg, 0, sizeof(vm->special_reg));
421 
422 	if (!create) {
423 		for (i = 0; i < vm->maxcpus; i++) {
424 			if (vm->vcpu[i] != NULL)
425 				vcpu_init(vm->vcpu[i]);
426 		}
427 	}
428 }
429 
430 void
431 vm_disable_vcpu_creation(struct vm *vm)
432 {
433 	sx_xlock(&vm->vcpus_init_lock);
434 	vm->dying = true;
435 	sx_xunlock(&vm->vcpus_init_lock);
436 }
437 
438 struct vcpu *
439 vm_alloc_vcpu(struct vm *vm, int vcpuid)
440 {
441 	struct vcpu *vcpu;
442 
443 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
444 		return (NULL);
445 
446 	/* Some interrupt controllers may have a CPU limit */
447 	if (vcpuid >= vgic_max_cpu_count(vm->cookie))
448 		return (NULL);
449 
450 	vcpu = (struct vcpu *)
451 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
452 	if (__predict_true(vcpu != NULL))
453 		return (vcpu);
454 
455 	sx_xlock(&vm->vcpus_init_lock);
456 	vcpu = vm->vcpu[vcpuid];
457 	if (vcpu == NULL && !vm->dying) {
458 		vcpu = vcpu_alloc(vm, vcpuid);
459 		vcpu_init(vcpu);
460 
461 		/*
462 		 * Ensure vCPU is fully created before updating pointer
463 		 * to permit unlocked reads above.
464 		 */
465 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
466 		    (uintptr_t)vcpu);
467 	}
468 	sx_xunlock(&vm->vcpus_init_lock);
469 	return (vcpu);
470 }
471 
472 void
473 vm_slock_vcpus(struct vm *vm)
474 {
475 	sx_slock(&vm->vcpus_init_lock);
476 }
477 
478 void
479 vm_unlock_vcpus(struct vm *vm)
480 {
481 	sx_unlock(&vm->vcpus_init_lock);
482 }
483 
484 int
485 vm_create(const char *name, struct vm **retvm)
486 {
487 	struct vm *vm;
488 	struct vmspace *vmspace;
489 
490 	/*
491 	 * If vmm.ko could not be successfully initialized then don't attempt
492 	 * to create the virtual machine.
493 	 */
494 	if (!vmm_initialized)
495 		return (ENXIO);
496 
497 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
498 		return (EINVAL);
499 
500 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
501 	if (vmspace == NULL)
502 		return (ENOMEM);
503 
504 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
505 	strcpy(vm->name, name);
506 	vm->vmspace = vmspace;
507 	vm_mem_init(&vm->mem);
508 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
509 
510 	vm->sockets = 1;
511 	vm->cores = 1;			/* XXX backwards compatibility */
512 	vm->threads = 1;		/* XXX backwards compatibility */
513 	vm->maxcpus = vm_maxcpu;
514 
515 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
516 	    M_WAITOK | M_ZERO);
517 
518 	vm_init(vm, true);
519 
520 	*retvm = vm;
521 	return (0);
522 }
523 
524 void
525 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
526     uint16_t *threads, uint16_t *maxcpus)
527 {
528 	*sockets = vm->sockets;
529 	*cores = vm->cores;
530 	*threads = vm->threads;
531 	*maxcpus = vm->maxcpus;
532 }
533 
534 uint16_t
535 vm_get_maxcpus(struct vm *vm)
536 {
537 	return (vm->maxcpus);
538 }
539 
540 int
541 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
542     uint16_t threads, uint16_t maxcpus)
543 {
544 	/* Ignore maxcpus. */
545 	if ((sockets * cores * threads) > vm->maxcpus)
546 		return (EINVAL);
547 	vm->sockets = sockets;
548 	vm->cores = cores;
549 	vm->threads = threads;
550 	return(0);
551 }
552 
553 static void
554 vm_cleanup(struct vm *vm, bool destroy)
555 {
556 	pmap_t pmap __diagused;
557 	int i;
558 
559 	if (destroy) {
560 		vm_xlock_memsegs(vm);
561 		pmap = vmspace_pmap(vm->vmspace);
562 		sched_pin();
563 		PCPU_SET(curvmpmap, NULL);
564 		sched_unpin();
565 		CPU_FOREACH(i) {
566 			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
567 		}
568 	} else
569 		vm_assert_memseg_xlocked(vm);
570 
571 
572 	vgic_detach_from_vm(vm->cookie);
573 
574 	for (i = 0; i < vm->maxcpus; i++) {
575 		if (vm->vcpu[i] != NULL)
576 			vcpu_cleanup(vm->vcpu[i], destroy);
577 	}
578 
579 	vmmops_cleanup(vm->cookie);
580 
581 	vm_mem_cleanup(vm);
582 	if (destroy) {
583 		vm_mem_destroy(vm);
584 
585 		vmmops_vmspace_free(vm->vmspace);
586 		vm->vmspace = NULL;
587 
588 		for (i = 0; i < vm->maxcpus; i++)
589 			free(vm->vcpu[i], M_VMM);
590 		free(vm->vcpu, M_VMM);
591 		sx_destroy(&vm->vcpus_init_lock);
592 	}
593 }
594 
595 void
596 vm_destroy(struct vm *vm)
597 {
598 	vm_cleanup(vm, true);
599 	free(vm, M_VMM);
600 }
601 
602 int
603 vm_reinit(struct vm *vm)
604 {
605 	int error;
606 
607 	/*
608 	 * A virtual machine can be reset only if all vcpus are suspended.
609 	 */
610 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
611 		vm_cleanup(vm, false);
612 		vm_init(vm, false);
613 		error = 0;
614 	} else {
615 		error = EBUSY;
616 	}
617 
618 	return (error);
619 }
620 
621 const char *
622 vm_name(struct vm *vm)
623 {
624 	return (vm->name);
625 }
626 
627 int
628 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
629     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
630 {
631 	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
632 }
633 
/* Register read handler: read-as-zero.  Stores 0 and returns 0. */
static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	const uint64_t zero = 0;

	*rval = zero;
	return (0);
}
640 
/*
 * Register read handler: return the 64-bit value that 'arg' points to.
 * Always returns 0.
 */
static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	uint64_t *src;

	src = (uint64_t *)arg;
	*rval = *src;
	return (0);
}
647 
/* Register write handler: write-ignored.  Discards the value, returns 0. */
static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	int error = 0;

	return (error);
}
653 
654 static int
655 vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
656 {
657 	struct hypctx *hypctx;
658 
659 	hypctx = vcpu_get_cookie(vcpu);
660 	/* All other fields are RES0 & we don't do anything with this */
661 	/* TODO: Disable access to other debug state when locked */
662 	hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
663 	return (0);
664 }
665 
666 static int
667 vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
668 {
669 	struct hypctx *hypctx;
670 	uint64_t val;
671 
672 	hypctx = vcpu_get_cookie(vcpu);
673 	val = OSLSR_OSLM_1;
674 	if (hypctx->dbg_oslock)
675 		val |= OSLSR_OSLK;
676 	*rval = val;
677 
678 	return (0);
679 }
680 
/*
 * Built-in system register handlers, consulted by vm_handle_reg_emul()
 * after the per-VM handlers.  The table is scanned front to back and the
 * first matching entry is used, so the specific ID register entries must
 * stay ahead of the read-as-zero catch-all entry.
 */
static const struct vmm_special_reg vmm_special_regs[] = {
/* Entry matching exactly one register, with explicit handlers. */
#define	SPECIAL_REG(_reg, _read, _write)				\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = (_read),					\
		.reg_write = (_write),					\
		.arg = NULL,						\
	}
/*
 * Entry matching exactly one ID register: reads return the named field of
 * vmm_arch_regs, writes are ignored.
 */
#define	ID_SPECIAL_REG(_reg, _name)					\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = vmm_reg_read_arg,				\
		.reg_write = vmm_reg_wi,				\
		.arg = &(vmm_arch_regs._name),				\
	}

	/* ID registers */
	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),

	/*
	 * All other ID registers are read as zero.
	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
	 * Only bit 3 of CRm takes part in the comparison and op2 is not
	 * compared at all.
	 */
	{
		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
		    (0 << ISS_MSR_OP1_SHIFT) |
		    (0 << ISS_MSR_CRn_SHIFT) |
		    (0 << ISS_MSR_CRm_SHIFT),
		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
		.reg_read = vmm_reg_raz,
		.reg_write = vmm_reg_wi,
		.arg = NULL,
	},

	/* Counter physical registers */
	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
	    vtimer_phys_cval_write),
	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
	    vtimer_phys_tval_write),
	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),

	/* Debug registers */
	SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
	SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
	/* TODO: Exceptions on invalid access */
	SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
	SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
#undef SPECIAL_REG
};
746 
747 void
748 vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
749     reg_read_t reg_read, reg_write_t reg_write, void *arg)
750 {
751 	int i;
752 
753 	for (i = 0; i < nitems(vm->special_reg); i++) {
754 		if (vm->special_reg[i].esr_iss == 0 &&
755 		    vm->special_reg[i].esr_mask == 0) {
756 			vm->special_reg[i].esr_iss = iss;
757 			vm->special_reg[i].esr_mask = mask;
758 			vm->special_reg[i].reg_read = reg_read;
759 			vm->special_reg[i].reg_write = reg_write;
760 			vm->special_reg[i].arg = arg;
761 			return;
762 		}
763 	}
764 
765 	panic("%s: No free special register slot", __func__);
766 }
767 
768 void
769 vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
770 {
771 	int i;
772 
773 	for (i = 0; i < nitems(vm->special_reg); i++) {
774 		if (vm->special_reg[i].esr_iss == iss &&
775 		    vm->special_reg[i].esr_mask == mask) {
776 			memset(&vm->special_reg[i], 0,
777 			    sizeof(vm->special_reg[i]));
778 			return;
779 		}
780 	}
781 
782 	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
783 	    mask);
784 }
785 
786 static int
787 vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
788 {
789 	struct vm *vm;
790 	struct vm_exit *vme;
791 	struct vre *vre;
792 	int i, rv;
793 
794 	vm = vcpu->vm;
795 	vme = &vcpu->exitinfo;
796 	vre = &vme->u.reg_emul.vre;
797 
798 	for (i = 0; i < nitems(vm->special_reg); i++) {
799 		if (vm->special_reg[i].esr_iss == 0 &&
800 		    vm->special_reg[i].esr_mask == 0)
801 			continue;
802 
803 		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
804 		    vm->special_reg[i].esr_iss) {
805 			rv = vmm_emulate_register(vcpu, vre,
806 			    vm->special_reg[i].reg_read,
807 			    vm->special_reg[i].reg_write,
808 			    vm->special_reg[i].arg);
809 			if (rv == 0) {
810 				*retu = false;
811 			}
812 			return (rv);
813 		}
814 	}
815 	for (i = 0; i < nitems(vmm_special_regs); i++) {
816 		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
817 		    vmm_special_regs[i].esr_iss) {
818 			rv = vmm_emulate_register(vcpu, vre,
819 			    vmm_special_regs[i].reg_read,
820 			    vmm_special_regs[i].reg_write,
821 			    vmm_special_regs[i].arg);
822 			if (rv == 0) {
823 				*retu = false;
824 			}
825 			return (rv);
826 		}
827 	}
828 
829 
830 	*retu = true;
831 	return (0);
832 }
833 
834 void
835 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
836     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
837 {
838 	int i;
839 
840 	for (i = 0; i < nitems(vm->mmio_region); i++) {
841 		if (vm->mmio_region[i].start == 0 &&
842 		    vm->mmio_region[i].end == 0) {
843 			vm->mmio_region[i].start = start;
844 			vm->mmio_region[i].end = start + size;
845 			vm->mmio_region[i].read = mmio_read;
846 			vm->mmio_region[i].write = mmio_write;
847 			return;
848 		}
849 	}
850 
851 	panic("%s: No free MMIO region", __func__);
852 }
853 
854 void
855 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
856 {
857 	int i;
858 
859 	for (i = 0; i < nitems(vm->mmio_region); i++) {
860 		if (vm->mmio_region[i].start == start &&
861 		    vm->mmio_region[i].end == start + size) {
862 			memset(&vm->mmio_region[i], 0,
863 			    sizeof(vm->mmio_region[i]));
864 			return;
865 		}
866 	}
867 
868 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
869 	    start + size);
870 }
871 
872 static int
873 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
874 {
875 	struct vm *vm;
876 	struct vm_exit *vme;
877 	struct vie *vie;
878 	struct hyp *hyp;
879 	uint64_t fault_ipa;
880 	struct vm_guest_paging *paging;
881 	struct vmm_mmio_region *vmr;
882 	int error, i;
883 
884 	vm = vcpu->vm;
885 	hyp = vm->cookie;
886 	if (!hyp->vgic_attached)
887 		goto out_user;
888 
889 	vme = &vcpu->exitinfo;
890 	vie = &vme->u.inst_emul.vie;
891 	paging = &vme->u.inst_emul.paging;
892 
893 	fault_ipa = vme->u.inst_emul.gpa;
894 
895 	vmr = NULL;
896 	for (i = 0; i < nitems(vm->mmio_region); i++) {
897 		if (vm->mmio_region[i].start <= fault_ipa &&
898 		    vm->mmio_region[i].end > fault_ipa) {
899 			vmr = &vm->mmio_region[i];
900 			break;
901 		}
902 	}
903 	if (vmr == NULL)
904 		goto out_user;
905 
906 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
907 	    vmr->read, vmr->write, retu);
908 	return (error);
909 
910 out_user:
911 	*retu = true;
912 	return (0);
913 }
914 
915 int
916 vm_suspend(struct vm *vm, enum vm_suspend_how how)
917 {
918 	int i;
919 
920 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
921 		return (EINVAL);
922 
923 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
924 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
925 		    vm->suspend, how);
926 		return (EALREADY);
927 	}
928 
929 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
930 
931 	/*
932 	 * Notify all active vcpus that they are now suspended.
933 	 */
934 	for (i = 0; i < vm->maxcpus; i++) {
935 		if (CPU_ISSET(i, &vm->active_cpus))
936 			vcpu_notify_event(vm_vcpu(vm, i));
937 	}
938 
939 	return (0);
940 }
941 
942 void
943 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
944 {
945 	struct vm *vm = vcpu->vm;
946 	struct vm_exit *vmexit;
947 
948 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
949 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
950 
951 	vmexit = vm_exitinfo(vcpu);
952 	vmexit->pc = pc;
953 	vmexit->inst_length = 4;
954 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
955 	vmexit->u.suspended.how = vm->suspend;
956 }
957 
958 void
959 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
960 {
961 	struct vm_exit *vmexit;
962 
963 	vmexit = vm_exitinfo(vcpu);
964 	vmexit->pc = pc;
965 	vmexit->inst_length = 4;
966 	vmexit->exitcode = VM_EXITCODE_DEBUG;
967 }
968 
969 int
970 vm_activate_cpu(struct vcpu *vcpu)
971 {
972 	struct vm *vm = vcpu->vm;
973 
974 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
975 		return (EBUSY);
976 
977 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
978 	return (0);
979 
980 }
981 
982 int
983 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
984 {
985 	if (vcpu == NULL) {
986 		vm->debug_cpus = vm->active_cpus;
987 		for (int i = 0; i < vm->maxcpus; i++) {
988 			if (CPU_ISSET(i, &vm->active_cpus))
989 				vcpu_notify_event(vm_vcpu(vm, i));
990 		}
991 	} else {
992 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
993 			return (EINVAL);
994 
995 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
996 		vcpu_notify_event(vcpu);
997 	}
998 	return (0);
999 }
1000 
1001 int
1002 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
1003 {
1004 
1005 	if (vcpu == NULL) {
1006 		CPU_ZERO(&vm->debug_cpus);
1007 	} else {
1008 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
1009 			return (EINVAL);
1010 
1011 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1012 	}
1013 	return (0);
1014 }
1015 
1016 int
1017 vcpu_debugged(struct vcpu *vcpu)
1018 {
1019 
1020 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1021 }
1022 
1023 cpuset_t
1024 vm_active_cpus(struct vm *vm)
1025 {
1026 
1027 	return (vm->active_cpus);
1028 }
1029 
1030 cpuset_t
1031 vm_debug_cpus(struct vm *vm)
1032 {
1033 
1034 	return (vm->debug_cpus);
1035 }
1036 
1037 cpuset_t
1038 vm_suspended_cpus(struct vm *vm)
1039 {
1040 
1041 	return (vm->suspended_cpus);
1042 }
1043 
1044 
1045 void *
1046 vcpu_stats(struct vcpu *vcpu)
1047 {
1048 
1049 	return (vcpu->stats);
1050 }
1051 
1052 /*
1053  * This function is called to ensure that a vcpu "sees" a pending event
1054  * as soon as possible:
1055  * - If the vcpu thread is sleeping then it is woken up.
1056  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1057  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1058  */
1059 static void
1060 vcpu_notify_event_locked(struct vcpu *vcpu)
1061 {
1062 	int hostcpu;
1063 
1064 	hostcpu = vcpu->hostcpu;
1065 	if (vcpu->state == VCPU_RUNNING) {
1066 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1067 		if (hostcpu != curcpu) {
1068 			ipi_cpu(hostcpu, vmm_ipinum);
1069 		} else {
1070 			/*
1071 			 * If the 'vcpu' is running on 'curcpu' then it must
1072 			 * be sending a notification to itself (e.g. SELF_IPI).
1073 			 * The pending event will be picked up when the vcpu
1074 			 * transitions back to guest context.
1075 			 */
1076 		}
1077 	} else {
1078 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1079 		    "with hostcpu %d", vcpu->state, hostcpu));
1080 		if (vcpu->state == VCPU_SLEEPING)
1081 			wakeup_one(vcpu);
1082 	}
1083 }
1084 
/* Notify the vCPU of a pending event while holding its lock. */
void
vcpu_notify_event(struct vcpu *vcpu)
{

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}
1092 
1093 struct vmspace *
1094 vm_vmspace(struct vm *vm)
1095 {
1096 	return (vm->vmspace);
1097 }
1098 
1099 struct vm_mem *
1100 vm_mem(struct vm *vm)
1101 {
1102 	return (&vm->mem);
1103 }
1104 
/*
 * Switch the FPU from the host thread's state to the guest state saved in
 * vcpu->guestfpu.  The order of the steps matters: the host state is saved
 * first, then the guest state is loaded, and only then is FPU access
 * disabled again.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	vfp_save_state(curthread, curthread->td_pcb);
	/* Ensure the VFP state will be re-loaded when exiting the guest */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	vfp_enable();
	vfp_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	vfp_disable();
}
1124 
1125 static void
1126 save_guest_fpustate(struct vcpu *vcpu)
1127 {
1128 	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
1129 	    CPACR_FPEN_TRAP_ALL1)
1130 		panic("VFP not enabled in host!");
1131 
1132 	/* save guest FPU state */
1133 	vfp_enable();
1134 	vfp_store(vcpu->guestfpu);
1135 	vfp_disable();
1136 
1137 	KASSERT(PCPU_GET(fpcurthread) == NULL,
1138 	    ("%s: fpcurthread set with guest registers", __func__));
1139 }
1140 static int
1141 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1142     bool from_idle)
1143 {
1144 	int error;
1145 
1146 	vcpu_assert_locked(vcpu);
1147 
1148 	/*
1149 	 * State transitions from the vmmdev_ioctl() must always begin from
1150 	 * the VCPU_IDLE state. This guarantees that there is only a single
1151 	 * ioctl() operating on a vcpu at any point.
1152 	 */
1153 	if (from_idle) {
1154 		while (vcpu->state != VCPU_IDLE) {
1155 			vcpu_notify_event_locked(vcpu);
1156 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1157 		}
1158 	} else {
1159 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1160 		    "vcpu idle state"));
1161 	}
1162 
1163 	if (vcpu->state == VCPU_RUNNING) {
1164 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1165 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1166 	} else {
1167 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1168 		    "vcpu that is not running", vcpu->hostcpu));
1169 	}
1170 
1171 	/*
1172 	 * The following state transitions are allowed:
1173 	 * IDLE -> FROZEN -> IDLE
1174 	 * FROZEN -> RUNNING -> FROZEN
1175 	 * FROZEN -> SLEEPING -> FROZEN
1176 	 */
1177 	switch (vcpu->state) {
1178 	case VCPU_IDLE:
1179 	case VCPU_RUNNING:
1180 	case VCPU_SLEEPING:
1181 		error = (newstate != VCPU_FROZEN);
1182 		break;
1183 	case VCPU_FROZEN:
1184 		error = (newstate == VCPU_FROZEN);
1185 		break;
1186 	default:
1187 		error = 1;
1188 		break;
1189 	}
1190 
1191 	if (error)
1192 		return (EBUSY);
1193 
1194 	vcpu->state = newstate;
1195 	if (newstate == VCPU_RUNNING)
1196 		vcpu->hostcpu = curcpu;
1197 	else
1198 		vcpu->hostcpu = NOCPU;
1199 
1200 	if (newstate == VCPU_IDLE)
1201 		wakeup(&vcpu->state);
1202 
1203 	return (0);
1204 }
1205 
1206 static void
1207 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1208 {
1209 	int error;
1210 
1211 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1212 		panic("Error %d setting state to %d\n", error, newstate);
1213 }
1214 
1215 static void
1216 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1217 {
1218 	int error;
1219 
1220 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1221 		panic("Error %d setting state to %d", error, newstate);
1222 }
1223 
1224 int
1225 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1226 {
1227 	if (type < 0 || type >= VM_CAP_MAX)
1228 		return (EINVAL);
1229 
1230 	return (vmmops_getcap(vcpu->cookie, type, retval));
1231 }
1232 
1233 int
1234 vm_set_capability(struct vcpu *vcpu, int type, int val)
1235 {
1236 	if (type < 0 || type >= VM_CAP_MAX)
1237 		return (EINVAL);
1238 
1239 	return (vmmops_setcap(vcpu->cookie, type, val));
1240 }
1241 
1242 struct vm *
1243 vcpu_vm(struct vcpu *vcpu)
1244 {
1245 	return (vcpu->vm);
1246 }
1247 
1248 int
1249 vcpu_vcpuid(struct vcpu *vcpu)
1250 {
1251 	return (vcpu->vcpuid);
1252 }
1253 
1254 void *
1255 vcpu_get_cookie(struct vcpu *vcpu)
1256 {
1257 	return (vcpu->cookie);
1258 }
1259 
1260 struct vcpu *
1261 vm_vcpu(struct vm *vm, int vcpuid)
1262 {
1263 	return (vm->vcpu[vcpuid]);
1264 }
1265 
1266 int
1267 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1268 {
1269 	int error;
1270 
1271 	vcpu_lock(vcpu);
1272 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1273 	vcpu_unlock(vcpu);
1274 
1275 	return (error);
1276 }
1277 
1278 enum vcpu_state
1279 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1280 {
1281 	enum vcpu_state state;
1282 
1283 	vcpu_lock(vcpu);
1284 	state = vcpu->state;
1285 	if (hostcpu != NULL)
1286 		*hostcpu = vcpu->hostcpu;
1287 	vcpu_unlock(vcpu);
1288 
1289 	return (state);
1290 }
1291 
1292 int
1293 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1294 {
1295 
1296 	if (reg >= VM_REG_LAST)
1297 		return (EINVAL);
1298 
1299 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1300 }
1301 
1302 int
1303 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1304 {
1305 	int error;
1306 
1307 	if (reg >= VM_REG_LAST)
1308 		return (EINVAL);
1309 	error = vmmops_setreg(vcpu->cookie, reg, val);
1310 	if (error || reg != VM_REG_GUEST_PC)
1311 		return (error);
1312 
1313 	vcpu->nextpc = val;
1314 
1315 	return (0);
1316 }
1317 
1318 void *
1319 vm_get_cookie(struct vm *vm)
1320 {
1321 	return (vm->cookie);
1322 }
1323 
1324 int
1325 vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
1326 {
1327 	return (vmmops_exception(vcpu->cookie, esr, far));
1328 }
1329 
1330 int
1331 vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
1332 {
1333 	return (vgic_attach_to_vm(vm->cookie, descr));
1334 }
1335 
1336 int
1337 vm_assert_irq(struct vm *vm, uint32_t irq)
1338 {
1339 	return (vgic_inject_irq(vm->cookie, -1, irq, true));
1340 }
1341 
1342 int
1343 vm_deassert_irq(struct vm *vm, uint32_t irq)
1344 {
1345 	return (vgic_inject_irq(vm->cookie, -1, irq, false));
1346 }
1347 
1348 int
1349 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1350     int func)
1351 {
1352 	/* TODO: Should we raise an SError? */
1353 	return (vgic_inject_msi(vm->cookie, msg, addr));
1354 }
1355 
1356 static int
1357 vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1358 {
1359 	struct hypctx *hypctx;
1360 	int i;
1361 
1362 	hypctx = vcpu_get_cookie(vcpu);
1363 
1364 	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
1365 		return (1);
1366 
1367 	vme->exitcode = VM_EXITCODE_SMCCC;
1368 	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
1369 	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
1370 		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
1371 
1372 	*retu = true;
1373 	return (0);
1374 }
1375 
1376 static int
1377 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1378 {
1379 	struct vm *vm;
1380 
1381 	vm = vcpu->vm;
1382 	vcpu_lock(vcpu);
1383 	while (1) {
1384 		if (vm->suspend)
1385 			break;
1386 
1387 		if (vgic_has_pending_irq(vcpu->cookie))
1388 			break;
1389 
1390 		if (vcpu_should_yield(vcpu))
1391 			break;
1392 
1393 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1394 		/*
1395 		 * XXX msleep_spin() cannot be interrupted by signals so
1396 		 * wake up periodically to check pending signals.
1397 		 */
1398 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
1399 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1400 	}
1401 	vcpu_unlock(vcpu);
1402 
1403 	*retu = false;
1404 	return (0);
1405 }
1406 
1407 static int
1408 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1409 {
1410 	struct vm *vm = vcpu->vm;
1411 	struct vm_exit *vme;
1412 	struct vm_map *map;
1413 	uint64_t addr, esr;
1414 	pmap_t pmap;
1415 	int ftype, rv;
1416 
1417 	vme = &vcpu->exitinfo;
1418 
1419 	pmap = vmspace_pmap(vcpu->vm->vmspace);
1420 	addr = vme->u.paging.gpa;
1421 	esr = vme->u.paging.esr;
1422 
1423 	/* The page exists, but the page table needs to be updated. */
1424 	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
1425 		return (0);
1426 
1427 	switch (ESR_ELx_EXCEPTION(esr)) {
1428 	case EXCP_INSN_ABORT_L:
1429 	case EXCP_DATA_ABORT_L:
1430 		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
1431 		break;
1432 	default:
1433 		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
1434 	}
1435 
1436 	map = &vm->vmspace->vm_map;
1437 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
1438 	if (rv != KERN_SUCCESS)
1439 		return (EFAULT);
1440 
1441 	return (0);
1442 }
1443 
1444 static int
1445 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1446 {
1447 	struct vm *vm = vcpu->vm;
1448 	int error, i;
1449 	struct thread *td;
1450 
1451 	error = 0;
1452 	td = curthread;
1453 
1454 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1455 
1456 	/*
1457 	 * Wait until all 'active_cpus' have suspended themselves.
1458 	 *
1459 	 * Since a VM may be suspended at any time including when one or
1460 	 * more vcpus are doing a rendezvous we need to call the rendezvous
1461 	 * handler while we are waiting to prevent a deadlock.
1462 	 */
1463 	vcpu_lock(vcpu);
1464 	while (error == 0) {
1465 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1466 			break;
1467 
1468 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1469 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1470 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1471 		if (td_ast_pending(td, TDA_SUSPEND)) {
1472 			vcpu_unlock(vcpu);
1473 			error = thread_check_susp(td, false);
1474 			vcpu_lock(vcpu);
1475 		}
1476 	}
1477 	vcpu_unlock(vcpu);
1478 
1479 	/*
1480 	 * Wakeup the other sleeping vcpus and return to userspace.
1481 	 */
1482 	for (i = 0; i < vm->maxcpus; i++) {
1483 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1484 			vcpu_notify_event(vm_vcpu(vm, i));
1485 		}
1486 	}
1487 
1488 	*retu = true;
1489 	return (error);
1490 }
1491 
1492 int
1493 vm_run(struct vcpu *vcpu)
1494 {
1495 	struct vm *vm = vcpu->vm;
1496 	struct vm_eventinfo evinfo;
1497 	int error, vcpuid;
1498 	struct vm_exit *vme;
1499 	bool retu;
1500 	pmap_t pmap;
1501 
1502 	vcpuid = vcpu->vcpuid;
1503 
1504 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1505 		return (EINVAL);
1506 
1507 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1508 		return (EINVAL);
1509 
1510 	pmap = vmspace_pmap(vm->vmspace);
1511 	vme = &vcpu->exitinfo;
1512 	evinfo.rptr = NULL;
1513 	evinfo.sptr = &vm->suspend;
1514 	evinfo.iptr = NULL;
1515 restart:
1516 	critical_enter();
1517 
1518 	restore_guest_fpustate(vcpu);
1519 
1520 	vcpu_require_state(vcpu, VCPU_RUNNING);
1521 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1522 	vcpu_require_state(vcpu, VCPU_FROZEN);
1523 
1524 	save_guest_fpustate(vcpu);
1525 
1526 	critical_exit();
1527 
1528 	if (error == 0) {
1529 		retu = false;
1530 		switch (vme->exitcode) {
1531 		case VM_EXITCODE_INST_EMUL:
1532 			vcpu->nextpc = vme->pc + vme->inst_length;
1533 			error = vm_handle_inst_emul(vcpu, &retu);
1534 			break;
1535 
1536 		case VM_EXITCODE_REG_EMUL:
1537 			vcpu->nextpc = vme->pc + vme->inst_length;
1538 			error = vm_handle_reg_emul(vcpu, &retu);
1539 			break;
1540 
1541 		case VM_EXITCODE_HVC:
1542 			/*
1543 			 * The HVC instruction saves the address for the
1544 			 * next instruction as the return address.
1545 			 */
1546 			vcpu->nextpc = vme->pc;
1547 			/*
1548 			 * The PSCI call can change the exit information in the
1549 			 * case of suspend/reset/poweroff/cpu off/cpu on.
1550 			 */
1551 			error = vm_handle_smccc_call(vcpu, vme, &retu);
1552 			break;
1553 
1554 		case VM_EXITCODE_WFI:
1555 			vcpu->nextpc = vme->pc + vme->inst_length;
1556 			error = vm_handle_wfi(vcpu, vme, &retu);
1557 			break;
1558 
1559 		case VM_EXITCODE_PAGING:
1560 			vcpu->nextpc = vme->pc;
1561 			error = vm_handle_paging(vcpu, &retu);
1562 			break;
1563 
1564 		case VM_EXITCODE_SUSPENDED:
1565 			vcpu->nextpc = vme->pc;
1566 			error = vm_handle_suspend(vcpu, &retu);
1567 			break;
1568 
1569 		default:
1570 			/* Handle in userland */
1571 			vcpu->nextpc = vme->pc;
1572 			retu = true;
1573 			break;
1574 		}
1575 	}
1576 
1577 	if (error == 0 && retu == false)
1578 		goto restart;
1579 
1580 	return (error);
1581 }
1582