xref: /freebsd/sys/arm64/vmm/vmm.c (revision d1650d226205cdf07fb19e0c9b10b47b941e8747)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/kernel.h>
33 #include <sys/linker.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/queue.h>
40 #include <sys/rwlock.h>
41 #include <sys/sched.h>
42 #include <sys/smp.h>
43 
44 #include <vm/vm.h>
45 #include <vm/vm_object.h>
46 #include <vm/vm_page.h>
47 #include <vm/pmap.h>
48 #include <vm/vm_map.h>
49 #include <vm/vm_extern.h>
50 #include <vm/vm_param.h>
51 
52 #include <machine/cpu.h>
53 #include <machine/fpu.h>
54 #include <machine/machdep.h>
55 #include <machine/pcb.h>
56 #include <machine/smp.h>
57 #include <machine/vm.h>
58 #include <machine/vmparam.h>
59 #include <machine/vmm.h>
60 #include <machine/vmm_instruction_emul.h>
61 
62 #include <dev/pci/pcireg.h>
63 
64 #include <dev/vmm/vmm_dev.h>
65 #include <dev/vmm/vmm_ktr.h>
66 #include <dev/vmm/vmm_mem.h>
67 #include <dev/vmm/vmm_stat.h>
68 #include <dev/vmm/vmm_vm.h>
69 
70 #include "arm64.h"
71 #include "mmu.h"
72 
73 #include "io/vgic.h"
74 #include "io/vtimer.h"
75 
/* Malloc type that tags every malloc()/free() call made in this file. */
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
/* Declares the VCPU_TOTAL_RUNTIME stat; it is not updated in this file. */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
80 
/*
 * One 64-bit slot per AArch64 ID register handled by this file.
 *
 * The same layout is used twice: once for the per-register masks
 * (vmm_arch_regs_masks) and once for the masked values (vmm_arch_regs)
 * that vmm_regs_init() fills in.
 */
struct vmm_regs {
	uint64_t	id_aa64afr0;
	uint64_t	id_aa64afr1;
	uint64_t	id_aa64dfr0;
	uint64_t	id_aa64dfr1;
	uint64_t	id_aa64isar0;
	uint64_t	id_aa64isar1;
	uint64_t	id_aa64isar2;
	uint64_t	id_aa64mmfr0;
	uint64_t	id_aa64mmfr1;
	uint64_t	id_aa64mmfr2;
	uint64_t	id_aa64pfr0;
	uint64_t	id_aa64pfr1;
};
95 
/*
 * Per-register masks that vmm_regs_init() hands to
 * get_kernel_reg_iss_masked().  Registers without an initializer here
 * (afr0/1, dfr1, isar1/2, mmfr2, pfr1) get an all-zero mask.
 *
 * NOTE(review): the constants look like the highest feature level a guest
 * may be shown for each field -- confirm against
 * get_kernel_reg_iss_masked().
 */
static const struct vmm_regs vmm_arch_regs_masks = {
	.id_aa64dfr0 =
	    ID_AA64DFR0_CTX_CMPs_MASK |
	    ID_AA64DFR0_WRPs_MASK |
	    ID_AA64DFR0_BRPs_MASK |
	    ID_AA64DFR0_PMUVer_3_9 |
	    ID_AA64DFR0_DebugVer_8,
	.id_aa64isar0 =
	    ID_AA64ISAR0_TLB_TLBIOSR |
	    ID_AA64ISAR0_SHA3_IMPL |
	    ID_AA64ISAR0_RDM_IMPL |
	    ID_AA64ISAR0_Atomic_IMPL |
	    ID_AA64ISAR0_CRC32_BASE |
	    ID_AA64ISAR0_SHA2_512 |
	    ID_AA64ISAR0_SHA1_BASE |
	    ID_AA64ISAR0_AES_PMULL,
	.id_aa64mmfr0 =
	    ID_AA64MMFR0_TGran4_IMPL |
	    ID_AA64MMFR0_TGran64_IMPL |
	    ID_AA64MMFR0_TGran16_IMPL |
	    ID_AA64MMFR0_ASIDBits_16 |
	    ID_AA64MMFR0_PARange_4P,
	.id_aa64mmfr1 =
	    ID_AA64MMFR1_SpecSEI_IMPL |
	    ID_AA64MMFR1_PAN_ATS1E1 |
	    ID_AA64MMFR1_HAFDBS_AF,
	.id_aa64pfr0 =
	    ID_AA64PFR0_GIC_CPUIF_NONE |
	    ID_AA64PFR0_AdvSIMD_HP |
	    ID_AA64PFR0_FP_HP |
	    ID_AA64PFR0_EL3_64 |
	    ID_AA64PFR0_EL2_64 |
	    ID_AA64PFR0_EL1_64 |
	    ID_AA64PFR0_EL0_64,
};
131 
/*
 * Host registers masked by vmm_arch_regs_masks.
 *
 * Written by vmm_regs_init() when called from vmm_modinit(); the ID
 * register entries of vmm_special_regs point at individual fields of
 * this structure and return them through vmm_reg_read_arg().
 */
static struct vmm_regs vmm_arch_regs;
134 
/*
 * global statistics
 *
 * Non-static stat definitions, one per kind of guest exit.  None of them
 * is updated in this file.
 *
 * NOTE(review): the VMEXIT_FIQ description says "an interrupt" while
 * VMEXIT_IRQ says "an irq"; the text is emitted at runtime, so it is left
 * untouched here -- confirm whether "an fiq" was intended.
 */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
151 
152 static int
vmm_regs_init(struct vmm_regs * regs,const struct vmm_regs * masks)153 vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
154 {
155 #define	_FETCH_KERN_REG(reg, field) do {				\
156 	regs->field = vmm_arch_regs_masks.field;			\
157 	get_kernel_reg_iss_masked(reg ## _ISS, &regs->field,		\
158 	    masks->field);						\
159 } while (0)
160 	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
161 	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
162 	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
163 	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
164 	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
165 	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
166 	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
167 	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
168 	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
169 	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
170 	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
171 	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
172 #undef _FETCH_KERN_REG
173 	return (0);
174 }
175 
176 static void
vcpu_cleanup(struct vcpu * vcpu,bool destroy)177 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
178 {
179 	vmmops_vcpu_cleanup(vcpu->cookie);
180 	vcpu->cookie = NULL;
181 	if (destroy) {
182 		vmm_stat_free(vcpu->stats);
183 		fpu_save_area_free(vcpu->guestfpu);
184 		vcpu_lock_destroy(vcpu);
185 		free(vcpu, M_VMM);
186 	}
187 }
188 
189 static struct vcpu *
vcpu_alloc(struct vm * vm,int vcpu_id)190 vcpu_alloc(struct vm *vm, int vcpu_id)
191 {
192 	struct vcpu *vcpu;
193 
194 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
195 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
196 
197 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
198 	vcpu_lock_init(vcpu);
199 	vcpu->state = VCPU_IDLE;
200 	vcpu->hostcpu = NOCPU;
201 	vcpu->vcpuid = vcpu_id;
202 	vcpu->vm = vm;
203 	vcpu->guestfpu = fpu_save_area_alloc();
204 	vcpu->stats = vmm_stat_alloc();
205 	return (vcpu);
206 }
207 
208 static void
vcpu_init(struct vcpu * vcpu)209 vcpu_init(struct vcpu *vcpu)
210 {
211 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
212 	MPASS(vcpu->cookie != NULL);
213 	fpu_save_area_reset(vcpu->guestfpu);
214 	vmm_stat_init(vcpu->stats);
215 }
216 
217 struct vm_exit *
vm_exitinfo(struct vcpu * vcpu)218 vm_exitinfo(struct vcpu *vcpu)
219 {
220 	return (&vcpu->exitinfo);
221 }
222 
223 static int
vmm_unsupported_quirk(void)224 vmm_unsupported_quirk(void)
225 {
226 	/*
227 	 * Known to not load on Ampere eMAG
228 	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051
229 	 */
230 	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
231 	    CPU_PART_EMAG8180, 0, 0))
232 		return (ENXIO);
233 
234 	return (0);
235 }
236 
237 int
vmm_modinit(void)238 vmm_modinit(void)
239 {
240 	int error;
241 
242 	error = vmm_unsupported_quirk();
243 	if (error != 0)
244 		return (error);
245 
246 	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
247 	if (error != 0)
248 		return (error);
249 
250 	return (vmmops_modinit(0));
251 }
252 
/* Module teardown: returns whatever the backend cleanup returns. */
int
vmm_modcleanup(void)
{
	int error;

	error = vmmops_modcleanup();
	return (error);
}
258 
259 static void
vm_init(struct vm * vm,bool create)260 vm_init(struct vm *vm, bool create)
261 {
262 	int i;
263 
264 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
265 	MPASS(vm->cookie != NULL);
266 
267 	CPU_ZERO(&vm->active_cpus);
268 	CPU_ZERO(&vm->debug_cpus);
269 
270 	vm->suspend = 0;
271 	CPU_ZERO(&vm->suspended_cpus);
272 
273 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
274 	memset(vm->special_reg, 0, sizeof(vm->special_reg));
275 
276 	if (!create) {
277 		for (i = 0; i < vm->maxcpus; i++) {
278 			if (vm->vcpu[i] != NULL)
279 				vcpu_init(vm->vcpu[i]);
280 		}
281 	}
282 }
283 
284 struct vcpu *
vm_alloc_vcpu(struct vm * vm,int vcpuid)285 vm_alloc_vcpu(struct vm *vm, int vcpuid)
286 {
287 	struct vcpu *vcpu;
288 
289 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
290 		return (NULL);
291 
292 	vcpu = (struct vcpu *)
293 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
294 	if (__predict_true(vcpu != NULL))
295 		return (vcpu);
296 
297 	sx_xlock(&vm->vcpus_init_lock);
298 	vcpu = vm->vcpu[vcpuid];
299 	if (vcpu == NULL && !vm->dying) {
300 		/* Some interrupt controllers may have a CPU limit */
301 		if (vcpuid >= vgic_max_cpu_count(vm->cookie)) {
302 			sx_xunlock(&vm->vcpus_init_lock);
303 			return (NULL);
304 		}
305 
306 		vcpu = vcpu_alloc(vm, vcpuid);
307 		vcpu_init(vcpu);
308 
309 		/*
310 		 * Ensure vCPU is fully created before updating pointer
311 		 * to permit unlocked reads above.
312 		 */
313 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
314 		    (uintptr_t)vcpu);
315 	}
316 	sx_xunlock(&vm->vcpus_init_lock);
317 	return (vcpu);
318 }
319 
320 int
vm_create(const char * name,struct vm ** retvm)321 vm_create(const char *name, struct vm **retvm)
322 {
323 	struct vm *vm;
324 	int error;
325 
326 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
327 	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
328 	if (error != 0) {
329 		free(vm, M_VMM);
330 		return (error);
331 	}
332 	strcpy(vm->name, name);
333 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
334 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
335 
336 	vm->sockets = 1;
337 	vm->cores = 1;			/* XXX backwards compatibility */
338 	vm->threads = 1;		/* XXX backwards compatibility */
339 	vm->maxcpus = vm_maxcpu;
340 
341 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
342 	    M_WAITOK | M_ZERO);
343 
344 	vm_init(vm, true);
345 
346 	*retvm = vm;
347 	return (0);
348 }
349 
350 static void
vm_cleanup(struct vm * vm,bool destroy)351 vm_cleanup(struct vm *vm, bool destroy)
352 {
353 	pmap_t pmap __diagused;
354 	int i;
355 
356 	if (destroy) {
357 		vm_xlock_memsegs(vm);
358 		pmap = vmspace_pmap(vm_vmspace(vm));
359 		sched_pin();
360 		PCPU_SET(curvmpmap, NULL);
361 		sched_unpin();
362 		CPU_FOREACH(i) {
363 			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
364 		}
365 	} else
366 		vm_assert_memseg_xlocked(vm);
367 
368 
369 	vgic_detach_from_vm(vm->cookie);
370 
371 	for (i = 0; i < vm->maxcpus; i++) {
372 		if (vm->vcpu[i] != NULL)
373 			vcpu_cleanup(vm->vcpu[i], destroy);
374 	}
375 
376 	vmmops_cleanup(vm->cookie);
377 
378 	vm_mem_cleanup(vm);
379 	if (destroy) {
380 		vm_mem_destroy(vm);
381 
382 		free(vm->vcpu, M_VMM);
383 		sx_destroy(&vm->vcpus_init_lock);
384 	}
385 }
386 
387 void
vm_destroy(struct vm * vm)388 vm_destroy(struct vm *vm)
389 {
390 	vm_cleanup(vm, true);
391 	free(vm, M_VMM);
392 }
393 
/*
 * Reset 'vm': clean up without destroying it, then initialise it again.
 * Existing vCPU structures are kept and re-initialised.
 */
void
vm_reset(struct vm *vm)
{
	const bool destroy = false;
	const bool create = false;

	vm_cleanup(vm, destroy);
	vm_init(vm, create);
}
400 
401 int
vm_gla2gpa_nofault(struct vcpu * vcpu,struct vm_guest_paging * paging,uint64_t gla,int prot,uint64_t * gpa,int * is_fault)402 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
403     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
404 {
405 	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
406 }
407 
/*
 * Register read handler that always reads as zero.
 * 'vcpu' and 'arg' are unused.  Returns 0.
 */
static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	const uint64_t zero = 0;

	*rval = zero;
	return (0);
}
414 
/*
 * Register read handler that returns the 64-bit value 'arg' points at.
 * 'vcpu' is unused.  Returns 0.
 */
static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	const uint64_t *src;

	src = arg;
	*rval = *src;
	return (0);
}
421 
/*
 * Register write handler that ignores the write.
 * No argument is used.  Returns 0.
 */
static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	const int ok = 0;

	return (ok);
}
427 
428 static int
vmm_write_oslar_el1(struct vcpu * vcpu,uint64_t wval,void * arg)429 vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
430 {
431 	struct hypctx *hypctx;
432 
433 	hypctx = vcpu_get_cookie(vcpu);
434 	/* All other fields are RES0 & we don't do anything with this */
435 	/* TODO: Disable access to other debug state when locked */
436 	hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
437 	return (0);
438 }
439 
440 static int
vmm_read_oslsr_el1(struct vcpu * vcpu,uint64_t * rval,void * arg)441 vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
442 {
443 	struct hypctx *hypctx;
444 	uint64_t val;
445 
446 	hypctx = vcpu_get_cookie(vcpu);
447 	val = OSLSR_OSLM_1;
448 	if (hypctx->dbg_oslock)
449 		val |= OSLSR_OSLK;
450 	*rval = val;
451 
452 	return (0);
453 }
454 
455 static const struct vmm_special_reg vmm_special_regs[] = {
456 #define	SPECIAL_REG(_reg, _read, _write)				\
457 	{								\
458 		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
459 		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
460 		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
461 		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
462 		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
463 		.esr_mask = ISS_MSR_REG_MASK,				\
464 		.reg_read = (_read),					\
465 		.reg_write = (_write),					\
466 		.arg = NULL,						\
467 	}
468 #define	ID_SPECIAL_REG(_reg, _name)					\
469 	{								\
470 		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
471 		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
472 		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
473 		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
474 		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
475 		.esr_mask = ISS_MSR_REG_MASK,				\
476 		.reg_read = vmm_reg_read_arg,				\
477 		.reg_write = vmm_reg_wi,				\
478 		.arg = &(vmm_arch_regs._name),				\
479 	}
480 
481 	/* ID registers */
482 	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
483 	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
484 	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
485 	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
486 	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),
487 
488 	/*
489 	 * All other ID registers are read as zero.
490 	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
491 	 */
492 	{
493 		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
494 		    (0 << ISS_MSR_OP1_SHIFT) |
495 		    (0 << ISS_MSR_CRn_SHIFT) |
496 		    (0 << ISS_MSR_CRm_SHIFT),
497 		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
498 		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
499 		.reg_read = vmm_reg_raz,
500 		.reg_write = vmm_reg_wi,
501 		.arg = NULL,
502 	},
503 
504 	/* Counter physical registers */
505 	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
506 	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
507 	    vtimer_phys_cval_write),
508 	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
509 	    vtimer_phys_tval_write),
510 	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
511 
512 	/* Debug registers */
513 	SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
514 	SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
515 	/* TODO: Exceptions on invalid access */
516 	SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
517 	SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
518 #undef SPECIAL_REG
519 };
520 
521 void
vm_register_reg_handler(struct vm * vm,uint64_t iss,uint64_t mask,reg_read_t reg_read,reg_write_t reg_write,void * arg)522 vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
523     reg_read_t reg_read, reg_write_t reg_write, void *arg)
524 {
525 	int i;
526 
527 	for (i = 0; i < nitems(vm->special_reg); i++) {
528 		if (vm->special_reg[i].esr_iss == 0 &&
529 		    vm->special_reg[i].esr_mask == 0) {
530 			vm->special_reg[i].esr_iss = iss;
531 			vm->special_reg[i].esr_mask = mask;
532 			vm->special_reg[i].reg_read = reg_read;
533 			vm->special_reg[i].reg_write = reg_write;
534 			vm->special_reg[i].arg = arg;
535 			return;
536 		}
537 	}
538 
539 	panic("%s: No free special register slot", __func__);
540 }
541 
542 void
vm_deregister_reg_handler(struct vm * vm,uint64_t iss,uint64_t mask)543 vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
544 {
545 	int i;
546 
547 	for (i = 0; i < nitems(vm->special_reg); i++) {
548 		if (vm->special_reg[i].esr_iss == iss &&
549 		    vm->special_reg[i].esr_mask == mask) {
550 			memset(&vm->special_reg[i], 0,
551 			    sizeof(vm->special_reg[i]));
552 			return;
553 		}
554 	}
555 
556 	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
557 	    mask);
558 }
559 
560 static int
vm_handle_reg_emul(struct vcpu * vcpu,bool * retu)561 vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
562 {
563 	struct vm *vm;
564 	struct vm_exit *vme;
565 	struct vre *vre;
566 	int i, rv;
567 
568 	vm = vcpu->vm;
569 	vme = &vcpu->exitinfo;
570 	vre = &vme->u.reg_emul.vre;
571 
572 	for (i = 0; i < nitems(vm->special_reg); i++) {
573 		if (vm->special_reg[i].esr_iss == 0 &&
574 		    vm->special_reg[i].esr_mask == 0)
575 			continue;
576 
577 		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
578 		    vm->special_reg[i].esr_iss) {
579 			rv = vmm_emulate_register(vcpu, vre,
580 			    vm->special_reg[i].reg_read,
581 			    vm->special_reg[i].reg_write,
582 			    vm->special_reg[i].arg);
583 			if (rv == 0) {
584 				*retu = false;
585 			}
586 			return (rv);
587 		}
588 	}
589 	for (i = 0; i < nitems(vmm_special_regs); i++) {
590 		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
591 		    vmm_special_regs[i].esr_iss) {
592 			rv = vmm_emulate_register(vcpu, vre,
593 			    vmm_special_regs[i].reg_read,
594 			    vmm_special_regs[i].reg_write,
595 			    vmm_special_regs[i].arg);
596 			if (rv == 0) {
597 				*retu = false;
598 			}
599 			return (rv);
600 		}
601 	}
602 
603 
604 	*retu = true;
605 	return (0);
606 }
607 
608 void
vm_register_inst_handler(struct vm * vm,uint64_t start,uint64_t size,mem_region_read_t mmio_read,mem_region_write_t mmio_write)609 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
610     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
611 {
612 	int i;
613 
614 	for (i = 0; i < nitems(vm->mmio_region); i++) {
615 		if (vm->mmio_region[i].start == 0 &&
616 		    vm->mmio_region[i].end == 0) {
617 			vm->mmio_region[i].start = start;
618 			vm->mmio_region[i].end = start + size;
619 			vm->mmio_region[i].read = mmio_read;
620 			vm->mmio_region[i].write = mmio_write;
621 			return;
622 		}
623 	}
624 
625 	panic("%s: No free MMIO region", __func__);
626 }
627 
628 void
vm_deregister_inst_handler(struct vm * vm,uint64_t start,uint64_t size)629 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
630 {
631 	int i;
632 
633 	for (i = 0; i < nitems(vm->mmio_region); i++) {
634 		if (vm->mmio_region[i].start == start &&
635 		    vm->mmio_region[i].end == start + size) {
636 			memset(&vm->mmio_region[i], 0,
637 			    sizeof(vm->mmio_region[i]));
638 			return;
639 		}
640 	}
641 
642 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
643 	    start + size);
644 }
645 
646 static int
vm_handle_inst_emul(struct vcpu * vcpu,bool * retu)647 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
648 {
649 	struct vm *vm;
650 	struct vm_exit *vme;
651 	struct vie *vie;
652 	struct hyp *hyp;
653 	uint64_t fault_ipa;
654 	struct vm_guest_paging *paging;
655 	struct vmm_mmio_region *vmr;
656 	int error, i;
657 
658 	vm = vcpu->vm;
659 	hyp = vm->cookie;
660 	if (!hyp->vgic_attached)
661 		goto out_user;
662 
663 	vme = &vcpu->exitinfo;
664 	vie = &vme->u.inst_emul.vie;
665 	paging = &vme->u.inst_emul.paging;
666 
667 	fault_ipa = vme->u.inst_emul.gpa;
668 
669 	vmr = NULL;
670 	for (i = 0; i < nitems(vm->mmio_region); i++) {
671 		if (vm->mmio_region[i].start <= fault_ipa &&
672 		    vm->mmio_region[i].end > fault_ipa) {
673 			vmr = &vm->mmio_region[i];
674 			break;
675 		}
676 	}
677 	if (vmr == NULL)
678 		goto out_user;
679 
680 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
681 	    vmr->read, vmr->write, retu);
682 	return (error);
683 
684 out_user:
685 	*retu = true;
686 	return (0);
687 }
688 
689 void
vm_exit_suspended(struct vcpu * vcpu,uint64_t pc)690 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
691 {
692 	struct vm *vm = vcpu->vm;
693 	struct vm_exit *vmexit;
694 
695 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
696 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
697 
698 	vmexit = vm_exitinfo(vcpu);
699 	vmexit->pc = pc;
700 	vmexit->inst_length = 4;
701 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
702 	vmexit->u.suspended.how = vm->suspend;
703 }
704 
705 void
vm_exit_debug(struct vcpu * vcpu,uint64_t pc)706 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
707 {
708 	struct vm_exit *vmexit;
709 
710 	vmexit = vm_exitinfo(vcpu);
711 	vmexit->pc = pc;
712 	vmexit->inst_length = 4;
713 	vmexit->exitcode = VM_EXITCODE_DEBUG;
714 }
715 
716 static void
restore_guest_fpustate(struct vcpu * vcpu)717 restore_guest_fpustate(struct vcpu *vcpu)
718 {
719 
720 	/* flush host state to the pcb */
721 	vfp_save_state(curthread, curthread->td_pcb);
722 	/* Ensure the VFP state will be re-loaded when exiting the guest */
723 	PCPU_SET(fpcurthread, NULL);
724 
725 	/* restore guest FPU state */
726 	vfp_enable();
727 	vfp_restore(vcpu->guestfpu);
728 
729 	/*
730 	 * The FPU is now "dirty" with the guest's state so turn on emulation
731 	 * to trap any access to the FPU by the host.
732 	 */
733 	vfp_disable();
734 }
735 
736 static void
save_guest_fpustate(struct vcpu * vcpu)737 save_guest_fpustate(struct vcpu *vcpu)
738 {
739 	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
740 	    CPACR_FPEN_TRAP_ALL1)
741 		panic("VFP not enabled in host!");
742 
743 	/* save guest FPU state */
744 	vfp_enable();
745 	vfp_store(vcpu->guestfpu);
746 	vfp_disable();
747 
748 	KASSERT(PCPU_GET(fpcurthread) == NULL,
749 	    ("%s: fpcurthread set with guest registers", __func__));
750 }
751 
752 static void
vcpu_require_state(struct vcpu * vcpu,enum vcpu_state newstate)753 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
754 {
755 	int error;
756 
757 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
758 		panic("Error %d setting state to %d\n", error, newstate);
759 }
760 
761 static void
vcpu_require_state_locked(struct vcpu * vcpu,enum vcpu_state newstate)762 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
763 {
764 	int error;
765 
766 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
767 		panic("Error %d setting state to %d", error, newstate);
768 }
769 
770 int
vm_get_capability(struct vcpu * vcpu,int type,int * retval)771 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
772 {
773 	if (type < 0 || type >= VM_CAP_MAX)
774 		return (EINVAL);
775 
776 	return (vmmops_getcap(vcpu->cookie, type, retval));
777 }
778 
779 int
vm_set_capability(struct vcpu * vcpu,int type,int val)780 vm_set_capability(struct vcpu *vcpu, int type, int val)
781 {
782 	if (type < 0 || type >= VM_CAP_MAX)
783 		return (EINVAL);
784 
785 	return (vmmops_setcap(vcpu->cookie, type, val));
786 }
787 
788 void *
vcpu_get_cookie(struct vcpu * vcpu)789 vcpu_get_cookie(struct vcpu *vcpu)
790 {
791 	return (vcpu->cookie);
792 }
793 
794 int
vm_get_register(struct vcpu * vcpu,int reg,uint64_t * retval)795 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
796 {
797 	if (reg < 0 || reg >= VM_REG_LAST)
798 		return (EINVAL);
799 
800 	return (vmmops_getreg(vcpu->cookie, reg, retval));
801 }
802 
803 int
vm_set_register(struct vcpu * vcpu,int reg,uint64_t val)804 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
805 {
806 	int error;
807 
808 	if (reg < 0 || reg >= VM_REG_LAST)
809 		return (EINVAL);
810 	error = vmmops_setreg(vcpu->cookie, reg, val);
811 	if (error || reg != VM_REG_GUEST_PC)
812 		return (error);
813 
814 	vcpu->nextpc = val;
815 
816 	return (0);
817 }
818 
819 void *
vm_get_cookie(struct vm * vm)820 vm_get_cookie(struct vm *vm)
821 {
822 	return (vm->cookie);
823 }
824 
825 int
vm_inject_exception(struct vcpu * vcpu,uint64_t esr,uint64_t far)826 vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
827 {
828 	return (vmmops_exception(vcpu->cookie, esr, far));
829 }
830 
831 int
vm_attach_vgic(struct vm * vm,struct vm_vgic_descr * descr)832 vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
833 {
834 	return (vgic_attach_to_vm(vm->cookie, descr));
835 }
836 
837 int
vm_assert_irq(struct vm * vm,uint32_t irq)838 vm_assert_irq(struct vm *vm, uint32_t irq)
839 {
840 	return (vgic_inject_irq(vm->cookie, -1, irq, true));
841 }
842 
843 int
vm_deassert_irq(struct vm * vm,uint32_t irq)844 vm_deassert_irq(struct vm *vm, uint32_t irq)
845 {
846 	return (vgic_inject_irq(vm->cookie, -1, irq, false));
847 }
848 
849 int
vm_raise_msi(struct vm * vm,uint64_t msg,uint64_t addr,int bus,int slot,int func)850 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
851     int func)
852 {
853 	/* TODO: Should we raise an SError? */
854 	return (vgic_inject_msi(vm->cookie, msg, addr));
855 }
856 
857 static int
vm_handle_smccc_call(struct vcpu * vcpu,struct vm_exit * vme,bool * retu)858 vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
859 {
860 	struct hypctx *hypctx;
861 	int i;
862 
863 	hypctx = vcpu_get_cookie(vcpu);
864 
865 	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
866 		return (1);
867 
868 	vme->exitcode = VM_EXITCODE_SMCCC;
869 	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
870 	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
871 		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
872 
873 	*retu = true;
874 	return (0);
875 }
876 
877 static int
vm_handle_wfi(struct vcpu * vcpu,struct vm_exit * vme,bool * retu)878 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
879 {
880 	struct vm *vm;
881 
882 	vm = vcpu->vm;
883 	vcpu_lock(vcpu);
884 	while (1) {
885 		if (vm->suspend)
886 			break;
887 
888 		if (vgic_has_pending_irq(vcpu->cookie))
889 			break;
890 
891 		if (vcpu_should_yield(vcpu))
892 			break;
893 
894 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
895 		/*
896 		 * XXX msleep_spin() cannot be interrupted by signals so
897 		 * wake up periodically to check pending signals.
898 		 */
899 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
900 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
901 	}
902 	vcpu_unlock(vcpu);
903 
904 	*retu = false;
905 	return (0);
906 }
907 
908 static int
vm_handle_paging(struct vcpu * vcpu,bool * retu)909 vm_handle_paging(struct vcpu *vcpu, bool *retu)
910 {
911 	struct vm *vm = vcpu->vm;
912 	struct vm_exit *vme;
913 	struct vm_map *map;
914 	uint64_t addr, esr;
915 	pmap_t pmap;
916 	int ftype, rv;
917 
918 	vme = &vcpu->exitinfo;
919 
920 	pmap = vmspace_pmap(vm_vmspace(vcpu->vm));
921 	addr = vme->u.paging.gpa;
922 	esr = vme->u.paging.esr;
923 
924 	/* The page exists, but the page table needs to be updated. */
925 	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
926 		return (0);
927 
928 	switch (ESR_ELx_EXCEPTION(esr)) {
929 	case EXCP_INSN_ABORT_L:
930 	case EXCP_DATA_ABORT_L:
931 		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
932 		break;
933 	default:
934 		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
935 	}
936 
937 	map = &vm_vmspace(vm)->vm_map;
938 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
939 	if (rv != KERN_SUCCESS)
940 		return (EFAULT);
941 
942 	return (0);
943 }
944 
945 static int
vm_handle_suspend(struct vcpu * vcpu,bool * retu)946 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
947 {
948 	struct vm *vm = vcpu->vm;
949 	int error, i;
950 	struct thread *td;
951 
952 	error = 0;
953 	td = curthread;
954 
955 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
956 
957 	/*
958 	 * Wait until all 'active_cpus' have suspended themselves.
959 	 *
960 	 * Since a VM may be suspended at any time including when one or
961 	 * more vcpus are doing a rendezvous we need to call the rendezvous
962 	 * handler while we are waiting to prevent a deadlock.
963 	 */
964 	vcpu_lock(vcpu);
965 	while (error == 0) {
966 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
967 			break;
968 
969 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
970 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
971 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
972 		if (td_ast_pending(td, TDA_SUSPEND)) {
973 			vcpu_unlock(vcpu);
974 			error = thread_check_susp(td, false);
975 			vcpu_lock(vcpu);
976 		}
977 	}
978 	vcpu_unlock(vcpu);
979 
980 	/*
981 	 * Wakeup the other sleeping vcpus and return to userspace.
982 	 */
983 	for (i = 0; i < vm->maxcpus; i++) {
984 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
985 			vcpu_notify_event(vm_vcpu(vm, i));
986 		}
987 	}
988 
989 	*retu = true;
990 	return (error);
991 }
992 
993 int
vm_run(struct vcpu * vcpu)994 vm_run(struct vcpu *vcpu)
995 {
996 	struct vm *vm = vcpu->vm;
997 	struct vm_eventinfo evinfo;
998 	int error, vcpuid;
999 	struct vm_exit *vme;
1000 	bool retu;
1001 	pmap_t pmap;
1002 
1003 	vcpuid = vcpu->vcpuid;
1004 
1005 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1006 		return (EINVAL);
1007 
1008 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1009 		return (EINVAL);
1010 
1011 	pmap = vmspace_pmap(vm_vmspace(vm));
1012 	vme = &vcpu->exitinfo;
1013 	evinfo.rptr = NULL;
1014 	evinfo.sptr = &vm->suspend;
1015 	evinfo.iptr = NULL;
1016 restart:
1017 	critical_enter();
1018 
1019 	restore_guest_fpustate(vcpu);
1020 
1021 	vcpu_require_state(vcpu, VCPU_RUNNING);
1022 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1023 	vcpu_require_state(vcpu, VCPU_FROZEN);
1024 
1025 	save_guest_fpustate(vcpu);
1026 
1027 	critical_exit();
1028 
1029 	if (error == 0) {
1030 		retu = false;
1031 		switch (vme->exitcode) {
1032 		case VM_EXITCODE_INST_EMUL:
1033 			vcpu->nextpc = vme->pc + vme->inst_length;
1034 			error = vm_handle_inst_emul(vcpu, &retu);
1035 			break;
1036 
1037 		case VM_EXITCODE_REG_EMUL:
1038 			vcpu->nextpc = vme->pc + vme->inst_length;
1039 			error = vm_handle_reg_emul(vcpu, &retu);
1040 			break;
1041 
1042 		case VM_EXITCODE_HVC:
1043 			/*
1044 			 * The HVC instruction saves the address for the
1045 			 * next instruction as the return address.
1046 			 */
1047 			vcpu->nextpc = vme->pc;
1048 			/*
1049 			 * The PSCI call can change the exit information in the
1050 			 * case of suspend/reset/poweroff/cpu off/cpu on.
1051 			 */
1052 			error = vm_handle_smccc_call(vcpu, vme, &retu);
1053 			break;
1054 
1055 		case VM_EXITCODE_WFI:
1056 			vcpu->nextpc = vme->pc + vme->inst_length;
1057 			error = vm_handle_wfi(vcpu, vme, &retu);
1058 			break;
1059 
1060 		case VM_EXITCODE_PAGING:
1061 			vcpu->nextpc = vme->pc;
1062 			error = vm_handle_paging(vcpu, &retu);
1063 			break;
1064 
1065 		case VM_EXITCODE_SUSPENDED:
1066 			vcpu->nextpc = vme->pc;
1067 			error = vm_handle_suspend(vcpu, &retu);
1068 			break;
1069 
1070 		default:
1071 			/* Handle in userland */
1072 			vcpu->nextpc = vme->pc;
1073 			retu = true;
1074 			break;
1075 		}
1076 	}
1077 
1078 	if (error == 0 && retu == false)
1079 		goto restart;
1080 
1081 	return (error);
1082 }
1083