xref: /freebsd/sys/arm64/vmm/vmm_arm64.c (revision a0ca4af9455b844c5e094fc1b09b1390ffa979fc)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/smp.h>
33 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/mman.h>
36 #include <sys/pcpu.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/vmem.h>
42 
43 #include <vm/vm.h>
44 #include <vm/pmap.h>
45 #include <vm/vm_extern.h>
46 #include <vm/vm_map.h>
47 #include <vm/vm_page.h>
48 #include <vm/vm_param.h>
49 
50 #include <machine/armreg.h>
51 #include <machine/vm.h>
52 #include <machine/cpufunc.h>
53 #include <machine/cpu.h>
54 #include <machine/machdep.h>
55 #include <machine/vmm.h>
56 #include <machine/vmm_dev.h>
57 #include <machine/atomic.h>
58 #include <machine/hypervisor.h>
59 #include <machine/pmap.h>
60 
61 #include "mmu.h"
62 #include "arm64.h"
63 #include "hyp.h"
64 #include "reset.h"
65 #include "io/vgic.h"
66 #include "io/vgic_v3.h"
67 #include "io/vtimer.h"
68 #include "vmm_stat.h"
69 
/*
 * Return values of the exit handlers in this file
 * (handle_el1_sync_excp() and arm64_handle_world_switch()).
 */
#define	HANDLED		1
#define	UNHANDLED	0

/* Number of bits in an EL2 virtual address */
#define	EL2_VIRT_BITS	48
/* The EL2 address space must be large enough to reach HYP_VM_MAX_ADDRESS. */
CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS);

/* Size of the per-CPU EL2 stack allocated and mapped in vmmops_modinit(). */
/* TODO: Move the host hypctx off the stack */
#define	VMM_STACK_PAGES	4
#define	VMM_STACK_SIZE	(VMM_STACK_PAGES * PAGE_SIZE)

/*
 * Stage 2 configuration chosen in vmmops_modinit():
 *  vmm_pmap_levels:  number of stage 2 page table levels, passed to
 *                    pmap_pinit_stage() and used to pick VTCR_EL2.SL0.
 *  vmm_virt_bits:    stage 2 input address size, used for VTCR_EL2.T0SZ.
 *  vmm_max_ipa_bits: address size derived from the TCR_EL2.PS field, used
 *                    to validate faulting IPAs in handle_el1_sync_excp().
 */
static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits;
82 
/*
 * Register values passed to arm_setup_vectors to set in the hypervisor.
 * Filled in once by vmmops_modinit() and handed to every CPU through
 * smp_rendezvous().
 */
struct vmm_init_regs {
	uint64_t tcr_el2;	/* EL2 stage 1 translation control value */
	uint64_t vtcr_el2;	/* Stage 2 translation control value */
};
88 
/* malloc(9) type used for the hypervisor stacks and the hyp/hypctx objects. */
MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP");

/* Exception vector tables defined outside this file. */
extern char hyp_init_vectors[];
extern char hyp_vectors[];
extern char hyp_stub_vectors[];

/*
 * Physical base and page-rounded length of the identity-mapped hypervisor
 * code; recorded by vmmops_modinit() so vmmops_modcleanup() can unmap it.
 */
static vm_paddr_t hyp_code_base;
static size_t hyp_code_len;

/*
 * Per-CPU hypervisor stacks: the kernel allocation and the EL2 virtual
 * address each one is mapped at.
 */
static char *stack[MAXCPU];
static vm_offset_t stack_hyp_va[MAXCPU];

/* Allocator for EL2 virtual address space, used by el2_map_enter(). */
static vmem_t *el2_mem_alloc;

static void arm_setup_vectors(void *arg);
static void vmm_pmap_clean_stage2_tlbi(void);
static void vmm_pmap_invalidate_range(uint64_t, vm_offset_t, vm_offset_t, bool);
static void vmm_pmap_invalidate_all(uint64_t);

/* Per-CPU pointer to the vcpu context recorded by arm64_set_active_vcpu(). */
DPCPU_DEFINE_STATIC(struct hypctx *, vcpu);
109 
/*
 * Store hypctx in this CPU's "vcpu" per-CPU variable.  Callers in this
 * file pass NULL from the EL2 setup and teardown paths.
 */
static inline void
arm64_set_active_vcpu(struct hypctx *hypctx)
{
	DPCPU_SET(vcpu, hypctx);
}
115 
116 struct hypctx *
117 arm64_get_active_vcpu(void)
118 {
119 	return (DPCPU_GET(vcpu));
120 }
121 
122 static void
123 arm_setup_vectors(void *arg)
124 {
125 	struct vmm_init_regs *el2_regs;
126 	uintptr_t stack_top;
127 	uint32_t sctlr_el2;
128 	register_t daif;
129 
130 	el2_regs = arg;
131 	arm64_set_active_vcpu(NULL);
132 
133 	daif = intr_disable();
134 
135 	/*
136 	 * Install the temporary vectors which will be responsible for
137 	 * initializing the VMM when we next trap into EL2.
138 	 *
139 	 * x0: the exception vector table responsible for hypervisor
140 	 * initialization on the next call.
141 	 */
142 	vmm_call_hyp(vtophys(&vmm_hyp_code));
143 
144 	/* Create and map the hypervisor stack */
145 	stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE;
146 
147 	/*
148 	 * Configure the system control register for EL2:
149 	 *
150 	 * SCTLR_EL2_M: MMU on
151 	 * SCTLR_EL2_C: Data cacheability not affected
152 	 * SCTLR_EL2_I: Instruction cacheability not affected
153 	 * SCTLR_EL2_A: Instruction alignment check
154 	 * SCTLR_EL2_SA: Stack pointer alignment check
155 	 * SCTLR_EL2_WXN: Treat writable memory as execute never
156 	 * ~SCTLR_EL2_EE: Data accesses are little-endian
157 	 */
158 	sctlr_el2 = SCTLR_EL2_RES1;
159 	sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I;
160 	sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA;
161 	sctlr_el2 |= SCTLR_EL2_WXN;
162 	sctlr_el2 &= ~SCTLR_EL2_EE;
163 
164 	/* Special call to initialize EL2 */
165 	vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2,
166 	    sctlr_el2, el2_regs->vtcr_el2);
167 
168 	intr_restore(daif);
169 }
170 
171 static void
172 arm_teardown_vectors(void *arg)
173 {
174 	register_t daif;
175 
176 	/*
177 	 * vmm_cleanup() will disable the MMU. For the next few instructions,
178 	 * before the hardware disables the MMU, one of the following is
179 	 * possible:
180 	 *
181 	 * a. The instruction addresses are fetched with the MMU disabled,
182 	 * and they must represent the actual physical addresses. This will work
183 	 * because we call the vmm_cleanup() function by its physical address.
184 	 *
185 	 * b. The instruction addresses are fetched using the old translation
186 	 * tables. This will work because we have an identity mapping in place
187 	 * in the translation tables and vmm_cleanup() is called by its physical
188 	 * address.
189 	 */
190 	daif = intr_disable();
191 	/* TODO: Invalidate the cache */
192 	vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors));
193 	intr_restore(daif);
194 
195 	arm64_set_active_vcpu(NULL);
196 }
197 
198 static uint64_t
199 vmm_vtcr_el2_sl(u_int levels)
200 {
201 #if PAGE_SIZE == PAGE_SIZE_4K
202 	switch (levels) {
203 	case 2:
204 		return (VTCR_EL2_SL0_4K_LVL2);
205 	case 3:
206 		return (VTCR_EL2_SL0_4K_LVL1);
207 	case 4:
208 		return (VTCR_EL2_SL0_4K_LVL0);
209 	default:
210 		panic("%s: Invalid number of page table levels %u", __func__,
211 		    levels);
212 	}
213 #elif PAGE_SIZE == PAGE_SIZE_16K
214 	switch (levels) {
215 	case 2:
216 		return (VTCR_EL2_SL0_16K_LVL2);
217 	case 3:
218 		return (VTCR_EL2_SL0_16K_LVL1);
219 	case 4:
220 		return (VTCR_EL2_SL0_16K_LVL0);
221 	default:
222 		panic("%s: Invalid number of page table levels %u", __func__,
223 		    levels);
224 	}
225 #else
226 #error Unsupported page size
227 #endif
228 }
229 
/*
 * One-time initialisation of the arm64 VMM backend.
 *
 * Checks that the hardware can be used, builds the EL2 translation tables
 * (identity-mapped hypervisor code plus one stack per CPU), computes the
 * TCR_EL2 and VTCR_EL2 values, installs the hypervisor on every CPU and
 * finally populates the EL2 virtual address allocator.
 *
 * ipinum is not used by this function.
 *
 * Returns 0 on success, ENXIO when virtualization is unavailable, the
 * kernel runs in VHE mode, ID_AA64MMFR0_EL1 cannot be read or the physical
 * address range is too small, ENODEV when no vgic is present and ENOMEM
 * when the EL2 MMU cannot be initialised.
 */
int
vmmops_modinit(int ipinum)
{
	struct vmm_init_regs el2_regs;
	vm_offset_t next_hyp_va;
	vm_paddr_t vmm_base;
	uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field;
	uint64_t cnthctl_el2;
	register_t daif;
	int cpu, i;
	bool rv __diagused;

	if (!virt_enabled()) {
		printf(
		    "vmm: Processor doesn't have support for virtualization\n");
		return (ENXIO);
	}

	/* TODO: Support VHE */
	if (in_vhe()) {
		printf("vmm: VHE is unsupported\n");
		return (ENXIO);
	}

	if (!vgic_present()) {
		printf("vmm: No vgic found\n");
		return (ENODEV);
	}

	if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) {
		printf("vmm: Unable to read ID_AA64MMFR0_EL1\n");
		return (ENXIO);
	}
	pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1);
	/*
	 * Use 3 levels to give us up to 39 bits with 4k pages, or
	 * 47 bits with 16k pages.
	 */
	/* TODO: Check the number of levels for 64k pages */
	vmm_pmap_levels = 3;
	switch (pa_range_field) {
	case ID_AA64MMFR0_PARange_4G:
		printf("vmm: Not enough physical address bits\n");
		return (ENXIO);
	case ID_AA64MMFR0_PARange_64G:
		vmm_virt_bits = 36;
#if PAGE_SIZE == PAGE_SIZE_16K
		vmm_pmap_levels = 2;
#endif
		break;
	default:
		vmm_virt_bits = 39;
		break;
	}
	/* Raw PARange encoding, reused below for the TCR/VTCR PS fields. */
	pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT;

	/* Initialise the EL2 MMU */
	if (!vmmpmap_init()) {
		printf("vmm: Failed to init the EL2 MMU\n");
		return (ENOMEM);
	}

	/* Set up the stage 2 pmap callbacks */
	MPASS(pmap_clean_stage2_tlbi == NULL);
	pmap_clean_stage2_tlbi = vmm_pmap_clean_stage2_tlbi;
	pmap_stage2_invalidate_range = vmm_pmap_invalidate_range;
	pmap_stage2_invalidate_all = vmm_pmap_invalidate_all;

	/*
	 * Create an allocator for the virtual address space used by EL2.
	 * EL2 code is identity-mapped; the allocator is used to find space for
	 * VM structures.
	 */
	el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0, M_WAITOK);

	/* Create the mappings for the hypervisor translation table. */
	hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code);

	/* We need a physical identity mapping for when we activate the MMU */
	hyp_code_base = vmm_base = vtophys(&vmm_hyp_code);
	rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base,
	    VM_PROT_READ | VM_PROT_EXECUTE);
	MPASS(rv);

	/* The stacks start at the first L2 boundary after the code. */
	next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE);

	/*
	 * Create a per-CPU hypervisor stack. Each stack is mapped a page at
	 * a time at the start of its own L2_SIZE slot of EL2 address space.
	 */
	CPU_FOREACH(cpu) {
		stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO);
		stack_hyp_va[cpu] = next_hyp_va;

		for (i = 0; i < VMM_STACK_PAGES; i++) {
			rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i),
			    PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)),
			    VM_PROT_READ | VM_PROT_WRITE);
			MPASS(rv);
		}
		next_hyp_va += L2_SIZE;
	}

	/* Build the EL2 stage 1 translation control register value. */
	el2_regs.tcr_el2 = TCR_EL2_RES1;
	el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT,
	    TCR_EL2_PS_52BITS);
	el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS);
	el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA;
#if PAGE_SIZE == PAGE_SIZE_4K
	el2_regs.tcr_el2 |= TCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
	el2_regs.tcr_el2 |= TCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
	el2_regs.tcr_el2 |= TCR_EL2_SH0_IS;
#endif

	/* Record the address size selected by the PS field just programmed. */
	switch (el2_regs.tcr_el2 & TCR_EL2_PS_MASK) {
	case TCR_EL2_PS_32BITS:
		vmm_max_ipa_bits = 32;
		break;
	case TCR_EL2_PS_36BITS:
		vmm_max_ipa_bits = 36;
		break;
	case TCR_EL2_PS_40BITS:
		vmm_max_ipa_bits = 40;
		break;
	case TCR_EL2_PS_42BITS:
		vmm_max_ipa_bits = 42;
		break;
	case TCR_EL2_PS_44BITS:
		vmm_max_ipa_bits = 44;
		break;
	case TCR_EL2_PS_48BITS:
		vmm_max_ipa_bits = 48;
		break;
	case TCR_EL2_PS_52BITS:
	default:
		vmm_max_ipa_bits = 52;
		break;
	}

	/*
	 * Configure the Stage 2 translation control register:
	 *
	 * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable
	 * normal memory
	 * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable
	 * normal memory
	 * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel
	 * SL0: Starting level selected by vmm_vtcr_el2_sl() from
	 * vmm_pmap_levels
	 * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner
	 * shareable (SMP kernels only)
	 */
	el2_regs.vtcr_el2 = VTCR_EL2_RES1;
	el2_regs.vtcr_el2 |=
	    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT);
	el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA;
	el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits);
	el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels);
#if PAGE_SIZE == PAGE_SIZE_4K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
	el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS;
#endif

	/* Install the hypervisor on every CPU using the values built above. */
	smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs);

	/* Add memory to the vmem allocator (checking there is space) */
	if (vmm_base > (L2_SIZE + PAGE_SIZE)) {
		/*
		 * Ensure there is an L2 block before the vmm code to check
		 * for buffer overflows on earlier data. Include the PAGE_SIZE
		 * of the minimum we can allocate.
		 */
		vmm_base -= L2_SIZE + PAGE_SIZE;
		vmm_base = rounddown2(vmm_base, L2_SIZE);

		/*
		 * Check there is memory before the vmm code to add.
		 *
		 * Reserve the L2 block at address 0 so NULL dereference will
		 * raise an exception.
		 */
		if (vmm_base > L2_SIZE)
			vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE,
			    M_WAITOK);
	}

	/*
	 * Add the memory after the stacks. There is most of an L2 block
	 * between the last stack and the first allocation so this should
	 * be safe without adding more padding.
	 */
	if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE)
		vmem_add(el2_mem_alloc, next_hyp_va,
		    HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK);

	/* Read CNTHCTL_EL2 through the hypervisor; vtimer_init() needs it. */
	daif = intr_disable();
	cnthctl_el2 = vmm_call_hyp(HYP_READ_REGISTER, HYP_REG_CNTHCTL);
	intr_restore(daif);

	vgic_init();
	vtimer_init(cnthctl_el2);

	return (0);
}
441 
442 int
443 vmmops_modcleanup(void)
444 {
445 	int cpu;
446 
447 	smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL);
448 
449 	CPU_FOREACH(cpu) {
450 		vmmpmap_remove(stack_hyp_va[cpu], VMM_STACK_PAGES * PAGE_SIZE,
451 		    false);
452 	}
453 
454 	vmmpmap_remove(hyp_code_base, hyp_code_len, false);
455 
456 	vtimer_cleanup();
457 
458 	vmmpmap_fini();
459 
460 	CPU_FOREACH(cpu)
461 		free(stack[cpu], M_HYP);
462 
463 	pmap_clean_stage2_tlbi = NULL;
464 	pmap_stage2_invalidate_range = NULL;
465 	pmap_stage2_invalidate_all = NULL;
466 
467 	return (0);
468 }
469 
470 static vm_size_t
471 el2_hyp_size(struct vm *vm)
472 {
473 	return (round_page(sizeof(struct hyp) +
474 	    sizeof(struct hypctx *) * vm_get_maxcpus(vm)));
475 }
476 
477 static vm_size_t
478 el2_hypctx_size(void)
479 {
480 	return (round_page(sizeof(struct hypctx)));
481 }
482 
483 static vm_offset_t
484 el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot)
485 {
486 	vmem_addr_t addr;
487 	int err __diagused;
488 	bool rv __diagused;
489 
490 	err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr);
491 	MPASS(err == 0);
492 	rv = vmmpmap_enter(addr, size, vtophys(data), prot);
493 	MPASS(rv);
494 
495 	return (addr);
496 }
497 
498 void *
499 vmmops_init(struct vm *vm, pmap_t pmap)
500 {
501 	struct hyp *hyp;
502 	vm_size_t size;
503 
504 	size = el2_hyp_size(vm);
505 	hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
506 
507 	hyp->vm = vm;
508 	hyp->vgic_attached = false;
509 
510 	vtimer_vminit(hyp);
511 	vgic_vminit(hyp);
512 
513 	hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size,
514 	    VM_PROT_READ | VM_PROT_WRITE);
515 
516 	return (hyp);
517 }
518 
519 void *
520 vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
521 {
522 	struct hyp *hyp = vmi;
523 	struct hypctx *hypctx;
524 	vm_size_t size;
525 
526 	size = el2_hypctx_size();
527 	hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
528 
529 	KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm),
530 	    ("%s: Invalid vcpuid %d", __func__, vcpuid));
531 	hyp->ctx[vcpuid] = hypctx;
532 
533 	hypctx->hyp = hyp;
534 	hypctx->vcpu = vcpu1;
535 
536 	reset_vm_el01_regs(hypctx);
537 	reset_vm_el2_regs(hypctx);
538 
539 	vtimer_cpuinit(hypctx);
540 	vgic_cpuinit(hypctx);
541 
542 	hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size,
543 	    VM_PROT_READ | VM_PROT_WRITE);
544 
545 	return (hypctx);
546 }
547 
/*
 * pmap initialiser handed to vmspace_alloc() by vmmops_vmspace_alloc().
 * Sets the pmap up as a stage 2 pmap with vmm_pmap_levels levels and
 * always returns 1.
 */
static int
arm_vmm_pinit(pmap_t pmap)
{

	pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels);
	return (1);
}
555 
556 struct vmspace *
557 vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max)
558 {
559 	return (vmspace_alloc(min, max, arm_vmm_pinit));
560 }
561 
562 void
563 vmmops_vmspace_free(struct vmspace *vmspace)
564 {
565 
566 	pmap_remove_pages(vmspace_pmap(vmspace));
567 	vmspace_free(vmspace);
568 }
569 
/*
 * Installed as the pmap_clean_stage2_tlbi callback by vmmops_modinit().
 * Forwards the request to EL2 as a HYP_CLEAN_S2_TLBI hypervisor call.
 */
static void
vmm_pmap_clean_stage2_tlbi(void)
{
	vmm_call_hyp(HYP_CLEAN_S2_TLBI);
}
575 
/*
 * Installed as the pmap_stage2_invalidate_range callback by
 * vmmops_modinit().  Passes vttbr, the range [sva, eva) and final_only
 * unchanged to EL2 as a HYP_S2_TLBI_RANGE hypervisor call.  The range
 * must not be empty.
 */
static void
vmm_pmap_invalidate_range(uint64_t vttbr, vm_offset_t sva, vm_offset_t eva,
    bool final_only)
{
	MPASS(eva > sva);
	vmm_call_hyp(HYP_S2_TLBI_RANGE, vttbr, sva, eva, final_only);
}
583 
/*
 * Installed as the pmap_stage2_invalidate_all callback by
 * vmmops_modinit().  Passes vttbr to EL2 as a HYP_S2_TLBI_ALL hypervisor
 * call.
 */
static void
vmm_pmap_invalidate_all(uint64_t vttbr)
{
	vmm_call_hyp(HYP_S2_TLBI_ALL, vttbr);
}
589 
/*
 * Print the exception registers recorded in a vm_exit: the ESR_EL2,
 * FAR_EL2 and HPFAR_EL2 values from vme->u.hyp and vme->pc labelled as
 * elr_el2.  Used when an exit cannot be decoded.
 */
static inline void
arm64_print_hyp_regs(struct vm_exit *vme)
{
	printf("esr_el2:   0x%016lx\n", vme->u.hyp.esr_el2);
	printf("far_el2:   0x%016lx\n", vme->u.hyp.far_el2);
	printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2);
	printf("elr_el2:   0x%016lx\n", vme->pc);
}
598 
599 static void
600 arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss,
601     struct vm_exit *vme_ret)
602 {
603 	struct vm_guest_paging *paging;
604 	struct vie *vie;
605 	uint32_t esr_sas, reg_num;
606 
607 	/*
608 	 * Get the page address from HPFAR_EL2.
609 	 */
610 	vme_ret->u.inst_emul.gpa =
611 	    HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
612 	/* Bits [11:0] are the same as bits [11:0] from the virtual address. */
613 	vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 &
614 	    FAR_EL2_HPFAR_PAGE_MASK;
615 
616 	esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT;
617 	reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT;
618 
619 	vie = &vme_ret->u.inst_emul.vie;
620 	vie->access_size = 1 << esr_sas;
621 	vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0;
622 	vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ;
623 	vie->reg = reg_num;
624 
625 	paging = &vme_ret->u.inst_emul.paging;
626 	paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
627 	paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
628 	paging->tcr_el1 = hypctx->tcr_el1;
629 	paging->tcr2_el1 = hypctx->tcr2_el1;
630 	paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
631 	if ((hypctx->sctlr_el1 & SCTLR_M) != 0)
632 		paging->flags |= VM_GP_MMU_ENABLED;
633 }
634 
635 static void
636 arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret)
637 {
638 	uint32_t reg_num;
639 	struct vre *vre;
640 
641 	/* u.hyp member will be replaced by u.reg_emul */
642 	vre = &vme_ret->u.reg_emul.vre;
643 
644 	vre->inst_syndrome = esr_iss;
645 	/* ARMv8 Architecture Manual, p. D7-2273: 1 means read */
646 	vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE;
647 	reg_num = ISS_MSR_Rt(esr_iss);
648 	vre->reg = reg_num;
649 }
650 
651 void
652 raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc)
653 {
654 	uint64_t esr;
655 
656 	if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t)
657 		esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT;
658 	else
659 		esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT;
660 	/* Set the bit that changes from insn -> data abort */
661 	if (dabort)
662 		esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT;
663 	/* Set the IL bit if set by hardware */
664 	esr |= hypctx->tf.tf_esr & ESR_ELx_IL;
665 
666 	vmmops_exception(hypctx, esr | fsc, far);
667 }
668 
/*
 * Decode a synchronous exception taken from the guest.
 *
 * hypctx:  context of the vcpu that took the exception
 * vme_ret: exit record to fill in for the caller
 * pmap:    not used by this function
 *
 * Returns HANDLED when an abort was injected back into the guest with
 * raise_data_insn_abort(), and UNHANDLED in every other case, with
 * vme_ret->exitcode describing the exit.
 */
static int
handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret,
    pmap_t pmap)
{
	uint64_t gpa;
	uint32_t esr_ec, esr_iss;

	/* Split the syndrome into its exception class and ISS fields. */
	esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr);
	esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK;

	switch (esr_ec) {
	case EXCP_UNKNOWN:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1);
		arm64_print_hyp_regs(vme_ret);
		vme_ret->exitcode = VM_EXITCODE_HYP;
		break;
	case EXCP_TRAP_WFI_WFE:
		/* The low two syndrome bits are zero for WFI. */
		if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */
			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1);
			vme_ret->exitcode = VM_EXITCODE_WFI;
		} else {
			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1);
			vme_ret->exitcode = VM_EXITCODE_HYP;
		}
		break;
	case EXCP_HVC:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1);
		vme_ret->exitcode = VM_EXITCODE_HVC;
		break;
	case EXCP_MSR:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1);
		arm64_gen_reg_emul_data(esr_iss, vme_ret);
		vme_ret->exitcode = VM_EXITCODE_REG_EMUL;
		break;
	case EXCP_BRK:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_BRK, 1);
		vme_ret->exitcode = VM_EXITCODE_BRK;
		break;
	case EXCP_SOFTSTP_EL0:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_SS, 1);
		vme_ret->exitcode = VM_EXITCODE_SS;
		break;
	case EXCP_INSN_ABORT_L:
	case EXCP_DATA_ABORT_L:
		vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ?
		    VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1);
		/* Only translation, access flag and permission faults. */
		switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) {
		case ISS_DATA_DFSC_TF_L0:
		case ISS_DATA_DFSC_TF_L1:
		case ISS_DATA_DFSC_TF_L2:
		case ISS_DATA_DFSC_TF_L3:
		case ISS_DATA_DFSC_AFF_L1:
		case ISS_DATA_DFSC_AFF_L2:
		case ISS_DATA_DFSC_AFF_L3:
		case ISS_DATA_DFSC_PF_L1:
		case ISS_DATA_DFSC_PF_L2:
		case ISS_DATA_DFSC_PF_L3:
			gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
			/* Check the IPA is valid */
			if (gpa >= (1ul << vmm_max_ipa_bits)) {
				/* Out of range: address size fault. */
				raise_data_insn_abort(hypctx,
				    hypctx->exit_info.far_el2,
				    esr_ec == EXCP_DATA_ABORT_L,
				    ISS_DATA_DFSC_ASF_L0);
				vme_ret->inst_length = 0;
				return (HANDLED);
			}

			if (vm_mem_allocated(hypctx->vcpu, gpa)) {
				/* Guest memory: report a paging exit. */
				vme_ret->exitcode = VM_EXITCODE_PAGING;
				vme_ret->inst_length = 0;
				vme_ret->u.paging.esr = hypctx->tf.tf_esr;
				vme_ret->u.paging.gpa = gpa;
			} else if (esr_ec == EXCP_INSN_ABORT_L) {
				/*
				 * Raise an external abort. Device memory is
				 * not executable
				 */
				raise_data_insn_abort(hypctx,
				    hypctx->exit_info.far_el2, false,
				    ISS_DATA_DFSC_EXT);
				vme_ret->inst_length = 0;
				return (HANDLED);
			} else {
				/* Data access outside guest memory: emulate. */
				arm64_gen_inst_emul_data(hypctx, esr_iss,
				    vme_ret);
				vme_ret->exitcode = VM_EXITCODE_INST_EMUL;
			}
			break;
		default:
			arm64_print_hyp_regs(vme_ret);
			vme_ret->exitcode = VM_EXITCODE_HYP;
			break;
		}

		break;

	default:
		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1);
		arm64_print_hyp_regs(vme_ret);
		vme_ret->exitcode = VM_EXITCODE_HYP;
		break;
	}

	/* We don't do any instruction emulation here */
	return (UNHANDLED);
}
776 
777 static int
778 arm64_handle_world_switch(struct hypctx *hypctx, int excp_type,
779     struct vm_exit *vme, pmap_t pmap)
780 {
781 	int handled;
782 
783 	switch (excp_type) {
784 	case EXCP_TYPE_EL1_SYNC:
785 		/* The exit code will be set by handle_el1_sync_excp(). */
786 		handled = handle_el1_sync_excp(hypctx, vme, pmap);
787 		break;
788 
789 	case EXCP_TYPE_EL1_IRQ:
790 	case EXCP_TYPE_EL1_FIQ:
791 		/* The host kernel will handle IRQs and FIQs. */
792 		vmm_stat_incr(hypctx->vcpu,
793 		    excp_type == EXCP_TYPE_EL1_IRQ ? VMEXIT_IRQ : VMEXIT_FIQ,1);
794 		vme->exitcode = VM_EXITCODE_BOGUS;
795 		handled = UNHANDLED;
796 		break;
797 
798 	case EXCP_TYPE_EL1_ERROR:
799 	case EXCP_TYPE_EL2_SYNC:
800 	case EXCP_TYPE_EL2_IRQ:
801 	case EXCP_TYPE_EL2_FIQ:
802 	case EXCP_TYPE_EL2_ERROR:
803 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1);
804 		vme->exitcode = VM_EXITCODE_BOGUS;
805 		handled = UNHANDLED;
806 		break;
807 
808 	default:
809 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1);
810 		vme->exitcode = VM_EXITCODE_BOGUS;
811 		handled = UNHANDLED;
812 		break;
813 	}
814 
815 	return (handled);
816 }
817 
/*
 * Drop the guest page hold recorded in *cookie, if there is one, and
 * clear the cookie so that a second call does nothing.
 */
static void
ptp_release(void **cookie)
{
	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
826 
827 static void *
828 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
829 {
830 	void *ptr;
831 
832 	ptp_release(cookie);
833 	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
834 	return (ptr);
835 }
836 
837 /* log2 of the number of bytes in a page table entry */
838 #define	PTE_SHIFT	3
839 int
840 vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla,
841     int prot, uint64_t *gpa, int *is_fault)
842 {
843 	struct hypctx *hypctx;
844 	void *cookie;
845 	uint64_t mask, *ptep, pte, pte_addr;
846 	int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz;
847 	bool is_el0;
848 
849 	/* Check if the MMU is off */
850 	if ((paging->flags & VM_GP_MMU_ENABLED) == 0) {
851 		*is_fault = 0;
852 		*gpa = gla;
853 		return (0);
854 	}
855 
856 	is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t;
857 
858 	if (ADDR_IS_KERNEL(gla)) {
859 		/* If address translation is disabled raise an exception */
860 		if ((paging->tcr_el1 & TCR_EPD1) != 0) {
861 			*is_fault = 1;
862 			return (0);
863 		}
864 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) {
865 			*is_fault = 1;
866 			return (0);
867 		}
868 		pte_addr = paging->ttbr1_addr;
869 		tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT;
870 		/* Clear the top byte if TBI is on */
871 		if ((paging->tcr_el1 & TCR_TBI1) != 0)
872 			gla |= (0xfful << 56);
873 		switch (paging->tcr_el1 & TCR_TG1_MASK) {
874 		case TCR_TG1_4K:
875 			granule_shift = PAGE_SHIFT_4K;
876 			break;
877 		case TCR_TG1_16K:
878 			granule_shift = PAGE_SHIFT_16K;
879 			break;
880 		case TCR_TG1_64K:
881 			granule_shift = PAGE_SHIFT_64K;
882 			break;
883 		default:
884 			*is_fault = 1;
885 			return (EINVAL);
886 		}
887 	} else {
888 		/* If address translation is disabled raise an exception */
889 		if ((paging->tcr_el1 & TCR_EPD0) != 0) {
890 			*is_fault = 1;
891 			return (0);
892 		}
893 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) {
894 			*is_fault = 1;
895 			return (0);
896 		}
897 		pte_addr = paging->ttbr0_addr;
898 		tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT;
899 		/* Clear the top byte if TBI is on */
900 		if ((paging->tcr_el1 & TCR_TBI0) != 0)
901 			gla &= ~(0xfful << 56);
902 		switch (paging->tcr_el1 & TCR_TG0_MASK) {
903 		case TCR_TG0_4K:
904 			granule_shift = PAGE_SHIFT_4K;
905 			break;
906 		case TCR_TG0_16K:
907 			granule_shift = PAGE_SHIFT_16K;
908 			break;
909 		case TCR_TG0_64K:
910 			granule_shift = PAGE_SHIFT_64K;
911 			break;
912 		default:
913 			*is_fault = 1;
914 			return (EINVAL);
915 		}
916 	}
917 
918 	/*
919 	 * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2
920 	 * for larger values.
921 	 */
922 	switch (granule_shift) {
923 	case PAGE_SHIFT_4K:
924 	case PAGE_SHIFT_16K:
925 		/*
926 		 * See "Table D8-11 4KB granule, determining stage 1 initial
927 		 * lookup level" and "Table D8-21 16KB granule, determining
928 		 * stage 1 initial lookup level" from the "Arm Architecture
929 		 * Reference Manual for A-Profile architecture" revision I.a
930 		 * for the minimum and maximum values.
931 		 *
932 		 * TODO: Support less than 16 when FEAT_LPA2 is implemented
933 		 * and TCR_EL1.DS == 1
934 		 * TODO: Support more than 39 when FEAT_TTST is implemented
935 		 */
936 		if (tsz < 16 || tsz > 39) {
937 			*is_fault = 1;
938 			return (EINVAL);
939 		}
940 		break;
941 	case PAGE_SHIFT_64K:
942 	/* TODO: Support 64k granule. It will probably work, but is untested */
943 	default:
944 		*is_fault = 1;
945 		return (EINVAL);
946 	}
947 
948 	/*
949 	 * Calculate the input address bits. These are 64 bit in an address
950 	 * with the top tsz bits being all 0 or all 1.
951 	  */
952 	ia_bits = 64 - tsz;
953 
954 	/*
955 	 * Calculate the number of address bits used in the page table
956 	 * calculation. This is ia_bits minus the bottom granule_shift
957 	 * bits that are passed to the output address.
958 	 */
959 	address_bits = ia_bits - granule_shift;
960 
961 	/*
962 	 * Calculate the number of levels. Each level uses
963 	 * granule_shift - PTE_SHIFT bits of the input address.
964 	 * This is because the table is 1 << granule_shift and each
965 	 * entry is 1 << PTE_SHIFT bytes.
966 	 */
967 	levels = howmany(address_bits, granule_shift - PTE_SHIFT);
968 
969 	/* Mask of the upper unused bits in the virtual address */
970 	gla &= (1ul << ia_bits) - 1;
971 	hypctx = (struct hypctx *)vcpui;
972 	cookie = NULL;
973 	/* TODO: Check if the level supports block descriptors */
974 	for (;levels > 0; levels--) {
975 		int idx;
976 
977 		pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) +
978 		    granule_shift;
979 		idx = (gla >> pte_shift) &
980 		    ((1ul << (granule_shift - PTE_SHIFT)) - 1);
981 		while (idx > PAGE_SIZE / sizeof(pte)) {
982 			idx -= PAGE_SIZE / sizeof(pte);
983 			pte_addr += PAGE_SIZE;
984 		}
985 
986 		ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie);
987 		if (ptep == NULL)
988 			goto error;
989 		pte = ptep[idx];
990 
991 		/* Calculate the level we are looking at */
992 		switch (levels) {
993 		default:
994 			goto fault;
995 		/* TODO: Level -1 when FEAT_LPA2 is implemented */
996 		case 4: /* Level 0 */
997 			if ((pte & ATTR_DESCR_MASK) != L0_TABLE)
998 				goto fault;
999 			/* FALLTHROUGH */
1000 		case 3: /* Level 1 */
1001 		case 2: /* Level 2 */
1002 			switch (pte & ATTR_DESCR_MASK) {
1003 			/* Use L1 macro as all levels are the same */
1004 			case L1_TABLE:
1005 				/* Check if EL0 can access this address space */
1006 				if (is_el0 &&
1007 				    (pte & TATTR_AP_TABLE_NO_EL0) != 0)
1008 					goto fault;
1009 				/* Check if the address space is writable */
1010 				if ((prot & PROT_WRITE) != 0 &&
1011 				    (pte & TATTR_AP_TABLE_RO) != 0)
1012 					goto fault;
1013 				if ((prot & PROT_EXEC) != 0) {
1014 					/* Check the table exec attribute */
1015 					if ((is_el0 &&
1016 					    (pte & TATTR_UXN_TABLE) != 0) ||
1017 					    (!is_el0 &&
1018 					     (pte & TATTR_PXN_TABLE) != 0))
1019 						goto fault;
1020 				}
1021 				pte_addr = pte & ~ATTR_MASK;
1022 				break;
1023 			case L1_BLOCK:
1024 				goto done;
1025 			default:
1026 				goto fault;
1027 			}
1028 			break;
1029 		case 1: /* Level 3 */
1030 			if ((pte & ATTR_DESCR_MASK) == L3_PAGE)
1031 				goto done;
1032 			goto fault;
1033 		}
1034 	}
1035 
1036 done:
1037 	/* Check if EL0 has access to the block/page */
1038 	if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0)
1039 		goto fault;
1040 	if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0)
1041 		goto fault;
1042 	if ((prot & PROT_EXEC) != 0) {
1043 		if ((is_el0 && (pte & ATTR_S1_UXN) != 0) ||
1044 		    (!is_el0 && (pte & ATTR_S1_PXN) != 0))
1045 			goto fault;
1046 	}
1047 	mask = (1ul << pte_shift) - 1;
1048 	*gpa = (pte & ~ATTR_MASK) | (gla & mask);
1049 	*is_fault = 0;
1050 	ptp_release(&cookie);
1051 	return (0);
1052 
1053 error:
1054 	ptp_release(&cookie);
1055 	return (EFAULT);
1056 fault:
1057 	*is_fault = 1;
1058 	ptp_release(&cookie);
1059 	return (0);
1060 }
1061 
1062 int
1063 vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo)
1064 {
1065 	uint64_t excp_type;
1066 	int handled;
1067 	register_t daif;
1068 	struct hyp *hyp;
1069 	struct hypctx *hypctx;
1070 	struct vcpu *vcpu;
1071 	struct vm_exit *vme;
1072 	int mode;
1073 
1074 	hypctx = (struct hypctx *)vcpui;
1075 	hyp = hypctx->hyp;
1076 	vcpu = hypctx->vcpu;
1077 	vme = vm_exitinfo(vcpu);
1078 
1079 	hypctx->tf.tf_elr = (uint64_t)pc;
1080 
1081 	for (;;) {
1082 		if (hypctx->has_exception) {
1083 			hypctx->has_exception = false;
1084 			hypctx->elr_el1 = hypctx->tf.tf_elr;
1085 
1086 			mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
1087 
1088 			if (mode == PSR_M_EL1t) {
1089 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0;
1090 			} else if (mode == PSR_M_EL1h) {
1091 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200;
1092 			} else if ((mode & PSR_M_32) == PSR_M_64) {
1093 				/* 64-bit EL0 */
1094 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400;
1095 			} else {
1096 				/* 32-bit EL0 */
1097 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600;
1098 			}
1099 
1100 			/* Set the new spsr */
1101 			hypctx->spsr_el1 = hypctx->tf.tf_spsr;
1102 
1103 			/* Set the new cpsr */
1104 			hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS;
1105 			hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h;
1106 
1107 			/*
1108 			 * Update fields that may change on exeption entry
1109 			 * based on how sctlr_el1 is configured.
1110 			 */
1111 			if ((hypctx->sctlr_el1 & SCTLR_SPAN) != 0)
1112 				hypctx->tf.tf_spsr |= PSR_PAN;
1113 			if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0)
1114 				hypctx->tf.tf_spsr &= ~PSR_SSBS;
1115 			else
1116 				hypctx->tf.tf_spsr |= PSR_SSBS;
1117 		}
1118 
1119 		daif = intr_disable();
1120 
1121 		/* Check if the vcpu is suspended */
1122 		if (vcpu_suspended(evinfo)) {
1123 			intr_restore(daif);
1124 			vm_exit_suspended(vcpu, pc);
1125 			break;
1126 		}
1127 
1128 		if (vcpu_debugged(vcpu)) {
1129 			intr_restore(daif);
1130 			vm_exit_debug(vcpu, pc);
1131 			break;
1132 		}
1133 
1134 		/* Activate the stage2 pmap so the vmid is valid */
1135 		pmap_activate_vm(pmap);
1136 		hyp->vttbr_el2 = pmap_to_ttbr0(pmap);
1137 
1138 		/*
1139 		 * TODO: What happens if a timer interrupt is asserted exactly
1140 		 * here, but for the previous VM?
1141 		 */
1142 		arm64_set_active_vcpu(hypctx);
1143 		vgic_flush_hwstate(hypctx);
1144 
1145 		/* Call into EL2 to switch to the guest */
1146 		excp_type = vmm_call_hyp(HYP_ENTER_GUEST,
1147 		    hyp->el2_addr, hypctx->el2_addr);
1148 
1149 		vgic_sync_hwstate(hypctx);
1150 		vtimer_sync_hwstate(hypctx);
1151 
1152 		/*
1153 		 * Deactivate the stage2 pmap. vmm_pmap_clean_stage2_tlbi
1154 		 * depends on this meaning we activate the VM before entering
1155 		 * the vm again
1156 		 */
1157 		PCPU_SET(curvmpmap, NULL);
1158 		intr_restore(daif);
1159 
1160 		vmm_stat_incr(vcpu, VMEXIT_COUNT, 1);
1161 		if (excp_type == EXCP_TYPE_MAINT_IRQ)
1162 			continue;
1163 
1164 		vme->pc = hypctx->tf.tf_elr;
1165 		vme->inst_length = INSN_SIZE;
1166 		vme->u.hyp.exception_nr = excp_type;
1167 		vme->u.hyp.esr_el2 = hypctx->tf.tf_esr;
1168 		vme->u.hyp.far_el2 = hypctx->exit_info.far_el2;
1169 		vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2;
1170 
1171 		handled = arm64_handle_world_switch(hypctx, excp_type, vme,
1172 		    pmap);
1173 		if (handled == UNHANDLED)
1174 			/* Exit loop to emulate instruction. */
1175 			break;
1176 		else
1177 			/* Resume guest execution from the next instruction. */
1178 			hypctx->tf.tf_elr += vme->inst_length;
1179 	}
1180 
1181 	return (0);
1182 }
1183 
1184 static void
1185 arm_pcpu_vmcleanup(void *arg)
1186 {
1187 	struct hyp *hyp;
1188 	int i, maxcpus;
1189 
1190 	hyp = arg;
1191 	maxcpus = vm_get_maxcpus(hyp->vm);
1192 	for (i = 0; i < maxcpus; i++) {
1193 		if (arm64_get_active_vcpu() == hyp->ctx[i]) {
1194 			arm64_set_active_vcpu(NULL);
1195 			break;
1196 		}
1197 	}
1198 }
1199 
1200 void
1201 vmmops_vcpu_cleanup(void *vcpui)
1202 {
1203 	struct hypctx *hypctx = vcpui;
1204 
1205 	vtimer_cpucleanup(hypctx);
1206 	vgic_cpucleanup(hypctx);
1207 
1208 	vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true);
1209 
1210 	free(hypctx, M_HYP);
1211 }
1212 
1213 void
1214 vmmops_cleanup(void *vmi)
1215 {
1216 	struct hyp *hyp = vmi;
1217 
1218 	vtimer_vmcleanup(hyp);
1219 	vgic_vmcleanup(hyp);
1220 
1221 	smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp);
1222 
1223 	vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true);
1224 
1225 	free(hyp, M_HYP);
1226 }
1227 
1228 /*
1229  * Return register value. Registers have different sizes and an explicit cast
1230  * must be made to ensure proper conversion.
1231  */
1232 static uint64_t *
1233 hypctx_regptr(struct hypctx *hypctx, int reg)
1234 {
1235 	switch (reg) {
1236 	case VM_REG_GUEST_X0 ... VM_REG_GUEST_X29:
1237 		return (&hypctx->tf.tf_x[reg]);
1238 	case VM_REG_GUEST_LR:
1239 		return (&hypctx->tf.tf_lr);
1240 	case VM_REG_GUEST_SP:
1241 		return (&hypctx->tf.tf_sp);
1242 	case VM_REG_GUEST_CPSR:
1243 		return (&hypctx->tf.tf_spsr);
1244 	case VM_REG_GUEST_PC:
1245 		return (&hypctx->tf.tf_elr);
1246 	case VM_REG_GUEST_SCTLR_EL1:
1247 		return (&hypctx->sctlr_el1);
1248 	case VM_REG_GUEST_TTBR0_EL1:
1249 		return (&hypctx->ttbr0_el1);
1250 	case VM_REG_GUEST_TTBR1_EL1:
1251 		return (&hypctx->ttbr1_el1);
1252 	case VM_REG_GUEST_TCR_EL1:
1253 		return (&hypctx->tcr_el1);
1254 	case VM_REG_GUEST_TCR2_EL1:
1255 		return (&hypctx->tcr2_el1);
1256 	default:
1257 		break;
1258 	}
1259 	return (NULL);
1260 }
1261 
1262 int
1263 vmmops_getreg(void *vcpui, int reg, uint64_t *retval)
1264 {
1265 	uint64_t *regp;
1266 	int running, hostcpu;
1267 	struct hypctx *hypctx = vcpui;
1268 
1269 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1270 	if (running && hostcpu != curcpu)
1271 		panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm),
1272 		    vcpu_vcpuid(hypctx->vcpu));
1273 
1274 	regp = hypctx_regptr(hypctx, reg);
1275 	if (regp == NULL)
1276 		return (EINVAL);
1277 
1278 	*retval = *regp;
1279 	return (0);
1280 }
1281 
1282 int
1283 vmmops_setreg(void *vcpui, int reg, uint64_t val)
1284 {
1285 	uint64_t *regp;
1286 	struct hypctx *hypctx = vcpui;
1287 	int running, hostcpu;
1288 
1289 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1290 	if (running && hostcpu != curcpu)
1291 		panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm),
1292 		    vcpu_vcpuid(hypctx->vcpu));
1293 
1294 	regp = hypctx_regptr(hypctx, reg);
1295 	if (regp == NULL)
1296 		return (EINVAL);
1297 
1298 	*regp = val;
1299 	return (0);
1300 }
1301 
1302 int
1303 vmmops_exception(void *vcpui, uint64_t esr, uint64_t far)
1304 {
1305 	struct hypctx *hypctx = vcpui;
1306 	int running, hostcpu;
1307 
1308 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1309 	if (running && hostcpu != curcpu)
1310 		panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm),
1311 		    vcpu_vcpuid(hypctx->vcpu));
1312 
1313 	hypctx->far_el1 = far;
1314 	hypctx->esr_el1 = esr;
1315 	hypctx->has_exception = true;
1316 
1317 	return (0);
1318 }
1319 
1320 int
1321 vmmops_getcap(void *vcpui, int num, int *retval)
1322 {
1323 	struct hypctx *hypctx = vcpui;
1324 	int ret;
1325 
1326 	ret = ENOENT;
1327 
1328 	switch (num) {
1329 	case VM_CAP_UNRESTRICTED_GUEST:
1330 		*retval = 1;
1331 		ret = 0;
1332 		break;
1333 	case VM_CAP_BRK_EXIT:
1334 	case VM_CAP_SS_EXIT:
1335 	case VM_CAP_MASK_HWINTR:
1336 		*retval = (hypctx->setcaps & (1ul << num)) != 0;
1337 		break;
1338 	default:
1339 		break;
1340 	}
1341 
1342 	return (ret);
1343 }
1344 
1345 int
1346 vmmops_setcap(void *vcpui, int num, int val)
1347 {
1348 	struct hypctx *hypctx = vcpui;
1349 	int ret;
1350 
1351 	ret = 0;
1352 
1353 	switch (num) {
1354 	case VM_CAP_BRK_EXIT:
1355 		if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0)
1356 			break;
1357 		if (val != 0)
1358 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1359 		else
1360 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1361 		break;
1362 	case VM_CAP_SS_EXIT:
1363 		if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0)
1364 			break;
1365 
1366 		if (val != 0) {
1367 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr & PSR_SS);
1368 			hypctx->debug_mdscr |= hypctx->mdscr_el1 &
1369 			    (MDSCR_SS | MDSCR_KDE);
1370 
1371 			hypctx->tf.tf_spsr |= PSR_SS;
1372 			hypctx->mdscr_el1 |= MDSCR_SS | MDSCR_KDE;
1373 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1374 		} else {
1375 			hypctx->tf.tf_spsr &= ~PSR_SS;
1376 			hypctx->tf.tf_spsr |= hypctx->debug_spsr;
1377 			hypctx->debug_spsr &= ~PSR_SS;
1378 			hypctx->mdscr_el1 &= ~(MDSCR_SS | MDSCR_KDE);
1379 			hypctx->mdscr_el1 |= hypctx->debug_mdscr;
1380 			hypctx->debug_mdscr &= ~(MDSCR_SS | MDSCR_KDE);
1381 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1382 		}
1383 		break;
1384 	case VM_CAP_MASK_HWINTR:
1385 		if ((val != 0) == (hypctx->setcaps & (1ul << num)) != 0)
1386 			break;
1387 
1388 		if (val != 0) {
1389 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr &
1390 			    (PSR_I | PSR_F));
1391 			hypctx->tf.tf_spsr |= PSR_I | PSR_F;
1392 		} else {
1393 			hypctx->tf.tf_spsr &= ~(PSR_I | PSR_F);
1394 			hypctx->tf.tf_spsr |= (hypctx->debug_spsr &
1395 			    (PSR_I | PSR_F));
1396 			hypctx->debug_spsr &= ~(PSR_I | PSR_F);
1397 		}
1398 		break;
1399 	default:
1400 		ret = ENOENT;
1401 		break;
1402 	}
1403 
1404 	if (ret == 0) {
1405 		if (val == 0)
1406 			hypctx->setcaps &= ~(1ul << num);
1407 		else
1408 			hypctx->setcaps |= (1ul << num);
1409 	}
1410 
1411 	return (ret);
1412 }
1413