xref: /freebsd/sys/arm64/vmm/vmm_arm64.c (revision 1b9cfd6a625dc82611846cb9a53c1886f7af3758)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/smp.h>
33 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/mman.h>
36 #include <sys/pcpu.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/vmem.h>
42 
43 #include <vm/vm.h>
44 #include <vm/pmap.h>
45 #include <vm/vm_extern.h>
46 #include <vm/vm_map.h>
47 #include <vm/vm_page.h>
48 #include <vm/vm_param.h>
49 
50 #include <machine/armreg.h>
51 #include <machine/vm.h>
52 #include <machine/cpufunc.h>
53 #include <machine/cpu.h>
54 #include <machine/machdep.h>
55 #include <machine/vmm.h>
56 #include <machine/vmm_dev.h>
57 #include <machine/atomic.h>
58 #include <machine/hypervisor.h>
59 #include <machine/pmap.h>
60 
61 #include "mmu.h"
62 #include "arm64.h"
63 #include "hyp.h"
64 #include "reset.h"
65 #include "io/vgic.h"
66 #include "io/vgic_v3.h"
67 #include "io/vtimer.h"
68 #include "vmm_handlers.h"
69 #include "vmm_stat.h"
70 
/*
 * Return values used by the exit handlers in this file: HANDLED means the
 * exit was dealt with here (e.g. an abort was injected into the guest),
 * UNHANDLED means the exit information has been filled in for the caller.
 */
#define	HANDLED		1
#define	UNHANDLED	0

/* Number of bits in an EL2 virtual address */
#define	EL2_VIRT_BITS	48
/* The EL2 address space must be large enough to reach HYP_VM_MAX_ADDRESS. */
CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS);

/* TODO: Move the host hypctx off the stack */
/* Size of each per-CPU EL2 stack, in pages and in bytes. */
#define	VMM_STACK_PAGES	4
#define	VMM_STACK_SIZE	(VMM_STACK_PAGES * PAGE_SIZE)

/*
 * Stage 2 translation parameters, all chosen in vmmops_modinit():
 *  vmm_pmap_levels:  number of page table levels, passed to
 *                    pmap_pinit_stage() and used to select VTCR_EL2.SL0.
 *  vmm_virt_bits:    guest IPA width used to program VTCR_EL2.T0SZ.
 *  vmm_max_ipa_bits: upper bound used to validate the faulting IPA in
 *                    handle_el1_sync_excp().
 */
static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits;
83 
/*
 * Register values passed to arm_setup_vectors to set in the hypervisor.
 *
 * The structure is filled in by vmmops_modinit() and handed to every CPU
 * through smp_rendezvous().
 */
struct vmm_init_regs {
	/* EL2 translation control; only consumed on the non-VHE path. */
	uint64_t tcr_el2;
	/* Stage 2 translation control; consumed on both VHE and non-VHE. */
	uint64_t vtcr_el2;
};
89 
/* malloc(9) type used for every allocation made by this file. */
MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP");

/* Vector tables defined outside this file. */
extern char hyp_init_vectors[];
extern char hyp_vectors[];
extern char hyp_stub_vectors[];

/*
 * Physical base and page-rounded length of the identity-mapped hypervisor
 * code.  Set in vmmops_modinit() and used by vmmops_modcleanup() to remove
 * the mapping again.
 */
static vm_paddr_t hyp_code_base;
static size_t hyp_code_len;

/*
 * Per-CPU EL2 stacks: stack[] holds the kernel allocation and
 * stack_hyp_va[] the EL2 virtual address it is mapped at.
 */
static char *stack[MAXCPU];
static vm_offset_t stack_hyp_va[MAXCPU];

/*
 * vmem arena handing out EL2 virtual addresses for VM structures; created
 * in vmmops_modinit() on the non-VHE path and used by el2_map_enter().
 */
static vmem_t *el2_mem_alloc;

static void arm_setup_vectors(void *arg);

/* Per-CPU pointer to the vcpu context most recently made active there. */
DPCPU_DEFINE_STATIC(struct hypctx *, vcpu);
107 
/*
 * Record hypctx in the current CPU's per-CPU "vcpu" slot.  Within this file
 * NULL is stored at EL2 setup and teardown, and the running vcpu's context
 * is stored just before entering the guest.
 */
static inline void
arm64_set_active_vcpu(struct hypctx *hypctx)
{
	DPCPU_SET(vcpu, hypctx);
}
113 
114 struct hypctx *
115 arm64_get_active_vcpu(void)
116 {
117 	return (DPCPU_GET(vcpu));
118 }
119 
120 static void
121 arm_setup_vectors(void *arg)
122 {
123 	struct vmm_init_regs *el2_regs;
124 	uintptr_t stack_top;
125 	uint32_t sctlr_el2;
126 	register_t daif;
127 
128 	el2_regs = arg;
129 	arm64_set_active_vcpu(NULL);
130 
131 	/*
132 	 * Configure the system control register for EL2:
133 	 *
134 	 * SCTLR_EL2_M: MMU on
135 	 * SCTLR_EL2_C: Data cacheability not affected
136 	 * SCTLR_EL2_I: Instruction cacheability not affected
137 	 * SCTLR_EL2_A: Instruction alignment check
138 	 * SCTLR_EL2_SA: Stack pointer alignment check
139 	 * SCTLR_EL2_WXN: Treat writable memory as execute never
140 	 * ~SCTLR_EL2_EE: Data accesses are little-endian
141 	 */
142 	sctlr_el2 = SCTLR_EL2_RES1;
143 	sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I;
144 	sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA;
145 	sctlr_el2 |= SCTLR_EL2_WXN;
146 	sctlr_el2 &= ~SCTLR_EL2_EE;
147 
148 	daif = intr_disable();
149 
150 	if (in_vhe()) {
151 		WRITE_SPECIALREG(vtcr_el2, el2_regs->vtcr_el2);
152 	} else {
153 		/*
154 		 * Install the temporary vectors which will be responsible for
155 		 * initializing the VMM when we next trap into EL2.
156 		 *
157 		 * x0: the exception vector table responsible for hypervisor
158 		 * initialization on the next call.
159 		 */
160 		vmm_call_hyp(vtophys(&vmm_hyp_code));
161 
162 		/* Create and map the hypervisor stack */
163 		stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE;
164 
165 		/* Special call to initialize EL2 */
166 		vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2,
167 		    sctlr_el2, el2_regs->vtcr_el2);
168 	}
169 
170 	intr_restore(daif);
171 }
172 
173 static void
174 arm_teardown_vectors(void *arg)
175 {
176 	register_t daif;
177 
178 	/*
179 	 * vmm_cleanup() will disable the MMU. For the next few instructions,
180 	 * before the hardware disables the MMU, one of the following is
181 	 * possible:
182 	 *
183 	 * a. The instruction addresses are fetched with the MMU disabled,
184 	 * and they must represent the actual physical addresses. This will work
185 	 * because we call the vmm_cleanup() function by its physical address.
186 	 *
187 	 * b. The instruction addresses are fetched using the old translation
188 	 * tables. This will work because we have an identity mapping in place
189 	 * in the translation tables and vmm_cleanup() is called by its physical
190 	 * address.
191 	 */
192 	daif = intr_disable();
193 	/* TODO: Invalidate the cache */
194 	vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors));
195 	intr_restore(daif);
196 
197 	arm64_set_active_vcpu(NULL);
198 }
199 
200 static uint64_t
201 vmm_vtcr_el2_sl(u_int levels)
202 {
203 #if PAGE_SIZE == PAGE_SIZE_4K
204 	switch (levels) {
205 	case 2:
206 		return (VTCR_EL2_SL0_4K_LVL2);
207 	case 3:
208 		return (VTCR_EL2_SL0_4K_LVL1);
209 	case 4:
210 		return (VTCR_EL2_SL0_4K_LVL0);
211 	default:
212 		panic("%s: Invalid number of page table levels %u", __func__,
213 		    levels);
214 	}
215 #elif PAGE_SIZE == PAGE_SIZE_16K
216 	switch (levels) {
217 	case 2:
218 		return (VTCR_EL2_SL0_16K_LVL2);
219 	case 3:
220 		return (VTCR_EL2_SL0_16K_LVL1);
221 	case 4:
222 		return (VTCR_EL2_SL0_16K_LVL0);
223 	default:
224 		panic("%s: Invalid number of page table levels %u", __func__,
225 		    levels);
226 	}
227 #else
228 #error Unsupported page size
229 #endif
230 }
231 
/*
 * Module-load initialisation of the arm64 VMM backend.
 *
 * Checks that the hardware can virtualise (EL2 and a vgic are present and
 * the physical address range is large enough), chooses the stage 2
 * translation geometry, installs the stage 2 TLB callbacks in pmap, and
 * initialises EL2 on every CPU.  Without VHE it additionally builds the EL2
 * address space: an identity mapping of the hypervisor code, one stack per
 * CPU, and a vmem arena for later VM structure mappings.
 *
 * ipinum is not used by this function.
 *
 * Returns 0 on success, ENXIO or ENODEV when the hardware is unsuitable,
 * and ENOMEM when the EL2 MMU cannot be initialised.
 */
int
vmmops_modinit(int ipinum)
{
	struct vmm_init_regs el2_regs;
	vm_offset_t next_hyp_va;
	vm_paddr_t vmm_base;
	uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field;
	uint64_t cnthctl_el2;
	int cpu, i;
	bool rv __diagused;

	if (!has_hyp()) {
		printf(
		    "vmm: Processor doesn't have support for virtualization\n");
		return (ENXIO);
	}

	if (!vgic_present()) {
		printf("vmm: No vgic found\n");
		return (ENODEV);
	}

	if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) {
		printf("vmm: Unable to read ID_AA64MMFR0_EL1\n");
		return (ENXIO);
	}
	pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1);
	/*
	 * Use 3 levels to give us up to 39 bits with 4k pages, or
	 * 47 bits with 16k pages.
	 */
	/* TODO: Check the number of levels for 64k pages */
	vmm_pmap_levels = 3;
	switch (pa_range_field) {
	case ID_AA64MMFR0_PARange_4G:
		printf("vmm: Not enough physical address bits\n");
		return (ENXIO);
	case ID_AA64MMFR0_PARange_64G:
		vmm_virt_bits = 36;
#if PAGE_SIZE == PAGE_SIZE_16K
		vmm_pmap_levels = 2;
#endif
		break;
	default:
		vmm_virt_bits = 39;
		break;
	}
	/* Raw PARange encoding, reused below for the TCR/VTCR PS fields. */
	pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT;

	if (!in_vhe()) {
		/* Initialise the EL2 MMU */
		if (!vmmpmap_init()) {
			printf("vmm: Failed to init the EL2 MMU\n");
			return (ENOMEM);
		}
	}

	/* Set up the stage 2 pmap callbacks */
	MPASS(pmap_clean_stage2_tlbi == NULL);
	pmap_clean_stage2_tlbi = vmm_clean_s2_tlbi;
	pmap_stage2_invalidate_range = vmm_s2_tlbi_range;
	pmap_stage2_invalidate_all = vmm_s2_tlbi_all;

	if (!in_vhe()) {
		/*
		 * Create an allocator for the virtual address space used by
		 * EL2. EL2 code is identity-mapped; the allocator is used to
		 * find space for VM structures.
		 */
		el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0,
		    M_WAITOK);

		/* Create the mappings for the hypervisor translation table. */
		hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code);

		/* We need a physical identity mapping for when we activate the MMU */
		hyp_code_base = vmm_base = vtophys(&vmm_hyp_code);
		rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base,
		    VM_PROT_READ | VM_PROT_EXECUTE);
		MPASS(rv);

		/* The stacks start at the next L2 boundary after the code. */
		next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE);

		/* Create a per-CPU hypervisor stack */
		CPU_FOREACH(cpu) {
			stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO);
			stack_hyp_va[cpu] = next_hyp_va;

			/* Map the stack one page at a time. */
			for (i = 0; i < VMM_STACK_PAGES; i++) {
				rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i),
				    PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)),
				    VM_PROT_READ | VM_PROT_WRITE);
				MPASS(rv);
			}
			/* Each stack gets its own L2-sized slot of EL2 VA. */
			next_hyp_va += L2_SIZE;
		}

		/* Build the TCR_EL2 value handed to the EL2 init call. */
		el2_regs.tcr_el2 = TCR_EL2_RES1;
		el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT,
		    TCR_EL2_PS_52BITS);
		el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS);
		el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA;
#if PAGE_SIZE == PAGE_SIZE_4K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
		el2_regs.tcr_el2 |= TCR_EL2_SH0_IS;
#endif
	}

	/* Record the largest IPA width the physical address range allows. */
	switch (pa_range_bits << TCR_EL2_PS_SHIFT) {
	case TCR_EL2_PS_32BITS:
		vmm_max_ipa_bits = 32;
		break;
	case TCR_EL2_PS_36BITS:
		vmm_max_ipa_bits = 36;
		break;
	case TCR_EL2_PS_40BITS:
		vmm_max_ipa_bits = 40;
		break;
	case TCR_EL2_PS_42BITS:
		vmm_max_ipa_bits = 42;
		break;
	case TCR_EL2_PS_44BITS:
		vmm_max_ipa_bits = 44;
		break;
	case TCR_EL2_PS_48BITS:
		vmm_max_ipa_bits = 48;
		break;
	case TCR_EL2_PS_52BITS:
	default:
		vmm_max_ipa_bits = 52;
		break;
	}

	/*
	 * Configure the Stage 2 translation control register:
	 *
	 * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable
	 * normal memory
	 * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable
	 * normal memory
	 * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel
	 * SL0: starting level chosen by vmm_vtcr_el2_sl() from
	 * vmm_pmap_levels
	 * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner
	 * shareable
	 */
	el2_regs.vtcr_el2 = VTCR_EL2_RES1;
	el2_regs.vtcr_el2 |=
	    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT);
	el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA;
	el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits);
	el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels);
#if PAGE_SIZE == PAGE_SIZE_4K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
	el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS;
#endif

	/* Program EL2 on every CPU with the values computed above. */
	smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs);

	if (!in_vhe()) {
		/* Add memory to the vmem allocator (checking there is space) */
		if (vmm_base > (L2_SIZE + PAGE_SIZE)) {
			/*
			 * Ensure there is an L2 block before the vmm code to check
			 * for buffer overflows on earlier data. Include the PAGE_SIZE
			 * of the minimum we can allocate.
			 */
			vmm_base -= L2_SIZE + PAGE_SIZE;
			vmm_base = rounddown2(vmm_base, L2_SIZE);

			/*
			 * Check there is memory before the vmm code to add.
			 *
			 * Reserve the L2 block at address 0 so NULL dereference will
			 * raise an exception.
			 */
			if (vmm_base > L2_SIZE)
				vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE,
				    M_WAITOK);
		}

		/*
		 * Add the memory after the stacks. There is most of an L2 block
		 * between the last stack and the first allocation so this should
		 * be safe without adding more padding.
		 */
		if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE)
			vmem_add(el2_mem_alloc, next_hyp_va,
			    HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK);
	}
	/* The timer code is initialised with the current CNTHCTL_EL2 value. */
	cnthctl_el2 = vmm_read_reg(HYP_REG_CNTHCTL);

	vgic_init();
	vtimer_init(cnthctl_el2);

	return (0);
}
440 
441 int
442 vmmops_modcleanup(void)
443 {
444 	int cpu;
445 
446 	if (!in_vhe()) {
447 		smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL);
448 
449 		CPU_FOREACH(cpu) {
450 			vmmpmap_remove(stack_hyp_va[cpu],
451 			    VMM_STACK_PAGES * PAGE_SIZE, false);
452 		}
453 
454 		vmmpmap_remove(hyp_code_base, hyp_code_len, false);
455 	}
456 
457 	vtimer_cleanup();
458 
459 	if (!in_vhe()) {
460 		vmmpmap_fini();
461 
462 		CPU_FOREACH(cpu)
463 			free(stack[cpu], M_HYP);
464 	}
465 
466 	pmap_clean_stage2_tlbi = NULL;
467 	pmap_stage2_invalidate_range = NULL;
468 	pmap_stage2_invalidate_all = NULL;
469 
470 	return (0);
471 }
472 
473 static vm_size_t
474 el2_hyp_size(struct vm *vm)
475 {
476 	return (round_page(sizeof(struct hyp) +
477 	    sizeof(struct hypctx *) * vm_get_maxcpus(vm)));
478 }
479 
480 static vm_size_t
481 el2_hypctx_size(void)
482 {
483 	return (round_page(sizeof(struct hypctx)));
484 }
485 
486 static vm_offset_t
487 el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot)
488 {
489 	vmem_addr_t addr;
490 	int err __diagused;
491 	bool rv __diagused;
492 
493 	err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr);
494 	MPASS(err == 0);
495 	rv = vmmpmap_enter(addr, size, vtophys(data), prot);
496 	MPASS(rv);
497 
498 	return (addr);
499 }
500 
501 void *
502 vmmops_init(struct vm *vm, pmap_t pmap)
503 {
504 	struct hyp *hyp;
505 	vm_size_t size;
506 
507 	size = el2_hyp_size(vm);
508 	hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
509 
510 	hyp->vm = vm;
511 	hyp->vgic_attached = false;
512 
513 	vtimer_vminit(hyp);
514 	vgic_vminit(hyp);
515 
516 	if (!in_vhe())
517 		hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size,
518 		    VM_PROT_READ | VM_PROT_WRITE);
519 
520 	return (hyp);
521 }
522 
523 void *
524 vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
525 {
526 	struct hyp *hyp = vmi;
527 	struct hypctx *hypctx;
528 	vm_size_t size;
529 
530 	size = el2_hypctx_size();
531 	hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
532 
533 	KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm),
534 	    ("%s: Invalid vcpuid %d", __func__, vcpuid));
535 	hyp->ctx[vcpuid] = hypctx;
536 
537 	hypctx->hyp = hyp;
538 	hypctx->vcpu = vcpu1;
539 
540 	reset_vm_el01_regs(hypctx);
541 	reset_vm_el2_regs(hypctx);
542 
543 	vtimer_cpuinit(hypctx);
544 	vgic_cpuinit(hypctx);
545 
546 	if (!in_vhe())
547 		hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size,
548 		    VM_PROT_READ | VM_PROT_WRITE);
549 
550 	return (hypctx);
551 }
552 
553 static int
554 arm_vmm_pinit(pmap_t pmap)
555 {
556 
557 	pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels);
558 	return (1);
559 }
560 
561 struct vmspace *
562 vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max)
563 {
564 	return (vmspace_alloc(min, max, arm_vmm_pinit));
565 }
566 
567 void
568 vmmops_vmspace_free(struct vmspace *vmspace)
569 {
570 
571 	pmap_remove_pages(vmspace_pmap(vmspace));
572 	vmspace_free(vmspace);
573 }
574 
/*
 * Print the EL2 syndrome, fault address and exception return address held
 * in vme.  Called from the exit handlers for exits they cannot decode.
 */
static inline void
arm64_print_hyp_regs(struct vm_exit *vme)
{
	printf("esr_el2:   0x%016lx\n", vme->u.hyp.esr_el2);
	printf("far_el2:   0x%016lx\n", vme->u.hyp.far_el2);
	printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2);
	/* The guest pc stored in the exit is reported as elr_el2. */
	printf("elr_el2:   0x%016lx\n", vme->pc);
}
583 
584 static void
585 arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss,
586     struct vm_exit *vme_ret)
587 {
588 	struct vm_guest_paging *paging;
589 	struct vie *vie;
590 	uint32_t esr_sas, reg_num;
591 
592 	/*
593 	 * Get the page address from HPFAR_EL2.
594 	 */
595 	vme_ret->u.inst_emul.gpa =
596 	    HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
597 	/* Bits [11:0] are the same as bits [11:0] from the virtual address. */
598 	vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 &
599 	    FAR_EL2_HPFAR_PAGE_MASK;
600 
601 	esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT;
602 	reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT;
603 
604 	vie = &vme_ret->u.inst_emul.vie;
605 	vie->access_size = 1 << esr_sas;
606 	vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0;
607 	vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ;
608 	vie->reg = reg_num;
609 
610 	paging = &vme_ret->u.inst_emul.paging;
611 	paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
612 	paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
613 	paging->tcr_el1 = hypctx->tcr_el1;
614 	paging->tcr2_el1 = hypctx->tcr2_el1;
615 	paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
616 	if ((hypctx->sctlr_el1 & SCTLR_M) != 0)
617 		paging->flags |= VM_GP_MMU_ENABLED;
618 }
619 
620 static void
621 arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret)
622 {
623 	uint32_t reg_num;
624 	struct vre *vre;
625 
626 	/* u.hyp member will be replaced by u.reg_emul */
627 	vre = &vme_ret->u.reg_emul.vre;
628 
629 	vre->inst_syndrome = esr_iss;
630 	/* ARMv8 Architecture Manual, p. D7-2273: 1 means read */
631 	vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE;
632 	reg_num = ISS_MSR_Rt(esr_iss);
633 	vre->reg = reg_num;
634 }
635 
636 void
637 raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc)
638 {
639 	uint64_t esr;
640 
641 	if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t)
642 		esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT;
643 	else
644 		esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT;
645 	/* Set the bit that changes from insn -> data abort */
646 	if (dabort)
647 		esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT;
648 	/* Set the IL bit if set by hardware */
649 	esr |= hypctx->tf.tf_esr & ESR_ELx_IL;
650 
651 	vmmops_exception(hypctx, esr | fsc, far);
652 }
653 
654 static int
655 handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret,
656     pmap_t pmap)
657 {
658 	uint64_t gpa;
659 	uint32_t esr_ec, esr_iss;
660 
661 	esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr);
662 	esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK;
663 
664 	switch (esr_ec) {
665 	case EXCP_UNKNOWN:
666 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1);
667 		arm64_print_hyp_regs(vme_ret);
668 		vme_ret->exitcode = VM_EXITCODE_HYP;
669 		break;
670 	case EXCP_TRAP_WFI_WFE:
671 		if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */
672 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1);
673 			vme_ret->exitcode = VM_EXITCODE_WFI;
674 		} else {
675 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1);
676 			vme_ret->exitcode = VM_EXITCODE_HYP;
677 		}
678 		break;
679 	case EXCP_HVC:
680 		vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1);
681 		vme_ret->exitcode = VM_EXITCODE_HVC;
682 		break;
683 	case EXCP_MSR:
684 		vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1);
685 		arm64_gen_reg_emul_data(esr_iss, vme_ret);
686 		vme_ret->exitcode = VM_EXITCODE_REG_EMUL;
687 		break;
688 	case EXCP_BRK:
689 		vmm_stat_incr(hypctx->vcpu, VMEXIT_BRK, 1);
690 		vme_ret->exitcode = VM_EXITCODE_BRK;
691 		break;
692 	case EXCP_SOFTSTP_EL0:
693 		vmm_stat_incr(hypctx->vcpu, VMEXIT_SS, 1);
694 		vme_ret->exitcode = VM_EXITCODE_SS;
695 		break;
696 	case EXCP_INSN_ABORT_L:
697 	case EXCP_DATA_ABORT_L:
698 		vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ?
699 		    VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1);
700 		switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) {
701 		case ISS_DATA_DFSC_TF_L0:
702 		case ISS_DATA_DFSC_TF_L1:
703 		case ISS_DATA_DFSC_TF_L2:
704 		case ISS_DATA_DFSC_TF_L3:
705 		case ISS_DATA_DFSC_AFF_L1:
706 		case ISS_DATA_DFSC_AFF_L2:
707 		case ISS_DATA_DFSC_AFF_L3:
708 		case ISS_DATA_DFSC_PF_L1:
709 		case ISS_DATA_DFSC_PF_L2:
710 		case ISS_DATA_DFSC_PF_L3:
711 			gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
712 			/* Check the IPA is valid */
713 			if (gpa >= (1ul << vmm_max_ipa_bits)) {
714 				raise_data_insn_abort(hypctx,
715 				    hypctx->exit_info.far_el2,
716 				    esr_ec == EXCP_DATA_ABORT_L,
717 				    ISS_DATA_DFSC_ASF_L0);
718 				vme_ret->inst_length = 0;
719 				return (HANDLED);
720 			}
721 
722 			if (vm_mem_allocated(hypctx->vcpu, gpa)) {
723 				vme_ret->exitcode = VM_EXITCODE_PAGING;
724 				vme_ret->inst_length = 0;
725 				vme_ret->u.paging.esr = hypctx->tf.tf_esr;
726 				vme_ret->u.paging.gpa = gpa;
727 			} else if (esr_ec == EXCP_INSN_ABORT_L) {
728 				/*
729 				 * Raise an external abort. Device memory is
730 				 * not executable
731 				 */
732 				raise_data_insn_abort(hypctx,
733 				    hypctx->exit_info.far_el2, false,
734 				    ISS_DATA_DFSC_EXT);
735 				vme_ret->inst_length = 0;
736 				return (HANDLED);
737 			} else {
738 				arm64_gen_inst_emul_data(hypctx, esr_iss,
739 				    vme_ret);
740 				vme_ret->exitcode = VM_EXITCODE_INST_EMUL;
741 			}
742 			break;
743 		default:
744 			arm64_print_hyp_regs(vme_ret);
745 			vme_ret->exitcode = VM_EXITCODE_HYP;
746 			break;
747 		}
748 
749 		break;
750 
751 	default:
752 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1);
753 		arm64_print_hyp_regs(vme_ret);
754 		vme_ret->exitcode = VM_EXITCODE_HYP;
755 		break;
756 	}
757 
758 	/* We don't don't do any instruction emulation here */
759 	return (UNHANDLED);
760 }
761 
762 static int
763 arm64_handle_world_switch(struct hypctx *hypctx, int excp_type,
764     struct vm_exit *vme, pmap_t pmap)
765 {
766 	int handled;
767 
768 	switch (excp_type) {
769 	case EXCP_TYPE_EL1_SYNC:
770 		/* The exit code will be set by handle_el1_sync_excp(). */
771 		handled = handle_el1_sync_excp(hypctx, vme, pmap);
772 		break;
773 
774 	case EXCP_TYPE_EL1_IRQ:
775 	case EXCP_TYPE_EL1_FIQ:
776 		/* The host kernel will handle IRQs and FIQs. */
777 		vmm_stat_incr(hypctx->vcpu,
778 		    excp_type == EXCP_TYPE_EL1_IRQ ? VMEXIT_IRQ : VMEXIT_FIQ,1);
779 		vme->exitcode = VM_EXITCODE_BOGUS;
780 		handled = UNHANDLED;
781 		break;
782 
783 	case EXCP_TYPE_EL1_ERROR:
784 	case EXCP_TYPE_EL2_SYNC:
785 	case EXCP_TYPE_EL2_IRQ:
786 	case EXCP_TYPE_EL2_FIQ:
787 	case EXCP_TYPE_EL2_ERROR:
788 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1);
789 		vme->exitcode = VM_EXITCODE_BOGUS;
790 		handled = UNHANDLED;
791 		break;
792 
793 	default:
794 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1);
795 		vme->exitcode = VM_EXITCODE_BOGUS;
796 		handled = UNHANDLED;
797 		break;
798 	}
799 
800 	return (handled);
801 }
802 
803 static void
804 ptp_release(void **cookie)
805 {
806 	if (*cookie != NULL) {
807 		vm_gpa_release(*cookie);
808 		*cookie = NULL;
809 	}
810 }
811 
812 static void *
813 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
814 {
815 	void *ptr;
816 
817 	ptp_release(cookie);
818 	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
819 	return (ptr);
820 }
821 
822 /* log2 of the number of bytes in a page table entry */
823 #define	PTE_SHIFT	3
824 int
825 vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla,
826     int prot, uint64_t *gpa, int *is_fault)
827 {
828 	struct hypctx *hypctx;
829 	void *cookie;
830 	uint64_t mask, *ptep, pte, pte_addr;
831 	int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz;
832 	bool is_el0;
833 
834 	/* Check if the MMU is off */
835 	if ((paging->flags & VM_GP_MMU_ENABLED) == 0) {
836 		*is_fault = 0;
837 		*gpa = gla;
838 		return (0);
839 	}
840 
841 	is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t;
842 
843 	if (ADDR_IS_KERNEL(gla)) {
844 		/* If address translation is disabled raise an exception */
845 		if ((paging->tcr_el1 & TCR_EPD1) != 0) {
846 			*is_fault = 1;
847 			return (0);
848 		}
849 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) {
850 			*is_fault = 1;
851 			return (0);
852 		}
853 		pte_addr = paging->ttbr1_addr;
854 		tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT;
855 		/* Clear the top byte if TBI is on */
856 		if ((paging->tcr_el1 & TCR_TBI1) != 0)
857 			gla |= (0xfful << 56);
858 		switch (paging->tcr_el1 & TCR_TG1_MASK) {
859 		case TCR_TG1_4K:
860 			granule_shift = PAGE_SHIFT_4K;
861 			break;
862 		case TCR_TG1_16K:
863 			granule_shift = PAGE_SHIFT_16K;
864 			break;
865 		case TCR_TG1_64K:
866 			granule_shift = PAGE_SHIFT_64K;
867 			break;
868 		default:
869 			*is_fault = 1;
870 			return (EINVAL);
871 		}
872 	} else {
873 		/* If address translation is disabled raise an exception */
874 		if ((paging->tcr_el1 & TCR_EPD0) != 0) {
875 			*is_fault = 1;
876 			return (0);
877 		}
878 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) {
879 			*is_fault = 1;
880 			return (0);
881 		}
882 		pte_addr = paging->ttbr0_addr;
883 		tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT;
884 		/* Clear the top byte if TBI is on */
885 		if ((paging->tcr_el1 & TCR_TBI0) != 0)
886 			gla &= ~(0xfful << 56);
887 		switch (paging->tcr_el1 & TCR_TG0_MASK) {
888 		case TCR_TG0_4K:
889 			granule_shift = PAGE_SHIFT_4K;
890 			break;
891 		case TCR_TG0_16K:
892 			granule_shift = PAGE_SHIFT_16K;
893 			break;
894 		case TCR_TG0_64K:
895 			granule_shift = PAGE_SHIFT_64K;
896 			break;
897 		default:
898 			*is_fault = 1;
899 			return (EINVAL);
900 		}
901 	}
902 
903 	/*
904 	 * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2
905 	 * for larger values.
906 	 */
907 	switch (granule_shift) {
908 	case PAGE_SHIFT_4K:
909 	case PAGE_SHIFT_16K:
910 		/*
911 		 * See "Table D8-11 4KB granule, determining stage 1 initial
912 		 * lookup level" and "Table D8-21 16KB granule, determining
913 		 * stage 1 initial lookup level" from the "Arm Architecture
914 		 * Reference Manual for A-Profile architecture" revision I.a
915 		 * for the minimum and maximum values.
916 		 *
917 		 * TODO: Support less than 16 when FEAT_LPA2 is implemented
918 		 * and TCR_EL1.DS == 1
919 		 * TODO: Support more than 39 when FEAT_TTST is implemented
920 		 */
921 		if (tsz < 16 || tsz > 39) {
922 			*is_fault = 1;
923 			return (EINVAL);
924 		}
925 		break;
926 	case PAGE_SHIFT_64K:
927 	/* TODO: Support 64k granule. It will probably work, but is untested */
928 	default:
929 		*is_fault = 1;
930 		return (EINVAL);
931 	}
932 
933 	/*
934 	 * Calculate the input address bits. These are 64 bit in an address
935 	 * with the top tsz bits being all 0 or all 1.
936 	  */
937 	ia_bits = 64 - tsz;
938 
939 	/*
940 	 * Calculate the number of address bits used in the page table
941 	 * calculation. This is ia_bits minus the bottom granule_shift
942 	 * bits that are passed to the output address.
943 	 */
944 	address_bits = ia_bits - granule_shift;
945 
946 	/*
947 	 * Calculate the number of levels. Each level uses
948 	 * granule_shift - PTE_SHIFT bits of the input address.
949 	 * This is because the table is 1 << granule_shift and each
950 	 * entry is 1 << PTE_SHIFT bytes.
951 	 */
952 	levels = howmany(address_bits, granule_shift - PTE_SHIFT);
953 
954 	/* Mask of the upper unused bits in the virtual address */
955 	gla &= (1ul << ia_bits) - 1;
956 	hypctx = (struct hypctx *)vcpui;
957 	cookie = NULL;
958 	/* TODO: Check if the level supports block descriptors */
959 	for (;levels > 0; levels--) {
960 		int idx;
961 
962 		pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) +
963 		    granule_shift;
964 		idx = (gla >> pte_shift) &
965 		    ((1ul << (granule_shift - PTE_SHIFT)) - 1);
966 		while (idx > PAGE_SIZE / sizeof(pte)) {
967 			idx -= PAGE_SIZE / sizeof(pte);
968 			pte_addr += PAGE_SIZE;
969 		}
970 
971 		ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie);
972 		if (ptep == NULL)
973 			goto error;
974 		pte = ptep[idx];
975 
976 		/* Calculate the level we are looking at */
977 		switch (levels) {
978 		default:
979 			goto fault;
980 		/* TODO: Level -1 when FEAT_LPA2 is implemented */
981 		case 4: /* Level 0 */
982 			if ((pte & ATTR_DESCR_MASK) != L0_TABLE)
983 				goto fault;
984 			/* FALLTHROUGH */
985 		case 3: /* Level 1 */
986 		case 2: /* Level 2 */
987 			switch (pte & ATTR_DESCR_MASK) {
988 			/* Use L1 macro as all levels are the same */
989 			case L1_TABLE:
990 				/* Check if EL0 can access this address space */
991 				if (is_el0 &&
992 				    (pte & TATTR_AP_TABLE_NO_EL0) != 0)
993 					goto fault;
994 				/* Check if the address space is writable */
995 				if ((prot & PROT_WRITE) != 0 &&
996 				    (pte & TATTR_AP_TABLE_RO) != 0)
997 					goto fault;
998 				if ((prot & PROT_EXEC) != 0) {
999 					/* Check the table exec attribute */
1000 					if ((is_el0 &&
1001 					    (pte & TATTR_UXN_TABLE) != 0) ||
1002 					    (!is_el0 &&
1003 					     (pte & TATTR_PXN_TABLE) != 0))
1004 						goto fault;
1005 				}
1006 				pte_addr = pte & ~ATTR_MASK;
1007 				break;
1008 			case L1_BLOCK:
1009 				goto done;
1010 			default:
1011 				goto fault;
1012 			}
1013 			break;
1014 		case 1: /* Level 3 */
1015 			if ((pte & ATTR_DESCR_MASK) == L3_PAGE)
1016 				goto done;
1017 			goto fault;
1018 		}
1019 	}
1020 
1021 done:
1022 	/* Check if EL0 has access to the block/page */
1023 	if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0)
1024 		goto fault;
1025 	if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0)
1026 		goto fault;
1027 	if ((prot & PROT_EXEC) != 0) {
1028 		if ((is_el0 && (pte & ATTR_S1_UXN) != 0) ||
1029 		    (!is_el0 && (pte & ATTR_S1_PXN) != 0))
1030 			goto fault;
1031 	}
1032 	mask = (1ul << pte_shift) - 1;
1033 	*gpa = (pte & ~ATTR_MASK) | (gla & mask);
1034 	*is_fault = 0;
1035 	ptp_release(&cookie);
1036 	return (0);
1037 
1038 error:
1039 	ptp_release(&cookie);
1040 	return (EFAULT);
1041 fault:
1042 	*is_fault = 1;
1043 	ptp_release(&cookie);
1044 	return (0);
1045 }
1046 
1047 int
1048 vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo)
1049 {
1050 	uint64_t excp_type;
1051 	int handled;
1052 	register_t daif;
1053 	struct hyp *hyp;
1054 	struct hypctx *hypctx;
1055 	struct vcpu *vcpu;
1056 	struct vm_exit *vme;
1057 	int mode;
1058 
1059 	hypctx = (struct hypctx *)vcpui;
1060 	hyp = hypctx->hyp;
1061 	vcpu = hypctx->vcpu;
1062 	vme = vm_exitinfo(vcpu);
1063 
1064 	hypctx->tf.tf_elr = (uint64_t)pc;
1065 
1066 	for (;;) {
1067 		if (hypctx->has_exception) {
1068 			hypctx->has_exception = false;
1069 			hypctx->elr_el1 = hypctx->tf.tf_elr;
1070 
1071 			mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
1072 
1073 			if (mode == PSR_M_EL1t) {
1074 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0;
1075 			} else if (mode == PSR_M_EL1h) {
1076 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200;
1077 			} else if ((mode & PSR_M_32) == PSR_M_64) {
1078 				/* 64-bit EL0 */
1079 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400;
1080 			} else {
1081 				/* 32-bit EL0 */
1082 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600;
1083 			}
1084 
1085 			/* Set the new spsr */
1086 			hypctx->spsr_el1 = hypctx->tf.tf_spsr;
1087 
1088 			/* Set the new cpsr */
1089 			hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS;
1090 			hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h;
1091 
1092 			/*
1093 			 * Update fields that may change on exeption entry
1094 			 * based on how sctlr_el1 is configured.
1095 			 */
1096 			if ((hypctx->sctlr_el1 & SCTLR_SPAN) == 0)
1097 				hypctx->tf.tf_spsr |= PSR_PAN;
1098 			if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0)
1099 				hypctx->tf.tf_spsr &= ~PSR_SSBS;
1100 			else
1101 				hypctx->tf.tf_spsr |= PSR_SSBS;
1102 		}
1103 
1104 		daif = intr_disable();
1105 
1106 		/* Check if the vcpu is suspended */
1107 		if (vcpu_suspended(evinfo)) {
1108 			intr_restore(daif);
1109 			vm_exit_suspended(vcpu, pc);
1110 			break;
1111 		}
1112 
1113 		if (vcpu_debugged(vcpu)) {
1114 			intr_restore(daif);
1115 			vm_exit_debug(vcpu, pc);
1116 			break;
1117 		}
1118 
1119 		/* Activate the stage2 pmap so the vmid is valid */
1120 		pmap_activate_vm(pmap);
1121 		hyp->vttbr_el2 = pmap_to_ttbr0(pmap);
1122 
1123 		/*
1124 		 * TODO: What happens if a timer interrupt is asserted exactly
1125 		 * here, but for the previous VM?
1126 		 */
1127 		arm64_set_active_vcpu(hypctx);
1128 		vgic_flush_hwstate(hypctx);
1129 
1130 		/* Call into EL2 to switch to the guest */
1131 		excp_type = vmm_enter_guest(hyp, hypctx);
1132 
1133 		vgic_sync_hwstate(hypctx);
1134 		vtimer_sync_hwstate(hypctx);
1135 
1136 		/*
1137 		 * Deactivate the stage2 pmap.
1138 		 */
1139 		PCPU_SET(curvmpmap, NULL);
1140 		intr_restore(daif);
1141 
1142 		vmm_stat_incr(vcpu, VMEXIT_COUNT, 1);
1143 		if (excp_type == EXCP_TYPE_MAINT_IRQ)
1144 			continue;
1145 
1146 		vme->pc = hypctx->tf.tf_elr;
1147 		vme->inst_length = INSN_SIZE;
1148 		vme->u.hyp.exception_nr = excp_type;
1149 		vme->u.hyp.esr_el2 = hypctx->tf.tf_esr;
1150 		vme->u.hyp.far_el2 = hypctx->exit_info.far_el2;
1151 		vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2;
1152 
1153 		handled = arm64_handle_world_switch(hypctx, excp_type, vme,
1154 		    pmap);
1155 		if (handled == UNHANDLED)
1156 			/* Exit loop to emulate instruction. */
1157 			break;
1158 		else
1159 			/* Resume guest execution from the next instruction. */
1160 			hypctx->tf.tf_elr += vme->inst_length;
1161 	}
1162 
1163 	return (0);
1164 }
1165 
1166 static void
1167 arm_pcpu_vmcleanup(void *arg)
1168 {
1169 	struct hyp *hyp;
1170 	int i, maxcpus;
1171 
1172 	hyp = arg;
1173 	maxcpus = vm_get_maxcpus(hyp->vm);
1174 	for (i = 0; i < maxcpus; i++) {
1175 		if (arm64_get_active_vcpu() == hyp->ctx[i]) {
1176 			arm64_set_active_vcpu(NULL);
1177 			break;
1178 		}
1179 	}
1180 }
1181 
1182 void
1183 vmmops_vcpu_cleanup(void *vcpui)
1184 {
1185 	struct hypctx *hypctx = vcpui;
1186 
1187 	vtimer_cpucleanup(hypctx);
1188 	vgic_cpucleanup(hypctx);
1189 
1190 	if (!in_vhe())
1191 		vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true);
1192 
1193 	free(hypctx, M_HYP);
1194 }
1195 
1196 void
1197 vmmops_cleanup(void *vmi)
1198 {
1199 	struct hyp *hyp = vmi;
1200 
1201 	vtimer_vmcleanup(hyp);
1202 	vgic_vmcleanup(hyp);
1203 
1204 	smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp);
1205 
1206 	if (!in_vhe())
1207 		vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true);
1208 
1209 	free(hyp, M_HYP);
1210 }
1211 
1212 /*
1213  * Return register value. Registers have different sizes and an explicit cast
1214  * must be made to ensure proper conversion.
1215  */
1216 static uint64_t *
1217 hypctx_regptr(struct hypctx *hypctx, int reg)
1218 {
1219 	switch (reg) {
1220 	case VM_REG_GUEST_X0 ... VM_REG_GUEST_X29:
1221 		return (&hypctx->tf.tf_x[reg]);
1222 	case VM_REG_GUEST_LR:
1223 		return (&hypctx->tf.tf_lr);
1224 	case VM_REG_GUEST_SP:
1225 		return (&hypctx->tf.tf_sp);
1226 	case VM_REG_GUEST_CPSR:
1227 		return (&hypctx->tf.tf_spsr);
1228 	case VM_REG_GUEST_PC:
1229 		return (&hypctx->tf.tf_elr);
1230 	case VM_REG_GUEST_SCTLR_EL1:
1231 		return (&hypctx->sctlr_el1);
1232 	case VM_REG_GUEST_TTBR0_EL1:
1233 		return (&hypctx->ttbr0_el1);
1234 	case VM_REG_GUEST_TTBR1_EL1:
1235 		return (&hypctx->ttbr1_el1);
1236 	case VM_REG_GUEST_TCR_EL1:
1237 		return (&hypctx->tcr_el1);
1238 	case VM_REG_GUEST_TCR2_EL1:
1239 		return (&hypctx->tcr2_el1);
1240 	default:
1241 		break;
1242 	}
1243 	return (NULL);
1244 }
1245 
1246 int
1247 vmmops_getreg(void *vcpui, int reg, uint64_t *retval)
1248 {
1249 	uint64_t *regp;
1250 	int running, hostcpu;
1251 	struct hypctx *hypctx = vcpui;
1252 
1253 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1254 	if (running && hostcpu != curcpu)
1255 		panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm),
1256 		    vcpu_vcpuid(hypctx->vcpu));
1257 
1258 	regp = hypctx_regptr(hypctx, reg);
1259 	if (regp == NULL)
1260 		return (EINVAL);
1261 
1262 	*retval = *regp;
1263 	return (0);
1264 }
1265 
1266 int
1267 vmmops_setreg(void *vcpui, int reg, uint64_t val)
1268 {
1269 	uint64_t *regp;
1270 	struct hypctx *hypctx = vcpui;
1271 	int running, hostcpu;
1272 
1273 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1274 	if (running && hostcpu != curcpu)
1275 		panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm),
1276 		    vcpu_vcpuid(hypctx->vcpu));
1277 
1278 	regp = hypctx_regptr(hypctx, reg);
1279 	if (regp == NULL)
1280 		return (EINVAL);
1281 
1282 	*regp = val;
1283 	return (0);
1284 }
1285 
1286 int
1287 vmmops_exception(void *vcpui, uint64_t esr, uint64_t far)
1288 {
1289 	struct hypctx *hypctx = vcpui;
1290 	int running, hostcpu;
1291 
1292 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1293 	if (running && hostcpu != curcpu)
1294 		panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm),
1295 		    vcpu_vcpuid(hypctx->vcpu));
1296 
1297 	hypctx->far_el1 = far;
1298 	hypctx->esr_el1 = esr;
1299 	hypctx->has_exception = true;
1300 
1301 	return (0);
1302 }
1303 
1304 int
1305 vmmops_getcap(void *vcpui, int num, int *retval)
1306 {
1307 	struct hypctx *hypctx = vcpui;
1308 	int ret;
1309 
1310 	ret = ENOENT;
1311 
1312 	switch (num) {
1313 	case VM_CAP_UNRESTRICTED_GUEST:
1314 		*retval = 1;
1315 		ret = 0;
1316 		break;
1317 	case VM_CAP_BRK_EXIT:
1318 	case VM_CAP_SS_EXIT:
1319 	case VM_CAP_MASK_HWINTR:
1320 		*retval = (hypctx->setcaps & (1ul << num)) != 0;
1321 		break;
1322 	default:
1323 		break;
1324 	}
1325 
1326 	return (ret);
1327 }
1328 
1329 int
1330 vmmops_setcap(void *vcpui, int num, int val)
1331 {
1332 	struct hypctx *hypctx = vcpui;
1333 	int ret;
1334 
1335 	ret = 0;
1336 
1337 	switch (num) {
1338 	case VM_CAP_BRK_EXIT:
1339 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1340 			break;
1341 		if (val != 0)
1342 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1343 		else
1344 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1345 		break;
1346 	case VM_CAP_SS_EXIT:
1347 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1348 			break;
1349 
1350 		if (val != 0) {
1351 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr & PSR_SS);
1352 			hypctx->debug_mdscr |= hypctx->mdscr_el1 &
1353 			    (MDSCR_SS | MDSCR_KDE);
1354 
1355 			hypctx->tf.tf_spsr |= PSR_SS;
1356 			hypctx->mdscr_el1 |= MDSCR_SS | MDSCR_KDE;
1357 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1358 		} else {
1359 			hypctx->tf.tf_spsr &= ~PSR_SS;
1360 			hypctx->tf.tf_spsr |= hypctx->debug_spsr;
1361 			hypctx->debug_spsr &= ~PSR_SS;
1362 			hypctx->mdscr_el1 &= ~(MDSCR_SS | MDSCR_KDE);
1363 			hypctx->mdscr_el1 |= hypctx->debug_mdscr;
1364 			hypctx->debug_mdscr &= ~(MDSCR_SS | MDSCR_KDE);
1365 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1366 		}
1367 		break;
1368 	case VM_CAP_MASK_HWINTR:
1369 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1370 			break;
1371 
1372 		if (val != 0) {
1373 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr &
1374 			    (PSR_I | PSR_F));
1375 			hypctx->tf.tf_spsr |= PSR_I | PSR_F;
1376 		} else {
1377 			hypctx->tf.tf_spsr &= ~(PSR_I | PSR_F);
1378 			hypctx->tf.tf_spsr |= (hypctx->debug_spsr &
1379 			    (PSR_I | PSR_F));
1380 			hypctx->debug_spsr &= ~(PSR_I | PSR_F);
1381 		}
1382 		break;
1383 	default:
1384 		ret = ENOENT;
1385 		break;
1386 	}
1387 
1388 	if (ret == 0) {
1389 		if (val == 0)
1390 			hypctx->setcaps &= ~(1ul << num);
1391 		else
1392 			hypctx->setcaps |= (1ul << num);
1393 	}
1394 
1395 	return (ret);
1396 }
1397