xref: /freebsd/sys/arm64/vmm/vmm_arm64.c (revision c76c2a19ae3763d17aa6a60a5831ed24cbc16e83)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/smp.h>
33 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/mman.h>
36 #include <sys/pcpu.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/vmem.h>
42 
43 #include <vm/vm.h>
44 #include <vm/pmap.h>
45 #include <vm/vm_extern.h>
46 #include <vm/vm_map.h>
47 #include <vm/vm_page.h>
48 #include <vm/vm_param.h>
49 
50 #include <machine/armreg.h>
51 #include <machine/vm.h>
52 #include <machine/cpufunc.h>
53 #include <machine/cpu.h>
54 #include <machine/machdep.h>
55 #include <machine/vmm.h>
56 #include <machine/vmm_dev.h>
57 #include <machine/atomic.h>
58 #include <machine/hypervisor.h>
59 #include <machine/pmap.h>
60 
61 #include <dev/vmm/vmm_mem.h>
62 
63 #include "mmu.h"
64 #include "arm64.h"
65 #include "hyp.h"
66 #include "reset.h"
67 #include "io/vgic.h"
68 #include "io/vgic_v3.h"
69 #include "io/vtimer.h"
70 #include "vmm_handlers.h"
71 #include "vmm_stat.h"
72 
/*
 * Values returned by the guest exit handlers in this file
 * (e.g. handle_el1_sync_excp() and arm64_handle_world_switch()).
 */
#define	HANDLED		1
#define	UNHANDLED	0

/* Number of bits in an EL2 virtual address */
#define	EL2_VIRT_BITS	48
/* The EL2 address space must be able to cover HYP_VM_MAX_ADDRESS. */
CTASSERT((1ul << EL2_VIRT_BITS) >= HYP_VM_MAX_ADDRESS);

/* TODO: Move the host hypctx off the stack */
/* Size of each per-CPU hypervisor stack allocated in vmmops_modinit(). */
#define	VMM_STACK_PAGES	4
#define	VMM_STACK_SIZE	(VMM_STACK_PAGES * PAGE_SIZE)

/*
 * Set in vmmops_modinit():
 *  vmm_pmap_levels  - number of stage 2 page table levels, passed to
 *                     pmap_pinit_stage() and used to pick VTCR_EL2.SL0
 *  vmm_virt_bits    - stage 2 input address size used for VTCR_EL2.T0SZ
 *  vmm_max_ipa_bits - IPA size derived from the CPU's PA range, used to
 *                     reject out-of-range faulting addresses
 */
static int vmm_pmap_levels, vmm_virt_bits, vmm_max_ipa_bits;
85 
/*
 * Register values passed to arm_setup_vectors to set in the hypervisor.
 *
 * The structure is filled in by vmmops_modinit() and handed to every CPU
 * through smp_rendezvous().  tcr_el2 is only assigned, and only consumed,
 * when not running with VHE.
 */
struct vmm_init_regs {
	uint64_t tcr_el2;	/* EL2 translation control register value */
	uint64_t vtcr_el2;	/* Stage 2 translation control register value */
};
91 
/* malloc(9) type used for the allocations made by this file. */
MALLOC_DEFINE(M_HYP, "ARM VMM HYP", "ARM VMM HYP");

/* Exception vector tables; defined outside this file. */
extern char hyp_init_vectors[];
extern char hyp_vectors[];
extern char hyp_stub_vectors[];

/*
 * Physical base address and page-rounded length of the identity-mapped
 * hypervisor code.  Set in vmmops_modinit(), used by vmmops_modcleanup()
 * to remove the mapping.
 */
static vm_paddr_t hyp_code_base;
static size_t hyp_code_len;

/*
 * Per-CPU hypervisor stacks: the kernel allocation and the EL2 virtual
 * address it has been mapped at (non-VHE only).
 */
static char *stack[MAXCPU];
static vm_offset_t stack_hyp_va[MAXCPU];

/* Allocator for EL2 virtual address space, created in vmmops_modinit(). */
static vmem_t *el2_mem_alloc;

static void arm_setup_vectors(void *arg);

/* Per-CPU hypctx pointer maintained by arm64_set_active_vcpu(). */
DPCPU_DEFINE_STATIC(struct hypctx *, vcpu);
109 
110 static inline void
arm64_set_active_vcpu(struct hypctx * hypctx)111 arm64_set_active_vcpu(struct hypctx *hypctx)
112 {
113 	DPCPU_SET(vcpu, hypctx);
114 }
115 
116 struct hypctx *
arm64_get_active_vcpu(void)117 arm64_get_active_vcpu(void)
118 {
119 	return (DPCPU_GET(vcpu));
120 }
121 
122 static void
arm_setup_vectors(void * arg)123 arm_setup_vectors(void *arg)
124 {
125 	struct vmm_init_regs *el2_regs;
126 	uintptr_t stack_top;
127 	uint32_t sctlr_el2;
128 	register_t daif;
129 
130 	el2_regs = arg;
131 	arm64_set_active_vcpu(NULL);
132 
133 	/*
134 	 * Configure the system control register for EL2:
135 	 *
136 	 * SCTLR_EL2_M: MMU on
137 	 * SCTLR_EL2_C: Data cacheability not affected
138 	 * SCTLR_EL2_I: Instruction cacheability not affected
139 	 * SCTLR_EL2_A: Instruction alignment check
140 	 * SCTLR_EL2_SA: Stack pointer alignment check
141 	 * SCTLR_EL2_WXN: Treat writable memory as execute never
142 	 * ~SCTLR_EL2_EE: Data accesses are little-endian
143 	 */
144 	sctlr_el2 = SCTLR_EL2_RES1;
145 	sctlr_el2 |= SCTLR_EL2_M | SCTLR_EL2_C | SCTLR_EL2_I;
146 	sctlr_el2 |= SCTLR_EL2_A | SCTLR_EL2_SA;
147 	sctlr_el2 |= SCTLR_EL2_WXN;
148 	sctlr_el2 &= ~SCTLR_EL2_EE;
149 
150 	daif = intr_disable();
151 
152 	if (in_vhe()) {
153 		WRITE_SPECIALREG(vtcr_el2, el2_regs->vtcr_el2);
154 	} else {
155 		/*
156 		 * Install the temporary vectors which will be responsible for
157 		 * initializing the VMM when we next trap into EL2.
158 		 *
159 		 * x0: the exception vector table responsible for hypervisor
160 		 * initialization on the next call.
161 		 */
162 		vmm_call_hyp(vtophys(&vmm_hyp_code));
163 
164 		/* Create and map the hypervisor stack */
165 		stack_top = stack_hyp_va[PCPU_GET(cpuid)] + VMM_STACK_SIZE;
166 
167 		/* Special call to initialize EL2 */
168 		vmm_call_hyp(vmmpmap_to_ttbr0(), stack_top, el2_regs->tcr_el2,
169 		    sctlr_el2, el2_regs->vtcr_el2);
170 	}
171 
172 	intr_restore(daif);
173 }
174 
175 static void
arm_teardown_vectors(void * arg)176 arm_teardown_vectors(void *arg)
177 {
178 	register_t daif;
179 
180 	/*
181 	 * vmm_cleanup() will disable the MMU. For the next few instructions,
182 	 * before the hardware disables the MMU, one of the following is
183 	 * possible:
184 	 *
185 	 * a. The instruction addresses are fetched with the MMU disabled,
186 	 * and they must represent the actual physical addresses. This will work
187 	 * because we call the vmm_cleanup() function by its physical address.
188 	 *
189 	 * b. The instruction addresses are fetched using the old translation
190 	 * tables. This will work because we have an identity mapping in place
191 	 * in the translation tables and vmm_cleanup() is called by its physical
192 	 * address.
193 	 */
194 	daif = intr_disable();
195 	/* TODO: Invalidate the cache */
196 	vmm_call_hyp(HYP_CLEANUP, vtophys(hyp_stub_vectors));
197 	intr_restore(daif);
198 
199 	arm64_set_active_vcpu(NULL);
200 }
201 
202 static uint64_t
vmm_vtcr_el2_sl(u_int levels)203 vmm_vtcr_el2_sl(u_int levels)
204 {
205 #if PAGE_SIZE == PAGE_SIZE_4K
206 	switch (levels) {
207 	case 2:
208 		return (VTCR_EL2_SL0_4K_LVL2);
209 	case 3:
210 		return (VTCR_EL2_SL0_4K_LVL1);
211 	case 4:
212 		return (VTCR_EL2_SL0_4K_LVL0);
213 	default:
214 		panic("%s: Invalid number of page table levels %u", __func__,
215 		    levels);
216 	}
217 #elif PAGE_SIZE == PAGE_SIZE_16K
218 	switch (levels) {
219 	case 2:
220 		return (VTCR_EL2_SL0_16K_LVL2);
221 	case 3:
222 		return (VTCR_EL2_SL0_16K_LVL1);
223 	case 4:
224 		return (VTCR_EL2_SL0_16K_LVL0);
225 	default:
226 		panic("%s: Invalid number of page table levels %u", __func__,
227 		    levels);
228 	}
229 #else
230 #error Unsupported page size
231 #endif
232 }
233 
/*
 * Module initialisation for the arm64 vmm backend.
 *
 * Checks that the CPU and interrupt controller can support guests, picks
 * the stage 2 page table geometry from the CPU's physical address range,
 * and, when not running with VHE, builds the EL2 address space: an identity
 * mapping of the hypervisor code, one stack per CPU and a vmem arena for
 * later allocations.  The EL2 register values are then installed on every
 * CPU with arm_setup_vectors() before the vgic and vtimer are initialised.
 *
 * ipinum is not used by this function.
 *
 * Returns 0 on success, ENXIO when virtualization is unavailable, the ID
 * register cannot be read or the physical address range is too small,
 * ENODEV when no vgic is present, and ENOMEM when the EL2 MMU cannot be
 * initialised.
 */
int
vmmops_modinit(int ipinum)
{
	struct vmm_init_regs el2_regs;
	vm_offset_t next_hyp_va;
	vm_paddr_t vmm_base;
	uint64_t id_aa64mmfr0_el1, pa_range_bits, pa_range_field;
	uint64_t cnthctl_el2;
	int cpu, i;
	bool rv __diagused;

	if (!has_hyp()) {
		printf(
		    "vmm: Processor doesn't have support for virtualization\n");
		return (ENXIO);
	}

	if (!vgic_present()) {
		printf("vmm: No vgic found\n");
		return (ENODEV);
	}

	if (!get_kernel_reg(ID_AA64MMFR0_EL1, &id_aa64mmfr0_el1)) {
		printf("vmm: Unable to read ID_AA64MMFR0_EL1\n");
		return (ENXIO);
	}
	pa_range_field = ID_AA64MMFR0_PARange_VAL(id_aa64mmfr0_el1);
	/*
	 * Use 3 levels to give us up to 39 bits with 4k pages, or
	 * 47 bits with 16k pages.
	 */
	/* TODO: Check the number of levels for 64k pages */
	vmm_pmap_levels = 3;
	switch (pa_range_field) {
	case ID_AA64MMFR0_PARange_4G:
		printf("vmm: Not enough physical address bits\n");
		return (ENXIO);
	case ID_AA64MMFR0_PARange_64G:
		vmm_virt_bits = 36;
#if PAGE_SIZE == PAGE_SIZE_16K
		vmm_pmap_levels = 2;
#endif
		break;
	default:
		vmm_virt_bits = 39;
		break;
	}
	/* The raw PARange encoding, reused below for the PS register fields */
	pa_range_bits = pa_range_field >> ID_AA64MMFR0_PARange_SHIFT;

	if (!in_vhe()) {
		/* Initialise the EL2 MMU */
		if (!vmmpmap_init()) {
			printf("vmm: Failed to init the EL2 MMU\n");
			return (ENOMEM);
		}
	}

	/* Set up the stage 2 pmap callbacks */
	MPASS(pmap_clean_stage2_tlbi == NULL);
	pmap_clean_stage2_tlbi = vmm_clean_s2_tlbi;
	pmap_stage2_invalidate_range = vmm_s2_tlbi_range;
	pmap_stage2_invalidate_all = vmm_s2_tlbi_all;

	if (!in_vhe()) {
		/*
		 * Create an allocator for the virtual address space used by
		 * EL2. EL2 code is identity-mapped; the allocator is used to
		 * find space for VM structures.
		 */
		el2_mem_alloc = vmem_create("VMM EL2", 0, 0, PAGE_SIZE, 0,
		    M_WAITOK);

		/* Create the mappings for the hypervisor translation table. */
		hyp_code_len = round_page(&vmm_hyp_code_end - &vmm_hyp_code);

		/*
		 * We need a physical identity mapping for when we activate
		 * the MMU.
		 */
		hyp_code_base = vmm_base = vtophys(&vmm_hyp_code);
		rv = vmmpmap_enter(vmm_base, hyp_code_len, vmm_base,
		    VM_PROT_READ | VM_PROT_EXECUTE);
		MPASS(rv);

		/* The stacks are placed after the code, one L2 block each */
		next_hyp_va = roundup2(vmm_base + hyp_code_len, L2_SIZE);

		/* Create a per-CPU hypervisor stack */
		CPU_FOREACH(cpu) {
			stack[cpu] = malloc(VMM_STACK_SIZE, M_HYP, M_WAITOK | M_ZERO);
			stack_hyp_va[cpu] = next_hyp_va;

			/*
			 * Map the stack one page at a time; each page's
			 * physical address is looked up separately.
			 */
			for (i = 0; i < VMM_STACK_PAGES; i++) {
				rv = vmmpmap_enter(stack_hyp_va[cpu] + ptoa(i),
				    PAGE_SIZE, vtophys(stack[cpu] + ptoa(i)),
				    VM_PROT_READ | VM_PROT_WRITE);
				MPASS(rv);
			}
			next_hyp_va += L2_SIZE;
		}

		/* TCR_EL2 is only built, and only used, without VHE */
		el2_regs.tcr_el2 = TCR_EL2_RES1;
		el2_regs.tcr_el2 |= min(pa_range_bits << TCR_EL2_PS_SHIFT,
		    TCR_EL2_PS_52BITS);
		el2_regs.tcr_el2 |= TCR_EL2_T0SZ(64 - EL2_VIRT_BITS);
		el2_regs.tcr_el2 |= TCR_EL2_IRGN0_WBWA | TCR_EL2_ORGN0_WBWA;
#if PAGE_SIZE == PAGE_SIZE_4K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
		el2_regs.tcr_el2 |= TCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
		el2_regs.tcr_el2 |= TCR_EL2_SH0_IS;
#endif
	}

	/* Record the largest IPA size the CPU's physical range allows */
	switch (pa_range_bits << TCR_EL2_PS_SHIFT) {
	case TCR_EL2_PS_32BITS:
		vmm_max_ipa_bits = 32;
		break;
	case TCR_EL2_PS_36BITS:
		vmm_max_ipa_bits = 36;
		break;
	case TCR_EL2_PS_40BITS:
		vmm_max_ipa_bits = 40;
		break;
	case TCR_EL2_PS_42BITS:
		vmm_max_ipa_bits = 42;
		break;
	case TCR_EL2_PS_44BITS:
		vmm_max_ipa_bits = 44;
		break;
	case TCR_EL2_PS_48BITS:
		vmm_max_ipa_bits = 48;
		break;
	case TCR_EL2_PS_52BITS:
	default:
		vmm_max_ipa_bits = 52;
		break;
	}

	/*
	 * Configure the Stage 2 translation control register:
	 *
	 * VTCR_IRGN0_WBWA: Translation table walks access inner cacheable
	 * normal memory
	 * VTCR_ORGN0_WBWA: Translation table walks access outer cacheable
	 * normal memory
	 * VTCR_EL2_TG0_4K/16K: Stage 2 uses the same page size as the kernel
	 * VTCR_EL2_SL0_*: Starting level selected by vmm_vtcr_el2_sl() from
	 * vmm_pmap_levels
	 * VTCR_EL2_SH0_IS: Memory associated with Stage 2 walks is inner
	 * shareable
	 */
	el2_regs.vtcr_el2 = VTCR_EL2_RES1;
	el2_regs.vtcr_el2 |= VTCR_EL2_IRGN0_WBWA | VTCR_EL2_ORGN0_WBWA;
	el2_regs.vtcr_el2 |= VTCR_EL2_T0SZ(64 - vmm_virt_bits);
	el2_regs.vtcr_el2 |= vmm_vtcr_el2_sl(vmm_pmap_levels);
#if PAGE_SIZE == PAGE_SIZE_4K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_4K;
#elif PAGE_SIZE == PAGE_SIZE_16K
	el2_regs.vtcr_el2 |= VTCR_EL2_TG0_16K;
#else
#error Unsupported page size
#endif
#ifdef SMP
	el2_regs.vtcr_el2 |= VTCR_EL2_SH0_IS;
#endif
	/*
	 * If FEAT_LPA2 is enabled in the host then we need to enable it here
	 * so the page tables created by pmap.c are correct. The meaning of
	 * the shareability field changes to become address bits when this
	 * is set.
	 */
	if ((READ_SPECIALREG(tcr_el1) & TCR_DS) != 0) {
		el2_regs.vtcr_el2 |= VTCR_EL2_DS;
		el2_regs.vtcr_el2 |=
		    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_52BIT);
	} else {
		el2_regs.vtcr_el2 |=
		    min(pa_range_bits << VTCR_EL2_PS_SHIFT, VTCR_EL2_PS_48BIT);
	}

	/* Install the register values on every CPU */
	smp_rendezvous(NULL, arm_setup_vectors, NULL, &el2_regs);

	if (!in_vhe()) {
		/* Add memory to the vmem allocator (checking there is space) */
		if (vmm_base > (L2_SIZE + PAGE_SIZE)) {
			/*
			 * Ensure there is an L2 block before the vmm code to check
			 * for buffer overflows on earlier data. Include the PAGE_SIZE
			 * of the minimum we can allocate.
			 */
			vmm_base -= L2_SIZE + PAGE_SIZE;
			vmm_base = rounddown2(vmm_base, L2_SIZE);

			/*
			 * Check there is memory before the vmm code to add.
			 *
			 * Reserve the L2 block at address 0 so NULL dereference will
			 * raise an exception.
			 */
			if (vmm_base > L2_SIZE)
				vmem_add(el2_mem_alloc, L2_SIZE, vmm_base - L2_SIZE,
				    M_WAITOK);
		}

		/*
		 * Add the memory after the stacks. There is most of an L2 block
		 * between the last stack and the first allocation so this should
		 * be safe without adding more padding.
		 */
		if (next_hyp_va < HYP_VM_MAX_ADDRESS - PAGE_SIZE)
			vmem_add(el2_mem_alloc, next_hyp_va,
			    HYP_VM_MAX_ADDRESS - next_hyp_va, M_WAITOK);
	}
	/* The current CNTHCTL_EL2 value is handed to the vtimer code */
	cnthctl_el2 = vmm_read_reg(HYP_REG_CNTHCTL);

	vgic_init();
	vtimer_init(cnthctl_el2);

	return (0);
}
454 
455 int
vmmops_modcleanup(void)456 vmmops_modcleanup(void)
457 {
458 	int cpu;
459 
460 	if (!in_vhe()) {
461 		smp_rendezvous(NULL, arm_teardown_vectors, NULL, NULL);
462 
463 		CPU_FOREACH(cpu) {
464 			vmmpmap_remove(stack_hyp_va[cpu],
465 			    VMM_STACK_PAGES * PAGE_SIZE, false);
466 		}
467 
468 		vmmpmap_remove(hyp_code_base, hyp_code_len, false);
469 	}
470 
471 	vtimer_cleanup();
472 
473 	if (!in_vhe()) {
474 		vmmpmap_fini();
475 
476 		CPU_FOREACH(cpu)
477 			free(stack[cpu], M_HYP);
478 	}
479 
480 	pmap_clean_stage2_tlbi = NULL;
481 	pmap_stage2_invalidate_range = NULL;
482 	pmap_stage2_invalidate_all = NULL;
483 
484 	return (0);
485 }
486 
487 static vm_size_t
el2_hyp_size(struct vm * vm)488 el2_hyp_size(struct vm *vm)
489 {
490 	return (round_page(sizeof(struct hyp) +
491 	    sizeof(struct hypctx *) * vm_get_maxcpus(vm)));
492 }
493 
494 static vm_size_t
el2_hypctx_size(void)495 el2_hypctx_size(void)
496 {
497 	return (round_page(sizeof(struct hypctx)));
498 }
499 
500 static vm_offset_t
el2_map_enter(vm_offset_t data,vm_size_t size,vm_prot_t prot)501 el2_map_enter(vm_offset_t data, vm_size_t size, vm_prot_t prot)
502 {
503 	vmem_addr_t addr;
504 	int err __diagused;
505 	bool rv __diagused;
506 
507 	err = vmem_alloc(el2_mem_alloc, size, M_NEXTFIT | M_WAITOK, &addr);
508 	MPASS(err == 0);
509 	rv = vmmpmap_enter(addr, size, vtophys(data), prot);
510 	MPASS(rv);
511 
512 	return (addr);
513 }
514 
515 void *
vmmops_init(struct vm * vm,pmap_t pmap)516 vmmops_init(struct vm *vm, pmap_t pmap)
517 {
518 	struct hyp *hyp;
519 	vm_size_t size;
520 
521 	size = el2_hyp_size(vm);
522 	hyp = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
523 
524 	hyp->vm = vm;
525 	hyp->vgic_attached = false;
526 
527 	vtimer_vminit(hyp);
528 	vgic_vminit(hyp);
529 
530 	if (!in_vhe())
531 		hyp->el2_addr = el2_map_enter((vm_offset_t)hyp, size,
532 		    VM_PROT_READ | VM_PROT_WRITE);
533 
534 	return (hyp);
535 }
536 
537 void *
vmmops_vcpu_init(void * vmi,struct vcpu * vcpu1,int vcpuid)538 vmmops_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
539 {
540 	struct hyp *hyp = vmi;
541 	struct hypctx *hypctx;
542 	vm_size_t size;
543 
544 	size = el2_hypctx_size();
545 	hypctx = malloc_aligned(size, PAGE_SIZE, M_HYP, M_WAITOK | M_ZERO);
546 
547 	KASSERT(vcpuid >= 0 && vcpuid < vm_get_maxcpus(hyp->vm),
548 	    ("%s: Invalid vcpuid %d", __func__, vcpuid));
549 	hyp->ctx[vcpuid] = hypctx;
550 
551 	hypctx->hyp = hyp;
552 	hypctx->vcpu = vcpu1;
553 
554 	reset_vm_el01_regs(hypctx);
555 	reset_vm_el2_regs(hypctx);
556 
557 	vtimer_cpuinit(hypctx);
558 	vgic_cpuinit(hypctx);
559 
560 	if (!in_vhe())
561 		hypctx->el2_addr = el2_map_enter((vm_offset_t)hypctx, size,
562 		    VM_PROT_READ | VM_PROT_WRITE);
563 
564 	return (hypctx);
565 }
566 
567 static int
arm_vmm_pinit(pmap_t pmap)568 arm_vmm_pinit(pmap_t pmap)
569 {
570 
571 	pmap_pinit_stage(pmap, PM_STAGE2, vmm_pmap_levels);
572 	return (1);
573 }
574 
575 struct vmspace *
vmmops_vmspace_alloc(vm_offset_t min,vm_offset_t max)576 vmmops_vmspace_alloc(vm_offset_t min, vm_offset_t max)
577 {
578 	return (vmspace_alloc(min, max, arm_vmm_pinit));
579 }
580 
581 void
vmmops_vmspace_free(struct vmspace * vmspace)582 vmmops_vmspace_free(struct vmspace *vmspace)
583 {
584 
585 	pmap_remove_pages(vmspace_pmap(vmspace));
586 	vmspace_free(vmspace);
587 }
588 
589 static inline void
arm64_print_hyp_regs(struct vm_exit * vme)590 arm64_print_hyp_regs(struct vm_exit *vme)
591 {
592 	printf("esr_el2:   0x%016lx\n", vme->u.hyp.esr_el2);
593 	printf("far_el2:   0x%016lx\n", vme->u.hyp.far_el2);
594 	printf("hpfar_el2: 0x%016lx\n", vme->u.hyp.hpfar_el2);
595 	printf("elr_el2:   0x%016lx\n", vme->pc);
596 }
597 
598 static void
arm64_gen_inst_emul_data(struct hypctx * hypctx,uint32_t esr_iss,struct vm_exit * vme_ret)599 arm64_gen_inst_emul_data(struct hypctx *hypctx, uint32_t esr_iss,
600     struct vm_exit *vme_ret)
601 {
602 	struct vm_guest_paging *paging;
603 	struct vie *vie;
604 	uint32_t esr_sas, reg_num;
605 
606 	/*
607 	 * Get the page address from HPFAR_EL2.
608 	 */
609 	vme_ret->u.inst_emul.gpa =
610 	    HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
611 	/* Bits [11:0] are the same as bits [11:0] from the virtual address. */
612 	vme_ret->u.inst_emul.gpa += hypctx->exit_info.far_el2 &
613 	    FAR_EL2_HPFAR_PAGE_MASK;
614 
615 	esr_sas = (esr_iss & ISS_DATA_SAS_MASK) >> ISS_DATA_SAS_SHIFT;
616 	reg_num = (esr_iss & ISS_DATA_SRT_MASK) >> ISS_DATA_SRT_SHIFT;
617 
618 	vie = &vme_ret->u.inst_emul.vie;
619 	vie->access_size = 1 << esr_sas;
620 	vie->sign_extend = (esr_iss & ISS_DATA_SSE) ? 1 : 0;
621 	vie->dir = (esr_iss & ISS_DATA_WnR) ? VM_DIR_WRITE : VM_DIR_READ;
622 	vie->reg = reg_num;
623 
624 	paging = &vme_ret->u.inst_emul.paging;
625 	paging->ttbr0_addr = hypctx->ttbr0_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
626 	paging->ttbr1_addr = hypctx->ttbr1_el1 & ~(TTBR_ASID_MASK | TTBR_CnP);
627 	paging->tcr_el1 = hypctx->tcr_el1;
628 	paging->tcr2_el1 = hypctx->tcr2_el1;
629 	paging->flags = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
630 	if ((hypctx->sctlr_el1 & SCTLR_M) != 0)
631 		paging->flags |= VM_GP_MMU_ENABLED;
632 }
633 
634 static void
arm64_gen_reg_emul_data(uint32_t esr_iss,struct vm_exit * vme_ret)635 arm64_gen_reg_emul_data(uint32_t esr_iss, struct vm_exit *vme_ret)
636 {
637 	uint32_t reg_num;
638 	struct vre *vre;
639 
640 	/* u.hyp member will be replaced by u.reg_emul */
641 	vre = &vme_ret->u.reg_emul.vre;
642 
643 	vre->inst_syndrome = esr_iss;
644 	/* ARMv8 Architecture Manual, p. D7-2273: 1 means read */
645 	vre->dir = (esr_iss & ISS_MSR_DIR) ? VM_DIR_READ : VM_DIR_WRITE;
646 	reg_num = ISS_MSR_Rt(esr_iss);
647 	vre->reg = reg_num;
648 }
649 
650 void
raise_data_insn_abort(struct hypctx * hypctx,uint64_t far,bool dabort,int fsc)651 raise_data_insn_abort(struct hypctx *hypctx, uint64_t far, bool dabort, int fsc)
652 {
653 	uint64_t esr;
654 
655 	if ((hypctx->tf.tf_spsr & PSR_M_MASK) == PSR_M_EL0t)
656 		esr = EXCP_INSN_ABORT_L << ESR_ELx_EC_SHIFT;
657 	else
658 		esr = EXCP_INSN_ABORT << ESR_ELx_EC_SHIFT;
659 	/* Set the bit that changes from insn -> data abort */
660 	if (dabort)
661 		esr |= EXCP_DATA_ABORT_L << ESR_ELx_EC_SHIFT;
662 	/* Set the IL bit if set by hardware */
663 	esr |= hypctx->tf.tf_esr & ESR_ELx_IL;
664 
665 	vmmops_exception(hypctx, esr | fsc, far);
666 }
667 
668 static int
handle_el1_sync_excp(struct hypctx * hypctx,struct vm_exit * vme_ret,pmap_t pmap)669 handle_el1_sync_excp(struct hypctx *hypctx, struct vm_exit *vme_ret,
670     pmap_t pmap)
671 {
672 	uint64_t gpa;
673 	uint32_t esr_ec, esr_iss;
674 
675 	esr_ec = ESR_ELx_EXCEPTION(hypctx->tf.tf_esr);
676 	esr_iss = hypctx->tf.tf_esr & ESR_ELx_ISS_MASK;
677 
678 	switch (esr_ec) {
679 	case EXCP_UNKNOWN:
680 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNKNOWN, 1);
681 		arm64_print_hyp_regs(vme_ret);
682 		vme_ret->exitcode = VM_EXITCODE_HYP;
683 		break;
684 	case EXCP_TRAP_WFI_WFE:
685 		if ((hypctx->tf.tf_esr & 0x3) == 0) { /* WFI */
686 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFI, 1);
687 			vme_ret->exitcode = VM_EXITCODE_WFI;
688 		} else {
689 			vmm_stat_incr(hypctx->vcpu, VMEXIT_WFE, 1);
690 			vme_ret->exitcode = VM_EXITCODE_HYP;
691 		}
692 		break;
693 	case EXCP_HVC:
694 		vmm_stat_incr(hypctx->vcpu, VMEXIT_HVC, 1);
695 		vme_ret->exitcode = VM_EXITCODE_HVC;
696 		break;
697 	case EXCP_MSR:
698 		vmm_stat_incr(hypctx->vcpu, VMEXIT_MSR, 1);
699 		arm64_gen_reg_emul_data(esr_iss, vme_ret);
700 		vme_ret->exitcode = VM_EXITCODE_REG_EMUL;
701 		break;
702 	case EXCP_BRK:
703 		vmm_stat_incr(hypctx->vcpu, VMEXIT_BRK, 1);
704 		vme_ret->exitcode = VM_EXITCODE_BRK;
705 		break;
706 	case EXCP_SOFTSTP_EL0:
707 		vmm_stat_incr(hypctx->vcpu, VMEXIT_SS, 1);
708 		vme_ret->exitcode = VM_EXITCODE_SS;
709 		break;
710 	case EXCP_INSN_ABORT_L:
711 	case EXCP_DATA_ABORT_L:
712 		vmm_stat_incr(hypctx->vcpu, esr_ec == EXCP_DATA_ABORT_L ?
713 		    VMEXIT_DATA_ABORT : VMEXIT_INSN_ABORT, 1);
714 		switch (hypctx->tf.tf_esr & ISS_DATA_DFSC_MASK) {
715 		case ISS_DATA_DFSC_TF_L0:
716 		case ISS_DATA_DFSC_TF_L1:
717 		case ISS_DATA_DFSC_TF_L2:
718 		case ISS_DATA_DFSC_TF_L3:
719 		case ISS_DATA_DFSC_AFF_L1:
720 		case ISS_DATA_DFSC_AFF_L2:
721 		case ISS_DATA_DFSC_AFF_L3:
722 		case ISS_DATA_DFSC_PF_L1:
723 		case ISS_DATA_DFSC_PF_L2:
724 		case ISS_DATA_DFSC_PF_L3:
725 			gpa = HPFAR_EL2_FIPA_ADDR(hypctx->exit_info.hpfar_el2);
726 			/* Check the IPA is valid */
727 			if (gpa >= (1ul << vmm_max_ipa_bits)) {
728 				raise_data_insn_abort(hypctx,
729 				    hypctx->exit_info.far_el2,
730 				    esr_ec == EXCP_DATA_ABORT_L,
731 				    ISS_DATA_DFSC_ASF_L0);
732 				vme_ret->inst_length = 0;
733 				return (HANDLED);
734 			}
735 
736 			if (vm_mem_allocated(hypctx->vcpu, gpa)) {
737 				vme_ret->exitcode = VM_EXITCODE_PAGING;
738 				vme_ret->inst_length = 0;
739 				vme_ret->u.paging.esr = hypctx->tf.tf_esr;
740 				vme_ret->u.paging.gpa = gpa;
741 			} else if (esr_ec == EXCP_INSN_ABORT_L) {
742 				/*
743 				 * Raise an external abort. Device memory is
744 				 * not executable
745 				 */
746 				raise_data_insn_abort(hypctx,
747 				    hypctx->exit_info.far_el2, false,
748 				    ISS_DATA_DFSC_EXT);
749 				vme_ret->inst_length = 0;
750 				return (HANDLED);
751 			} else {
752 				arm64_gen_inst_emul_data(hypctx, esr_iss,
753 				    vme_ret);
754 				vme_ret->exitcode = VM_EXITCODE_INST_EMUL;
755 			}
756 			break;
757 		default:
758 			arm64_print_hyp_regs(vme_ret);
759 			vme_ret->exitcode = VM_EXITCODE_HYP;
760 			break;
761 		}
762 
763 		break;
764 
765 	default:
766 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_SYNC, 1);
767 		arm64_print_hyp_regs(vme_ret);
768 		vme_ret->exitcode = VM_EXITCODE_HYP;
769 		break;
770 	}
771 
772 	/* We don't don't do any instruction emulation here */
773 	return (UNHANDLED);
774 }
775 
776 static int
arm64_handle_world_switch(struct hypctx * hypctx,int excp_type,struct vm_exit * vme,pmap_t pmap)777 arm64_handle_world_switch(struct hypctx *hypctx, int excp_type,
778     struct vm_exit *vme, pmap_t pmap)
779 {
780 	int handled;
781 
782 	switch (excp_type) {
783 	case EXCP_TYPE_EL1_SYNC:
784 		/* The exit code will be set by handle_el1_sync_excp(). */
785 		handled = handle_el1_sync_excp(hypctx, vme, pmap);
786 		break;
787 
788 	case EXCP_TYPE_EL1_IRQ:
789 	case EXCP_TYPE_EL1_FIQ:
790 		/* The host kernel will handle IRQs and FIQs. */
791 		vmm_stat_incr(hypctx->vcpu,
792 		    excp_type == EXCP_TYPE_EL1_IRQ ? VMEXIT_IRQ : VMEXIT_FIQ,1);
793 		vme->exitcode = VM_EXITCODE_BOGUS;
794 		handled = UNHANDLED;
795 		break;
796 
797 	case EXCP_TYPE_EL1_ERROR:
798 	case EXCP_TYPE_EL2_SYNC:
799 	case EXCP_TYPE_EL2_IRQ:
800 	case EXCP_TYPE_EL2_FIQ:
801 	case EXCP_TYPE_EL2_ERROR:
802 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED_EL2, 1);
803 		vme->exitcode = VM_EXITCODE_BOGUS;
804 		handled = UNHANDLED;
805 		break;
806 
807 	default:
808 		vmm_stat_incr(hypctx->vcpu, VMEXIT_UNHANDLED, 1);
809 		vme->exitcode = VM_EXITCODE_BOGUS;
810 		handled = UNHANDLED;
811 		break;
812 	}
813 
814 	return (handled);
815 }
816 
/*
 * Release the guest page referenced by *cookie, if any, and reset the
 * cookie to NULL so it can be released again safely.
 */
static void
ptp_release(void **cookie)
{

	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
825 
826 static void *
ptp_hold(struct vcpu * vcpu,vm_paddr_t ptpphys,size_t len,void ** cookie)827 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
828 {
829 	void *ptr;
830 
831 	ptp_release(cookie);
832 	ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
833 	return (ptr);
834 }
835 
836 /* log2 of the number of bytes in a page table entry */
837 #define	PTE_SHIFT	3
838 int
vmmops_gla2gpa(void * vcpui,struct vm_guest_paging * paging,uint64_t gla,int prot,uint64_t * gpa,int * is_fault)839 vmmops_gla2gpa(void *vcpui, struct vm_guest_paging *paging, uint64_t gla,
840     int prot, uint64_t *gpa, int *is_fault)
841 {
842 	struct hypctx *hypctx;
843 	void *cookie;
844 	uint64_t mask, *ptep, pte, pte_addr;
845 	int address_bits, granule_shift, ia_bits, levels, pte_shift, tsz;
846 	bool is_el0;
847 
848 	/* Check if the MMU is off */
849 	if ((paging->flags & VM_GP_MMU_ENABLED) == 0) {
850 		*is_fault = 0;
851 		*gpa = gla;
852 		return (0);
853 	}
854 
855 	is_el0 = (paging->flags & PSR_M_MASK) == PSR_M_EL0t;
856 
857 	if (ADDR_IS_KERNEL(gla)) {
858 		/* If address translation is disabled raise an exception */
859 		if ((paging->tcr_el1 & TCR_EPD1) != 0) {
860 			*is_fault = 1;
861 			return (0);
862 		}
863 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD1) != 0) {
864 			*is_fault = 1;
865 			return (0);
866 		}
867 		pte_addr = paging->ttbr1_addr;
868 		tsz = (paging->tcr_el1 & TCR_T1SZ_MASK) >> TCR_T1SZ_SHIFT;
869 		/* Clear the top byte if TBI is on */
870 		if ((paging->tcr_el1 & TCR_TBI1) != 0)
871 			gla |= (0xfful << 56);
872 		switch (paging->tcr_el1 & TCR_TG1_MASK) {
873 		case TCR_TG1_4K:
874 			granule_shift = PAGE_SHIFT_4K;
875 			break;
876 		case TCR_TG1_16K:
877 			granule_shift = PAGE_SHIFT_16K;
878 			break;
879 		case TCR_TG1_64K:
880 			granule_shift = PAGE_SHIFT_64K;
881 			break;
882 		default:
883 			*is_fault = 1;
884 			return (EINVAL);
885 		}
886 	} else {
887 		/* If address translation is disabled raise an exception */
888 		if ((paging->tcr_el1 & TCR_EPD0) != 0) {
889 			*is_fault = 1;
890 			return (0);
891 		}
892 		if (is_el0 && (paging->tcr_el1 & TCR_E0PD0) != 0) {
893 			*is_fault = 1;
894 			return (0);
895 		}
896 		pte_addr = paging->ttbr0_addr;
897 		tsz = (paging->tcr_el1 & TCR_T0SZ_MASK) >> TCR_T0SZ_SHIFT;
898 		/* Clear the top byte if TBI is on */
899 		if ((paging->tcr_el1 & TCR_TBI0) != 0)
900 			gla &= ~(0xfful << 56);
901 		switch (paging->tcr_el1 & TCR_TG0_MASK) {
902 		case TCR_TG0_4K:
903 			granule_shift = PAGE_SHIFT_4K;
904 			break;
905 		case TCR_TG0_16K:
906 			granule_shift = PAGE_SHIFT_16K;
907 			break;
908 		case TCR_TG0_64K:
909 			granule_shift = PAGE_SHIFT_64K;
910 			break;
911 		default:
912 			*is_fault = 1;
913 			return (EINVAL);
914 		}
915 	}
916 
917 	/*
918 	 * TODO: Support FEAT_TTST for smaller tsz values and FEAT_LPA2
919 	 * for larger values.
920 	 */
921 	switch (granule_shift) {
922 	case PAGE_SHIFT_4K:
923 	case PAGE_SHIFT_16K:
924 		/*
925 		 * See "Table D8-11 4KB granule, determining stage 1 initial
926 		 * lookup level" and "Table D8-21 16KB granule, determining
927 		 * stage 1 initial lookup level" from the "Arm Architecture
928 		 * Reference Manual for A-Profile architecture" revision I.a
929 		 * for the minimum and maximum values.
930 		 *
931 		 * TODO: Support less than 16 when FEAT_LPA2 is implemented
932 		 * and TCR_EL1.DS == 1
933 		 * TODO: Support more than 39 when FEAT_TTST is implemented
934 		 */
935 		if (tsz < 16 || tsz > 39) {
936 			*is_fault = 1;
937 			return (EINVAL);
938 		}
939 		break;
940 	case PAGE_SHIFT_64K:
941 	/* TODO: Support 64k granule. It will probably work, but is untested */
942 	default:
943 		*is_fault = 1;
944 		return (EINVAL);
945 	}
946 
947 	/*
948 	 * Calculate the input address bits. These are 64 bit in an address
949 	 * with the top tsz bits being all 0 or all 1.
950 	  */
951 	ia_bits = 64 - tsz;
952 
953 	/*
954 	 * Calculate the number of address bits used in the page table
955 	 * calculation. This is ia_bits minus the bottom granule_shift
956 	 * bits that are passed to the output address.
957 	 */
958 	address_bits = ia_bits - granule_shift;
959 
960 	/*
961 	 * Calculate the number of levels. Each level uses
962 	 * granule_shift - PTE_SHIFT bits of the input address.
963 	 * This is because the table is 1 << granule_shift and each
964 	 * entry is 1 << PTE_SHIFT bytes.
965 	 */
966 	levels = howmany(address_bits, granule_shift - PTE_SHIFT);
967 
968 	/* Mask of the upper unused bits in the virtual address */
969 	gla &= (1ul << ia_bits) - 1;
970 	hypctx = (struct hypctx *)vcpui;
971 	cookie = NULL;
972 	/* TODO: Check if the level supports block descriptors */
973 	for (;levels > 0; levels--) {
974 		int idx;
975 
976 		pte_shift = (levels - 1) * (granule_shift - PTE_SHIFT) +
977 		    granule_shift;
978 		idx = (gla >> pte_shift) &
979 		    ((1ul << (granule_shift - PTE_SHIFT)) - 1);
980 		while (idx > PAGE_SIZE / sizeof(pte)) {
981 			idx -= PAGE_SIZE / sizeof(pte);
982 			pte_addr += PAGE_SIZE;
983 		}
984 
985 		ptep = ptp_hold(hypctx->vcpu, pte_addr, PAGE_SIZE, &cookie);
986 		if (ptep == NULL)
987 			goto error;
988 		pte = ptep[idx];
989 
990 		/* Calculate the level we are looking at */
991 		switch (levels) {
992 		default:
993 			goto fault;
994 		/* TODO: Level -1 when FEAT_LPA2 is implemented */
995 		case 4: /* Level 0 */
996 			if ((pte & ATTR_DESCR_MASK) != L0_TABLE)
997 				goto fault;
998 			/* FALLTHROUGH */
999 		case 3: /* Level 1 */
1000 		case 2: /* Level 2 */
1001 			switch (pte & ATTR_DESCR_MASK) {
1002 			/* Use L1 macro as all levels are the same */
1003 			case L1_TABLE:
1004 				/* Check if EL0 can access this address space */
1005 				if (is_el0 &&
1006 				    (pte & TATTR_AP_TABLE_NO_EL0) != 0)
1007 					goto fault;
1008 				/* Check if the address space is writable */
1009 				if ((prot & PROT_WRITE) != 0 &&
1010 				    (pte & TATTR_AP_TABLE_RO) != 0)
1011 					goto fault;
1012 				if ((prot & PROT_EXEC) != 0) {
1013 					/* Check the table exec attribute */
1014 					if ((is_el0 &&
1015 					    (pte & TATTR_UXN_TABLE) != 0) ||
1016 					    (!is_el0 &&
1017 					     (pte & TATTR_PXN_TABLE) != 0))
1018 						goto fault;
1019 				}
1020 				pte_addr = pte & ~ATTR_MASK;
1021 				break;
1022 			case L1_BLOCK:
1023 				goto done;
1024 			default:
1025 				goto fault;
1026 			}
1027 			break;
1028 		case 1: /* Level 3 */
1029 			if ((pte & ATTR_DESCR_MASK) == L3_PAGE)
1030 				goto done;
1031 			goto fault;
1032 		}
1033 	}
1034 
1035 done:
1036 	/* Check if EL0 has access to the block/page */
1037 	if (is_el0 && (pte & ATTR_S1_AP(ATTR_S1_AP_USER)) == 0)
1038 		goto fault;
1039 	if ((prot & PROT_WRITE) != 0 && (pte & ATTR_S1_AP_RW_BIT) != 0)
1040 		goto fault;
1041 	if ((prot & PROT_EXEC) != 0) {
1042 		if ((is_el0 && (pte & ATTR_S1_UXN) != 0) ||
1043 		    (!is_el0 && (pte & ATTR_S1_PXN) != 0))
1044 			goto fault;
1045 	}
1046 	mask = (1ul << pte_shift) - 1;
1047 	*gpa = (pte & ~ATTR_MASK) | (gla & mask);
1048 	*is_fault = 0;
1049 	ptp_release(&cookie);
1050 	return (0);
1051 
1052 error:
1053 	ptp_release(&cookie);
1054 	return (EFAULT);
1055 fault:
1056 	*is_fault = 1;
1057 	ptp_release(&cookie);
1058 	return (0);
1059 }
1060 
1061 int
vmmops_run(void * vcpui,register_t pc,pmap_t pmap,struct vm_eventinfo * evinfo)1062 vmmops_run(void *vcpui, register_t pc, pmap_t pmap, struct vm_eventinfo *evinfo)
1063 {
1064 	uint64_t excp_type;
1065 	int handled;
1066 	register_t daif;
1067 	struct hyp *hyp;
1068 	struct hypctx *hypctx;
1069 	struct vcpu *vcpu;
1070 	struct vm_exit *vme;
1071 	int mode;
1072 
1073 	hypctx = (struct hypctx *)vcpui;
1074 	hyp = hypctx->hyp;
1075 	vcpu = hypctx->vcpu;
1076 	vme = vm_exitinfo(vcpu);
1077 
1078 	hypctx->tf.tf_elr = (uint64_t)pc;
1079 
1080 	for (;;) {
1081 		if (hypctx->has_exception) {
1082 			hypctx->has_exception = false;
1083 			hypctx->elr_el1 = hypctx->tf.tf_elr;
1084 
1085 			mode = hypctx->tf.tf_spsr & (PSR_M_MASK | PSR_M_32);
1086 
1087 			if (mode == PSR_M_EL1t) {
1088 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x0;
1089 			} else if (mode == PSR_M_EL1h) {
1090 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x200;
1091 			} else if ((mode & PSR_M_32) == PSR_M_64) {
1092 				/* 64-bit EL0 */
1093 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x400;
1094 			} else {
1095 				/* 32-bit EL0 */
1096 				hypctx->tf.tf_elr = hypctx->vbar_el1 + 0x600;
1097 			}
1098 
1099 			/* Set the new spsr */
1100 			hypctx->spsr_el1 = hypctx->tf.tf_spsr;
1101 
1102 			/* Set the new cpsr */
1103 			hypctx->tf.tf_spsr = hypctx->spsr_el1 & PSR_FLAGS;
1104 			hypctx->tf.tf_spsr |= PSR_DAIF | PSR_M_EL1h;
1105 
1106 			/*
1107 			 * Update fields that may change on exeption entry
1108 			 * based on how sctlr_el1 is configured.
1109 			 */
1110 			if ((hypctx->sctlr_el1 & SCTLR_SPAN) == 0)
1111 				hypctx->tf.tf_spsr |= PSR_PAN;
1112 			if ((hypctx->sctlr_el1 & SCTLR_DSSBS) == 0)
1113 				hypctx->tf.tf_spsr &= ~PSR_SSBS;
1114 			else
1115 				hypctx->tf.tf_spsr |= PSR_SSBS;
1116 		}
1117 
1118 		daif = intr_disable();
1119 
1120 		/* Check if the vcpu is suspended */
1121 		if (vcpu_suspended(evinfo)) {
1122 			intr_restore(daif);
1123 			vm_exit_suspended(vcpu, pc);
1124 			break;
1125 		}
1126 
1127 		if (vcpu_debugged(vcpu)) {
1128 			intr_restore(daif);
1129 			vm_exit_debug(vcpu, pc);
1130 			break;
1131 		}
1132 
1133 		/* Activate the stage2 pmap so the vmid is valid */
1134 		pmap_activate_vm(pmap);
1135 		hyp->vttbr_el2 = pmap_to_ttbr0(pmap);
1136 
1137 		/*
1138 		 * TODO: What happens if a timer interrupt is asserted exactly
1139 		 * here, but for the previous VM?
1140 		 */
1141 		arm64_set_active_vcpu(hypctx);
1142 		vgic_flush_hwstate(hypctx);
1143 
1144 		/* Call into EL2 to switch to the guest */
1145 		excp_type = vmm_enter_guest(hyp, hypctx);
1146 
1147 		vgic_sync_hwstate(hypctx);
1148 		vtimer_sync_hwstate(hypctx);
1149 
1150 		/*
1151 		 * Deactivate the stage2 pmap.
1152 		 */
1153 		PCPU_SET(curvmpmap, NULL);
1154 		intr_restore(daif);
1155 
1156 		vmm_stat_incr(vcpu, VMEXIT_COUNT, 1);
1157 		if (excp_type == EXCP_TYPE_MAINT_IRQ)
1158 			continue;
1159 
1160 		vme->pc = hypctx->tf.tf_elr;
1161 		vme->inst_length = INSN_SIZE;
1162 		vme->u.hyp.exception_nr = excp_type;
1163 		vme->u.hyp.esr_el2 = hypctx->tf.tf_esr;
1164 		vme->u.hyp.far_el2 = hypctx->exit_info.far_el2;
1165 		vme->u.hyp.hpfar_el2 = hypctx->exit_info.hpfar_el2;
1166 
1167 		handled = arm64_handle_world_switch(hypctx, excp_type, vme,
1168 		    pmap);
1169 		if (handled == UNHANDLED)
1170 			/* Exit loop to emulate instruction. */
1171 			break;
1172 		else
1173 			/* Resume guest execution from the next instruction. */
1174 			hypctx->tf.tf_elr += vme->inst_length;
1175 	}
1176 
1177 	return (0);
1178 }
1179 
1180 static void
arm_pcpu_vmcleanup(void * arg)1181 arm_pcpu_vmcleanup(void *arg)
1182 {
1183 	struct hyp *hyp;
1184 	int i, maxcpus;
1185 
1186 	hyp = arg;
1187 	maxcpus = vm_get_maxcpus(hyp->vm);
1188 	for (i = 0; i < maxcpus; i++) {
1189 		if (arm64_get_active_vcpu() == hyp->ctx[i]) {
1190 			arm64_set_active_vcpu(NULL);
1191 			break;
1192 		}
1193 	}
1194 }
1195 
1196 void
vmmops_vcpu_cleanup(void * vcpui)1197 vmmops_vcpu_cleanup(void *vcpui)
1198 {
1199 	struct hypctx *hypctx = vcpui;
1200 
1201 	vtimer_cpucleanup(hypctx);
1202 	vgic_cpucleanup(hypctx);
1203 
1204 	if (!in_vhe())
1205 		vmmpmap_remove(hypctx->el2_addr, el2_hypctx_size(), true);
1206 
1207 	free(hypctx, M_HYP);
1208 }
1209 
1210 void
vmmops_cleanup(void * vmi)1211 vmmops_cleanup(void *vmi)
1212 {
1213 	struct hyp *hyp = vmi;
1214 
1215 	vtimer_vmcleanup(hyp);
1216 	vgic_vmcleanup(hyp);
1217 
1218 	smp_rendezvous(NULL, arm_pcpu_vmcleanup, NULL, hyp);
1219 
1220 	if (!in_vhe())
1221 		vmmpmap_remove(hyp->el2_addr, el2_hyp_size(hyp->vm), true);
1222 
1223 	free(hyp, M_HYP);
1224 }
1225 
1226 /*
1227  * Return register value. Registers have different sizes and an explicit cast
1228  * must be made to ensure proper conversion.
1229  */
1230 static uint64_t *
hypctx_regptr(struct hypctx * hypctx,int reg)1231 hypctx_regptr(struct hypctx *hypctx, int reg)
1232 {
1233 	switch (reg) {
1234 	case VM_REG_GUEST_X0 ... VM_REG_GUEST_X29:
1235 		return (&hypctx->tf.tf_x[reg]);
1236 	case VM_REG_GUEST_LR:
1237 		return (&hypctx->tf.tf_lr);
1238 	case VM_REG_GUEST_SP:
1239 		return (&hypctx->tf.tf_sp);
1240 	case VM_REG_GUEST_CPSR:
1241 		return (&hypctx->tf.tf_spsr);
1242 	case VM_REG_GUEST_PC:
1243 		return (&hypctx->tf.tf_elr);
1244 	case VM_REG_GUEST_SCTLR_EL1:
1245 		return (&hypctx->sctlr_el1);
1246 	case VM_REG_GUEST_TTBR0_EL1:
1247 		return (&hypctx->ttbr0_el1);
1248 	case VM_REG_GUEST_TTBR1_EL1:
1249 		return (&hypctx->ttbr1_el1);
1250 	case VM_REG_GUEST_TCR_EL1:
1251 		return (&hypctx->tcr_el1);
1252 	case VM_REG_GUEST_TCR2_EL1:
1253 		return (&hypctx->tcr2_el1);
1254 	default:
1255 		break;
1256 	}
1257 	return (NULL);
1258 }
1259 
1260 int
vmmops_getreg(void * vcpui,int reg,uint64_t * retval)1261 vmmops_getreg(void *vcpui, int reg, uint64_t *retval)
1262 {
1263 	uint64_t *regp;
1264 	int running, hostcpu;
1265 	struct hypctx *hypctx = vcpui;
1266 
1267 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1268 	if (running && hostcpu != curcpu)
1269 		panic("arm_getreg: %s%d is running", vm_name(hypctx->hyp->vm),
1270 		    vcpu_vcpuid(hypctx->vcpu));
1271 
1272 	regp = hypctx_regptr(hypctx, reg);
1273 	if (regp == NULL)
1274 		return (EINVAL);
1275 
1276 	*retval = *regp;
1277 	return (0);
1278 }
1279 
1280 int
vmmops_setreg(void * vcpui,int reg,uint64_t val)1281 vmmops_setreg(void *vcpui, int reg, uint64_t val)
1282 {
1283 	uint64_t *regp;
1284 	struct hypctx *hypctx = vcpui;
1285 	int running, hostcpu;
1286 
1287 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1288 	if (running && hostcpu != curcpu)
1289 		panic("arm_setreg: %s%d is running", vm_name(hypctx->hyp->vm),
1290 		    vcpu_vcpuid(hypctx->vcpu));
1291 
1292 	regp = hypctx_regptr(hypctx, reg);
1293 	if (regp == NULL)
1294 		return (EINVAL);
1295 
1296 	*regp = val;
1297 	return (0);
1298 }
1299 
1300 int
vmmops_exception(void * vcpui,uint64_t esr,uint64_t far)1301 vmmops_exception(void *vcpui, uint64_t esr, uint64_t far)
1302 {
1303 	struct hypctx *hypctx = vcpui;
1304 	int running, hostcpu;
1305 
1306 	running = vcpu_is_running(hypctx->vcpu, &hostcpu);
1307 	if (running && hostcpu != curcpu)
1308 		panic("%s: %s%d is running", __func__, vm_name(hypctx->hyp->vm),
1309 		    vcpu_vcpuid(hypctx->vcpu));
1310 
1311 	hypctx->far_el1 = far;
1312 	hypctx->esr_el1 = esr;
1313 	hypctx->has_exception = true;
1314 
1315 	return (0);
1316 }
1317 
1318 int
vmmops_getcap(void * vcpui,int num,int * retval)1319 vmmops_getcap(void *vcpui, int num, int *retval)
1320 {
1321 	struct hypctx *hypctx = vcpui;
1322 	int ret;
1323 
1324 	ret = ENOENT;
1325 
1326 	switch (num) {
1327 	case VM_CAP_UNRESTRICTED_GUEST:
1328 		*retval = 1;
1329 		ret = 0;
1330 		break;
1331 	case VM_CAP_BRK_EXIT:
1332 	case VM_CAP_SS_EXIT:
1333 	case VM_CAP_MASK_HWINTR:
1334 		*retval = (hypctx->setcaps & (1ul << num)) != 0;
1335 		break;
1336 	default:
1337 		break;
1338 	}
1339 
1340 	return (ret);
1341 }
1342 
1343 int
vmmops_setcap(void * vcpui,int num,int val)1344 vmmops_setcap(void *vcpui, int num, int val)
1345 {
1346 	struct hypctx *hypctx = vcpui;
1347 	int ret;
1348 
1349 	ret = 0;
1350 
1351 	switch (num) {
1352 	case VM_CAP_BRK_EXIT:
1353 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1354 			break;
1355 		if (val != 0)
1356 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1357 		else
1358 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1359 		break;
1360 	case VM_CAP_SS_EXIT:
1361 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1362 			break;
1363 
1364 		if (val != 0) {
1365 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr & PSR_SS);
1366 			hypctx->debug_mdscr |= hypctx->mdscr_el1 &
1367 			    (MDSCR_SS | MDSCR_KDE);
1368 
1369 			hypctx->tf.tf_spsr |= PSR_SS;
1370 			hypctx->mdscr_el1 |= MDSCR_SS | MDSCR_KDE;
1371 			hypctx->mdcr_el2 |= MDCR_EL2_TDE;
1372 		} else {
1373 			hypctx->tf.tf_spsr &= ~PSR_SS;
1374 			hypctx->tf.tf_spsr |= hypctx->debug_spsr;
1375 			hypctx->debug_spsr &= ~PSR_SS;
1376 			hypctx->mdscr_el1 &= ~(MDSCR_SS | MDSCR_KDE);
1377 			hypctx->mdscr_el1 |= hypctx->debug_mdscr;
1378 			hypctx->debug_mdscr &= ~(MDSCR_SS | MDSCR_KDE);
1379 			hypctx->mdcr_el2 &= ~MDCR_EL2_TDE;
1380 		}
1381 		break;
1382 	case VM_CAP_MASK_HWINTR:
1383 		if ((val != 0) == ((hypctx->setcaps & (1ul << num)) != 0))
1384 			break;
1385 
1386 		if (val != 0) {
1387 			hypctx->debug_spsr |= (hypctx->tf.tf_spsr &
1388 			    (PSR_I | PSR_F));
1389 			hypctx->tf.tf_spsr |= PSR_I | PSR_F;
1390 		} else {
1391 			hypctx->tf.tf_spsr &= ~(PSR_I | PSR_F);
1392 			hypctx->tf.tf_spsr |= (hypctx->debug_spsr &
1393 			    (PSR_I | PSR_F));
1394 			hypctx->debug_spsr &= ~(PSR_I | PSR_F);
1395 		}
1396 		break;
1397 	default:
1398 		ret = ENOENT;
1399 		break;
1400 	}
1401 
1402 	if (ret == 0) {
1403 		if (val == 0)
1404 			hypctx->setcaps &= ~(1ul << num);
1405 		else
1406 			hypctx->setcaps |= (1ul << num);
1407 	}
1408 
1409 	return (ret);
1410 }
1411