/* /freebsd/sys/arm64/vmm/vmm.c (revision 14133abfe9c218b97e888edf04d2ec4a86e7ab4b) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/fpu.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <dev/pci/pcireg.h>
#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>

#include "arm64.h"
#include "mmu.h"

#include "io/vgic.h"
#include "io/vtimer.h"

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;
	void		*stats;
	struct vm_exit	exitinfo;
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct vfpstate	*guestfpu;	/* (a,i) guest fpu state */
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
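
/*
 * Note (editorial): the vcpu lock is a spin mutex rather than a default
 * (sleepable) mutex because it is taken in paths that must not sleep,
 * e.g. vcpu_notify_event_locked() runs with it held, and the sleeping
 * paths below use msleep_spin() on it.
 */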

struct vmm_mmio_region {
	uint64_t start;
	uint64_t end;
	mem_region_read_t read;
	mem_region_write_t write;
};
#define	VM_MAX_MMIO_REGIONS	4

struct vmm_special_reg {
	uint32_t	esr_iss;
	uint32_t	esr_mask;
	reg_read_t	reg_read;
	reg_write_t	reg_write;
	void		*arg;
};
#define	VM_MAX_SPECIAL_REGS	16

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	bool		dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct vm_mem	mem;			/* (i) guest memory */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	vcpus_init_lock;	/* (o) */
};

static bool vmm_initialized = false;

static int vm_handle_wfi(struct vcpu *vcpu,
			 struct vm_exit *vme, bool *retu);

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

struct vmm_regs {
	uint64_t	id_aa64afr0;
	uint64_t	id_aa64afr1;
	uint64_t	id_aa64dfr0;
	uint64_t	id_aa64dfr1;
	uint64_t	id_aa64isar0;
	uint64_t	id_aa64isar1;
	uint64_t	id_aa64isar2;
	uint64_t	id_aa64mmfr0;
	uint64_t	id_aa64mmfr1;
	uint64_t	id_aa64mmfr2;
	uint64_t	id_aa64pfr0;
	uint64_t	id_aa64pfr1;
};

static const struct vmm_regs vmm_arch_regs_masks = {
	.id_aa64dfr0 =
	    ID_AA64DFR0_CTX_CMPs_MASK |
	    ID_AA64DFR0_WRPs_MASK |
	    ID_AA64DFR0_BRPs_MASK |
	    ID_AA64DFR0_PMUVer_3 |
	    ID_AA64DFR0_DebugVer_8,
	.id_aa64isar0 =
	    ID_AA64ISAR0_TLB_TLBIOSR |
	    ID_AA64ISAR0_SHA3_IMPL |
	    ID_AA64ISAR0_RDM_IMPL |
	    ID_AA64ISAR0_Atomic_IMPL |
	    ID_AA64ISAR0_CRC32_BASE |
	    ID_AA64ISAR0_SHA2_512 |
	    ID_AA64ISAR0_SHA1_BASE |
	    ID_AA64ISAR0_AES_PMULL,
	.id_aa64mmfr0 =
	    ID_AA64MMFR0_TGran4_IMPL |
	    ID_AA64MMFR0_TGran64_IMPL |
	    ID_AA64MMFR0_TGran16_IMPL |
	    ID_AA64MMFR0_ASIDBits_16 |
	    ID_AA64MMFR0_PARange_4P,
	.id_aa64mmfr1 =
	    ID_AA64MMFR1_SpecSEI_IMPL |
	    ID_AA64MMFR1_PAN_ATS1E1 |
	    ID_AA64MMFR1_HAFDBS_AF,
	.id_aa64pfr0 =
	    ID_AA64PFR0_GIC_CPUIF_NONE |
	    ID_AA64PFR0_AdvSIMD_HP |
	    ID_AA64PFR0_FP_HP |
	    ID_AA64PFR0_EL3_64 |
	    ID_AA64PFR0_EL2_64 |
	    ID_AA64PFR0_EL1_64 |
	    ID_AA64PFR0_EL0_64,
};

/* Host registers masked by vmm_arch_regs_masks. */
static struct vmm_regs vmm_arch_regs;

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

static void vcpu_notify_event_locked(struct vcpu *vcpu);

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an fiq");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
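/*
 * Note (editorial assumption): the 0xffff - 1 bound appears to keep every
 * vcpuid representable in the uint16_t topology fields of struct vm
 * (sockets, cores, threads, maxcpus), independent of CPU_SETSIZE.
 */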

static int
vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
{
#define	_FETCH_KERN_REG(reg, field) do {				\
	regs->field = vmm_arch_regs_masks.field;			\
	if (!get_kernel_reg_iss_masked(reg ## _ISS, &regs->field,	\
	    masks->field))						\
		regs->field = 0;					\
} while (0)
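	/*
	 * Illustrative expansion (editorial sketch): for example,
	 * _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0) becomes:
	 *
	 *	regs->id_aa64pfr0 = vmm_arch_regs_masks.id_aa64pfr0;
	 *	if (!get_kernel_reg_iss_masked(ID_AA64PFR0_EL1_ISS,
	 *	    &regs->id_aa64pfr0, masks->id_aa64pfr0))
	 *		regs->id_aa64pfr0 = 0;
	 *
	 * i.e. each field is seeded from the mask table and zeroed if the
	 * host register cannot be read.
	 */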
	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
#undef _FETCH_KERN_REG
	return (0);
}

static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
	vmmops_vcpu_cleanup(vcpu->cookie);
	vcpu->cookie = NULL;
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
		vcpu_lock_destroy(vcpu);
		free(vcpu, M_VMM);
	}
}

static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));

	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
	vcpu_lock_init(vcpu);
	vcpu->state = VCPU_IDLE;
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vm = vm;
	vcpu->guestfpu = fpu_save_area_alloc();
	vcpu->stats = vmm_stat_alloc();
	return (vcpu);
}

static void
vcpu_init(struct vcpu *vcpu)
{
	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
	MPASS(vcpu->cookie != NULL);
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo);
}

static int
vmm_unsupported_quirk(void)
{
	/*
	 * Known to not load on Ampere eMAG
	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051
	 */
	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
	    CPU_PART_EMAG8180, 0, 0))
		return (ENXIO);

	return (0);
}

static int
vmm_init(void)
{
	int error;

	vm_maxcpu = mp_ncpus;
	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);

	if (vm_maxcpu > VM_MAXCPU) {
		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
		vm_maxcpu = VM_MAXCPU;
	}
	if (vm_maxcpu == 0)
		vm_maxcpu = 1;

	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
	if (error != 0)
		return (error);

	return (vmmops_modinit(0));
}
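
/*
 * Note (editorial): vm_maxcpu defaults to mp_ncpus and can be overridden
 * with the hw.vmm.maxcpu loader tunable, e.g. hw.vmm.maxcpu="8" in
 * loader.conf or set with kenv(1) before loading vmm.ko; the sysctl is
 * CTLFLAG_RDTUN, so it is read-only at runtime.
 */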

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		error = vmm_unsupported_quirk();
		if (error != 0)
			break;
		error = vmmdev_init();
		if (error != 0)
			break;
		error = vmm_init();
		if (error == 0)
			vmm_initialized = true;
		else
			(void)vmmdev_cleanup();
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0 && vmm_initialized) {
			error = vmmops_modcleanup();
			if (error) {
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				vmm_initialized = false;
			}
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - HYP initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
	MPASS(vm->cookie != NULL);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
	memset(vm->special_reg, 0, sizeof(vm->special_reg));

	if (!create) {
		for (i = 0; i < vm->maxcpus; i++) {
			if (vm->vcpu[i] != NULL)
				vcpu_init(vm->vcpu[i]);
		}
	}
}

void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	/* Some interrupt controllers may have a CPU limit */
	if (vcpuid >= vgic_max_cpu_count(vm->cookie))
		return (NULL);

	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}
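
/*
 * Note (editorial): vm_alloc_vcpu() is a double-checked allocation. The
 * unlocked atomic_load_acq_ptr() fast path pairs with the
 * atomic_store_rel_ptr() performed under vcpus_init_lock, so a reader
 * that observes a non-NULL pointer also observes the fully initialized
 * vcpu.
 */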

void
vm_lock_vcpus(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
}

void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	int error;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
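	/* Note (editorial): 1ul << 39 below is a 39-bit IPA space, 512 GiB. */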
	error = vm_mem_init(&vm->mem, 0, 1ul << 39);
	if (error != 0) {
		free(vm, M_VMM);
		return (error);
	}
	strcpy(vm->name, name);
	sx_init(&vm->vcpus_init_lock, "vm vcpus");

	vm->sockets = 1;
	vm->cores = 1;			/* XXX backwards compatibility */
	vm->threads = 1;		/* XXX backwards compatibility */
	vm->maxcpus = vm_maxcpu;

	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
	    M_WAITOK | M_ZERO);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	/* Ignore maxcpus. */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	pmap_t pmap __diagused;
	int i;

	if (destroy) {
		vm_xlock_memsegs(vm);
		pmap = vmspace_pmap(vm_vmspace(vm));
		sched_pin();
		PCPU_SET(curvmpmap, NULL);
		sched_unpin();
		CPU_FOREACH(i) {
			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
		}
	} else
		vm_assert_memseg_xlocked(vm);

	vgic_detach_from_vm(vm->cookie);

	for (i = 0; i < vm->maxcpus; i++) {
		if (vm->vcpu[i] != NULL)
			vcpu_cleanup(vm->vcpu[i], destroy);
	}

	vmmops_cleanup(vm->cookie);

	vm_mem_cleanup(vm);
	if (destroy) {
		vm_mem_destroy(vm);

		free(vm->vcpu, M_VMM);
		sx_destroy(&vm->vcpus_init_lock);
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VMM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
	return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}

static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	*rval = 0;
	return (0);
}

static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	*rval = *(uint64_t *)arg;
	return (0);
}

static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	return (0);
}

static int
vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	struct hypctx *hypctx;

	hypctx = vcpu_get_cookie(vcpu);
	/* All other fields are RES0 & we don't do anything with this */
	/* TODO: Disable access to other debug state when locked */
	hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
	return (0);
}

static int
vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	struct hypctx *hypctx;
	uint64_t val;

	hypctx = vcpu_get_cookie(vcpu);
	val = OSLSR_OSLM_1;
	if (hypctx->dbg_oslock)
		val |= OSLSR_OSLK;
	*rval = val;

	return (0);
}

static const struct vmm_special_reg vmm_special_regs[] = {
#define	SPECIAL_REG(_reg, _read, _write)				\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = (_read),					\
		.reg_write = (_write),					\
		.arg = NULL,						\
	}
#define	ID_SPECIAL_REG(_reg, _name)					\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = vmm_reg_read_arg,				\
		.reg_write = vmm_reg_wi,				\
		.arg = &(vmm_arch_regs._name),				\
	}

	/* ID registers */
	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),

	/*
	 * All other ID registers are read as zero.
	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
	 */
	{
		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
		    (0 << ISS_MSR_OP1_SHIFT) |
		    (0 << ISS_MSR_CRn_SHIFT) |
		    (0 << ISS_MSR_CRm_SHIFT),
		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
		.reg_read = vmm_reg_raz,
		.reg_write = vmm_reg_wi,
		.arg = NULL,
	},

	/* Counter physical registers */
	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
	    vtimer_phys_cval_write),
	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
	    vtimer_phys_tval_write),
	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),

	/* Debug registers */
	SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
	SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
	/* TODO: Exceptions on invalid access */
	SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
	SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
#undef SPECIAL_REG
};
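
/*
 * Note (editorial): esr_iss/esr_mask match against the ISS field of the
 * ESR for a trapped MSR/MRS. For example, ID_AA64PFR0_EL1 is encoded as
 * op0=3, op1=0, CRn=0, CRm=4, op2=0, and SPECIAL_REG()/ID_SPECIAL_REG()
 * pack those fields into esr_iss. The catch-all entry above masks only
 * bit 3 of CRm (0x8 << ISS_MSR_CRm_SHIFT), so it matches every op0=3,
 * op1=0, CRn=0 register with CRm in {0..7}, i.e. the remaining ID
 * registers, which then read as zero.
 */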

void
vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
    reg_read_t reg_read, reg_write_t reg_write, void *arg)
{
	int i;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == 0 &&
		    vm->special_reg[i].esr_mask == 0) {
			vm->special_reg[i].esr_iss = iss;
			vm->special_reg[i].esr_mask = mask;
			vm->special_reg[i].reg_read = reg_read;
			vm->special_reg[i].reg_write = reg_write;
			vm->special_reg[i].arg = arg;
			return;
		}
	}

	panic("%s: No free special register slot", __func__);
}

void
vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
{
	int i;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == iss &&
		    vm->special_reg[i].esr_mask == mask) {
			memset(&vm->special_reg[i], 0,
			    sizeof(vm->special_reg[i]));
			return;
		}
	}

	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
	    mask);
}

static int
vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vre *vre;
	int i, rv;

	vm = vcpu->vm;
	vme = &vcpu->exitinfo;
	vre = &vme->u.reg_emul.vre;

	for (i = 0; i < nitems(vm->special_reg); i++) {
		if (vm->special_reg[i].esr_iss == 0 &&
		    vm->special_reg[i].esr_mask == 0)
			continue;

		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
		    vm->special_reg[i].esr_iss) {
			rv = vmm_emulate_register(vcpu, vre,
			    vm->special_reg[i].reg_read,
			    vm->special_reg[i].reg_write,
			    vm->special_reg[i].arg);
			if (rv == 0) {
				*retu = false;
			}
			return (rv);
		}
	}
	for (i = 0; i < nitems(vmm_special_regs); i++) {
		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
		    vmm_special_regs[i].esr_iss) {
			rv = vmm_emulate_register(vcpu, vre,
			    vmm_special_regs[i].reg_read,
			    vmm_special_regs[i].reg_write,
			    vmm_special_regs[i].arg);
			if (rv == 0) {
				*retu = false;
			}
			return (rv);
		}
	}

	*retu = true;
	return (0);
}

void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
    mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == 0 &&
		    vm->mmio_region[i].end == 0) {
			vm->mmio_region[i].start = start;
			vm->mmio_region[i].end = start + size;
			vm->mmio_region[i].read = mmio_read;
			vm->mmio_region[i].write = mmio_write;
			return;
		}
	}

	panic("%s: No free MMIO region", __func__);
}

void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
	int i;

	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start == start &&
		    vm->mmio_region[i].end == start + size) {
			memset(&vm->mmio_region[i], 0,
			    sizeof(vm->mmio_region[i]));
			return;
		}
	}

	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
	    start + size);
}

static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm;
	struct vm_exit *vme;
	struct vie *vie;
	struct hyp *hyp;
	uint64_t fault_ipa;
	struct vm_guest_paging *paging;
	struct vmm_mmio_region *vmr;
	int error, i;

	vm = vcpu->vm;
	hyp = vm->cookie;
	if (!hyp->vgic_attached)
		goto out_user;

	vme = &vcpu->exitinfo;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;

	fault_ipa = vme->u.inst_emul.gpa;

	vmr = NULL;
	for (i = 0; i < nitems(vm->mmio_region); i++) {
		if (vm->mmio_region[i].start <= fault_ipa &&
		    vm->mmio_region[i].end > fault_ipa) {
			vmr = &vm->mmio_region[i];
			break;
		}
	}
	if (vmr == NULL)
		goto out_user;

	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
	    vmr->read, vmr->write, retu);
	return (error);

out_user:
	*retu = true;
	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm_vcpu(vm, i));
	}

	return (0);
}

void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vcpu);
	vmexit->pc = pc;
	vmexit->inst_length = 4;
	vmexit->exitcode = VM_EXITCODE_DEBUG;
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;

	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
		return (EBUSY);

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
	return (0);
}

int
vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
{
	if (vcpu == NULL) {
		vm->debug_cpus = vm->active_cpus;
		for (int i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm_vcpu(vm, i));
		}
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vcpu);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
{

	if (vcpu == NULL) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
	}
	return (0);
}

int
vcpu_debugged(struct vcpu *vcpu)
{

	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vcpu *vcpu)
{

	return (vcpu->stats);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			ipi_cpu(hostcpu, vmm_ipinum);
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vcpu *vcpu)
{
	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}

struct vm_mem *
vm_mem(struct vm *vm)
{
	return (&vm->mem);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	vfp_save_state(curthread, curthread->td_pcb);
	/* Ensure the VFP state will be re-loaded when exiting the guest */
	PCPU_SET(fpcurthread, NULL);

	/* restore guest FPU state */
	vfp_enable();
	vfp_restore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	vfp_disable();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{
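	/*
	 * Note (editorial): restore_guest_fpustate() ran vfp_disable(), so
	 * CPACR_EL1.FPEN should still be in the trap-all state here; any
	 * other value means the host touched the VFP while the guest's
	 * register state was live.
	 */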
	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
	    CPACR_FPEN_TRAP_ALL1)
		panic("VFP not enabled in host!");

	/* save guest FPU state */
	vfp_enable();
	vfp_store(vcpu->guestfpu);
	vfp_disable();

	KASSERT(PCPU_GET(fpcurthread) == NULL,
	    ("%s: fpcurthread set with guest registers", __func__));
}

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu_notify_event_locked(vcpu);
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
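
/*
 * Sketch (editorial): a typical vm_run() ioctl therefore walks the state
 * machine as IDLE -> FROZEN (ioctl entry), FROZEN -> RUNNING (guest
 * entry), RUNNING -> FROZEN (guest exit), and finally FROZEN -> IDLE
 * (ioctl return), with SLEEPING only ever entered from FROZEN.
 */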

static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{
	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_getcap(vcpu->cookie, type, retval));
}

int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{
	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_setcap(vcpu->cookie, type, val));
}

struct vm *
vcpu_vm(struct vcpu *vcpu)
{
	return (vcpu->vm);
}

int
vcpu_vcpuid(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

void *
vcpu_get_cookie(struct vcpu *vcpu)
{
	return (vcpu->cookie);
}

struct vcpu *
vm_vcpu(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid]);
}

int
vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
{
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
{
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);

	return (vmmops_getreg(vcpu->cookie, reg, retval));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;

	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);
	error = vmmops_setreg(vcpu->cookie, reg, val);
	if (error || reg != VM_REG_GUEST_PC)
		return (error);

	vcpu->nextpc = val;

	return (0);
}

void *
vm_get_cookie(struct vm *vm)
{
	return (vm->cookie);
}

int
vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
{
	return (vmmops_exception(vcpu->cookie, esr, far));
}

int
vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
{
	return (vgic_attach_to_vm(vm->cookie, descr));
}

int
vm_assert_irq(struct vm *vm, uint32_t irq)
{
	return (vgic_inject_irq(vm->cookie, -1, irq, true));
}

int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{
	return (vgic_inject_irq(vm->cookie, -1, irq, false));
}

int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
    int func)
{
	/* TODO: Should we raise an SError? */
	return (vgic_inject_msi(vm->cookie, msg, addr));
}

static int
vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct hypctx *hypctx;
	int i;

	hypctx = vcpu_get_cookie(vcpu);

	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
		return (1);

	vme->exitcode = VM_EXITCODE_SMCCC;
	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];

	*retu = true;
	return (0);
}
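
/*
 * Note (editorial): for a trapped HVC the ESR ISS carries the HVC
 * immediate, and SMCCC requires calls to use HVC #0, so the
 * ESR_ELx_ISS_MASK check above rejects non-zero immediates (the handler
 * fails with a non-zero error instead of producing an SMCCC exit).
 */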

static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
	struct vm *vm;

	vm = vcpu->vm;
	vcpu_lock(vcpu);
	while (1) {
		if (vm->suspend)
			break;

		if (vgic_has_pending_irq(vcpu->cookie))
			break;

		if (vcpu_should_yield(vcpu))
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
	}
	vcpu_unlock(vcpu);

	*retu = false;
	return (0);
}

static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	struct vm_exit *vme;
	struct vm_map *map;
	uint64_t addr, esr;
	pmap_t pmap;
	int ftype, rv;

	vme = &vcpu->exitinfo;

	pmap = vmspace_pmap(vm_vmspace(vcpu->vm));
	addr = vme->u.paging.gpa;
	esr = vme->u.paging.esr;

	/* The page exists, but the page table needs to be updated. */
	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
		return (0);

	switch (ESR_ELx_EXCEPTION(esr)) {
	case EXCP_INSN_ABORT_L:
	case EXCP_DATA_ABORT_L:
		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
		break;
	default:
		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
	}

	map = &vm_vmspace(vm)->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
	if (rv != KERN_SUCCESS)
		return (EFAULT);

	return (0);
}

static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
	struct vm *vm = vcpu->vm;
	int error, i;
	struct thread *td;

	error = 0;
	td = curthread;

	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (error == 0) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
			break;

		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		if (td_ast_pending(td, TDA_SUSPEND)) {
			vcpu_unlock(vcpu);
			error = thread_check_susp(td, false);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm_vcpu(vm, i));
		}
	}

	*retu = true;
	return (error);
}

int
vm_run(struct vcpu *vcpu)
{
	struct vm *vm = vcpu->vm;
	struct vm_eventinfo evinfo;
	int error, vcpuid;
	struct vm_exit *vme;
	bool retu;
	pmap_t pmap;

	vcpuid = vcpu->vcpuid;

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm_vmspace(vm));
	vme = &vcpu->exitinfo;
	evinfo.rptr = NULL;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = NULL;
restart:
	critical_enter();

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vcpu, VCPU_RUNNING);
	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
	vcpu_require_state(vcpu, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_INST_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_inst_emul(vcpu, &retu);
			break;

		case VM_EXITCODE_REG_EMUL:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_reg_emul(vcpu, &retu);
			break;

		case VM_EXITCODE_HVC:
			/*
			 * The HVC instruction saves the address for the
			 * next instruction as the return address.
			 */
			vcpu->nextpc = vme->pc;
			/*
			 * The PSCI call can change the exit information in the
			 * case of suspend/reset/poweroff/cpu off/cpu on.
			 */
			error = vm_handle_smccc_call(vcpu, vme, &retu);
			break;

		case VM_EXITCODE_WFI:
			vcpu->nextpc = vme->pc + vme->inst_length;
			error = vm_handle_wfi(vcpu, vme, &retu);
			break;

		case VM_EXITCODE_PAGING:
			vcpu->nextpc = vme->pc;
			error = vm_handle_paging(vcpu, &retu);
			break;

		case VM_EXITCODE_SUSPENDED:
			vcpu->nextpc = vme->pc;
			error = vm_handle_suspend(vcpu, &retu);
			break;

		default:
			/* Handle in userland */
			vcpu->nextpc = vme->pc;
			retu = true;
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	return (error);
}
1568