xref: /freebsd/sys/arm64/vmm/vmm.c (revision dd3603749cb7f20a628f04d595b105962b21a3d2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/kernel.h>
33 #include <sys/linker.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/pcpu.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/rwlock.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 
46 #include <vm/vm.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_page.h>
49 #include <vm/pmap.h>
50 #include <vm/vm_map.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_param.h>
53 
54 #include <machine/armreg.h>
55 #include <machine/cpu.h>
56 #include <machine/fpu.h>
57 #include <machine/machdep.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
60 #include <machine/vm.h>
61 #include <machine/vmparam.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_instruction_emul.h>
64 
65 #include <dev/pci/pcireg.h>
66 #include <dev/vmm/vmm_dev.h>
67 #include <dev/vmm/vmm_ktr.h>
68 #include <dev/vmm/vmm_stat.h>
69 
70 #include "arm64.h"
71 #include "mmu.h"
72 
73 #include "io/vgic.h"
74 #include "io/vtimer.h"
75 
76 struct vcpu {
77 	int		flags;
78 	enum vcpu_state	state;
79 	struct mtx	mtx;
80 	int		hostcpu;	/* host cpuid this vcpu last ran on */
81 	int		vcpuid;
82 	void		*stats;
83 	struct vm_exit	exitinfo;
84 	uint64_t	nextpc;		/* (x) next instruction to execute */
85 	struct vm	*vm;		/* (o) */
86 	void		*cookie;	/* (i) cpu-specific data */
87 	struct vfpstate	*guestfpu;	/* (a,i) guest fpu state */
88 };
89 
90 #define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
91 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
92 #define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
93 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
94 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
95 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
96 
97 struct mem_seg {
98 	uint64_t	gpa;
99 	size_t		len;
100 	bool		wired;
101 	bool		sysmem;
102 	vm_object_t	object;
103 };
104 #define	VM_MAX_MEMSEGS	3
105 
106 struct mem_map {
107 	vm_paddr_t	gpa;
108 	size_t		len;
109 	vm_ooffset_t	segoff;
110 	int		segid;
111 	int		prot;
112 	int		flags;
113 };
114 #define	VM_MAX_MEMMAPS	4
115 
116 struct vmm_mmio_region {
117 	uint64_t start;
118 	uint64_t end;
119 	mem_region_read_t read;
120 	mem_region_write_t write;
121 };
122 #define	VM_MAX_MMIO_REGIONS	4
123 
124 struct vmm_special_reg {
125 	uint32_t	esr_iss;
126 	uint32_t	esr_mask;
127 	reg_read_t	reg_read;
128 	reg_write_t	reg_write;
129 	void		*arg;
130 };
131 #define	VM_MAX_SPECIAL_REGS	16
132 
133 /*
134  * Initialization:
 * (a) allocated when vcpu is created
135  * (o) initialized the first time the VM is created
136  * (i) initialized when VM is created and when it is reinitialized
137  * (x) initialized before use
138  */
139 struct vm {
140 	void		*cookie;		/* (i) cpu-specific data */
141 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
142 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
143 	int		suspend;		/* (i) stop VM execution */
144 	bool		dying;			/* (o) is dying */
145 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
146 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
147 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
148 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
149 	struct vmspace	*vmspace;		/* (o) guest's address space */
150 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
151 	struct vcpu	**vcpu;			/* (i) guest vcpus */
152 	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
153 						/* (o) guest MMIO regions */
154 	struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
155 	/* The following describe the vm cpu topology */
156 	uint16_t	sockets;		/* (o) num of sockets */
157 	uint16_t	cores;			/* (o) num of cores/socket */
158 	uint16_t	threads;		/* (o) num of threads/core */
159 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
160 	struct sx	mem_segs_lock;		/* (o) */
161 	struct sx	vcpus_init_lock;	/* (o) */
162 };
163 
164 static bool vmm_initialized = false;
165 
166 static int vm_handle_wfi(struct vcpu *vcpu,
167 			 struct vm_exit *vme, bool *retu);
168 
169 static MALLOC_DEFINE(M_VMM, "vmm", "vmm");
170 
171 /* statistics */
172 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
173 
174 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
175 
176 static int vmm_ipinum;
177 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
178     "IPI vector used for vcpu notifications");
179 
180 struct vmm_regs {
181 	uint64_t	id_aa64afr0;
182 	uint64_t	id_aa64afr1;
183 	uint64_t	id_aa64dfr0;
184 	uint64_t	id_aa64dfr1;
185 	uint64_t	id_aa64isar0;
186 	uint64_t	id_aa64isar1;
187 	uint64_t	id_aa64isar2;
188 	uint64_t	id_aa64mmfr0;
189 	uint64_t	id_aa64mmfr1;
190 	uint64_t	id_aa64mmfr2;
191 	uint64_t	id_aa64pfr0;
192 	uint64_t	id_aa64pfr1;
193 };
194 
195 static const struct vmm_regs vmm_arch_regs_masks = {
196 	.id_aa64dfr0 =
197 	    ID_AA64DFR0_CTX_CMPs_MASK |
198 	    ID_AA64DFR0_WRPs_MASK |
199 	    ID_AA64DFR0_BRPs_MASK |
200 	    ID_AA64DFR0_PMUVer_3 |
201 	    ID_AA64DFR0_DebugVer_8,
202 	.id_aa64isar0 =
203 	    ID_AA64ISAR0_TLB_TLBIOSR |
204 	    ID_AA64ISAR0_SHA3_IMPL |
205 	    ID_AA64ISAR0_RDM_IMPL |
206 	    ID_AA64ISAR0_Atomic_IMPL |
207 	    ID_AA64ISAR0_CRC32_BASE |
208 	    ID_AA64ISAR0_SHA2_512 |
209 	    ID_AA64ISAR0_SHA1_BASE |
210 	    ID_AA64ISAR0_AES_PMULL,
211 	.id_aa64mmfr0 =
212 	    ID_AA64MMFR0_TGran4_IMPL |
213 	    ID_AA64MMFR0_TGran64_IMPL |
214 	    ID_AA64MMFR0_TGran16_IMPL |
215 	    ID_AA64MMFR0_ASIDBits_16 |
216 	    ID_AA64MMFR0_PARange_4P,
217 	.id_aa64mmfr1 =
218 	    ID_AA64MMFR1_SpecSEI_IMPL |
219 	    ID_AA64MMFR1_PAN_ATS1E1 |
220 	    ID_AA64MMFR1_HAFDBS_AF,
221 	.id_aa64pfr0 =
222 	    ID_AA64PFR0_GIC_CPUIF_NONE |
223 	    ID_AA64PFR0_AdvSIMD_HP |
224 	    ID_AA64PFR0_FP_HP |
225 	    ID_AA64PFR0_EL3_64 |
226 	    ID_AA64PFR0_EL2_64 |
227 	    ID_AA64PFR0_EL1_64 |
228 	    ID_AA64PFR0_EL0_64,
229 };
230 
231 /* Host registers masked by vmm_arch_regs_masks. */
232 static struct vmm_regs vmm_arch_regs;
233 
234 u_int vm_maxcpu;
235 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
236     &vm_maxcpu, 0, "Maximum number of vCPUs");
237 
238 static void vm_free_memmap(struct vm *vm, int ident);
239 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
240 static void vcpu_notify_event_locked(struct vcpu *vcpu);
241 
242 /* global statistics */
243 VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
244 VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
245 VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
246 VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
247 VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
248 VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
249 VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
250 VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
251 VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
252 VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
253 VMM_STAT(VMEXIT_FIQ, "number of vmexits for an fiq");
254 VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
255 VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
256 VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
257 VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");
258 
259 /*
260  * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
261  * is a safe value for now.
262  */
263 #define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
264 
265 static int
266 vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
267 {
268 #define	_FETCH_KERN_REG(reg, field) do {				\
269 	regs->field = vmm_arch_regs_masks.field;			\
270 	if (!get_kernel_reg_masked(reg, &regs->field, masks->field))	\
271 		regs->field = 0;					\
272 } while (0)
273 	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
274 	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
275 	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
276 	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
277 	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
278 	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
279 	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
280 	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
281 	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
282 	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
283 	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
284 	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
285 #undef _FETCH_KERN_REG
286 	return (0);
287 }
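
/*
 * For reference, a single _FETCH_KERN_REG() invocation above expands to
 * the following (sketch, with the do/while wrapper omitted):
 *
 *	regs->id_aa64pfr0 = vmm_arch_regs_masks.id_aa64pfr0;
 *	if (!get_kernel_reg_masked(ID_AA64PFR0_EL1, &regs->id_aa64pfr0,
 *	    masks->id_aa64pfr0))
 *		regs->id_aa64pfr0 = 0;
 *
 * Each guest-visible ID register is the host's value narrowed to the
 * feature fields in the corresponding mask; a register the host cannot
 * provide is exposed to the guest as all zeroes (RAZ).
 */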
288 
289 static void
290 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
291 {
292 	vmmops_vcpu_cleanup(vcpu->cookie);
293 	vcpu->cookie = NULL;
294 	if (destroy) {
295 		vmm_stat_free(vcpu->stats);
296 		fpu_save_area_free(vcpu->guestfpu);
297 		vcpu_lock_destroy(vcpu);
298 	}
299 }
300 
301 static struct vcpu *
302 vcpu_alloc(struct vm *vm, int vcpu_id)
303 {
304 	struct vcpu *vcpu;
305 
306 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
307 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
308 
309 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
310 	vcpu_lock_init(vcpu);
311 	vcpu->state = VCPU_IDLE;
312 	vcpu->hostcpu = NOCPU;
313 	vcpu->vcpuid = vcpu_id;
314 	vcpu->vm = vm;
315 	vcpu->guestfpu = fpu_save_area_alloc();
316 	vcpu->stats = vmm_stat_alloc();
317 	return (vcpu);
318 }
319 
320 static void
321 vcpu_init(struct vcpu *vcpu)
322 {
323 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
324 	MPASS(vcpu->cookie != NULL);
325 	fpu_save_area_reset(vcpu->guestfpu);
326 	vmm_stat_init(vcpu->stats);
327 }
328 
329 struct vm_exit *
330 vm_exitinfo(struct vcpu *vcpu)
331 {
332 	return (&vcpu->exitinfo);
333 }
334 
335 static int
336 vmm_init(void)
337 {
338 	int error;
339 
340 	vm_maxcpu = mp_ncpus;
341 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
342 
343 	if (vm_maxcpu > VM_MAXCPU) {
344 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
345 		vm_maxcpu = VM_MAXCPU;
346 	}
347 	if (vm_maxcpu == 0)
348 		vm_maxcpu = 1;
349 
350 	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
351 	if (error != 0)
352 		return (error);
353 
354 	return (vmmops_modinit(0));
355 }
356 
357 static int
358 vmm_handler(module_t mod, int what, void *arg)
359 {
360 	int error;
361 
362 	switch (what) {
363 	case MOD_LOAD:
364 		error = vmmdev_init();
365 		if (error != 0)
366 			break;
367 		error = vmm_init();
368 		if (error == 0)
369 			vmm_initialized = true;
370 		else
371 			(void)vmmdev_cleanup();
372 		break;
373 	case MOD_UNLOAD:
374 		error = vmmdev_cleanup();
375 		if (error == 0 && vmm_initialized) {
376 			error = vmmops_modcleanup();
377 			if (error) {
378 				/*
379 				 * Something bad happened - prevent new
380 				 * VMs from being created
381 				 */
382 				vmm_initialized = false;
383 			}
384 		}
385 		break;
386 	default:
387 		error = 0;
388 		break;
389 	}
390 	return (error);
391 }
392 
393 static moduledata_t vmm_kmod = {
394 	"vmm",
395 	vmm_handler,
396 	NULL
397 };
398 
399 /*
400  * vmm initialization has the following dependencies:
401  *
402  * - HYP initialization requires smp_rendezvous() and therefore must happen
403  *   after SMP is fully functional (after SI_SUB_SMP).
404  * - vmm device initialization requires an initialized devfs.
405  */
406 DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
407 MODULE_VERSION(vmm, 1);
408 
409 static void
410 vm_init(struct vm *vm, bool create)
411 {
412 	int i;
413 
414 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
415 	MPASS(vm->cookie != NULL);
416 
417 	CPU_ZERO(&vm->active_cpus);
418 	CPU_ZERO(&vm->debug_cpus);
419 
420 	vm->suspend = 0;
421 	CPU_ZERO(&vm->suspended_cpus);
422 
423 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
424 	memset(vm->special_reg, 0, sizeof(vm->special_reg));
425 
426 	if (!create) {
427 		for (i = 0; i < vm->maxcpus; i++) {
428 			if (vm->vcpu[i] != NULL)
429 				vcpu_init(vm->vcpu[i]);
430 		}
431 	}
432 }
433 
434 void
435 vm_disable_vcpu_creation(struct vm *vm)
436 {
437 	sx_xlock(&vm->vcpus_init_lock);
438 	vm->dying = true;
439 	sx_xunlock(&vm->vcpus_init_lock);
440 }
441 
442 struct vcpu *
443 vm_alloc_vcpu(struct vm *vm, int vcpuid)
444 {
445 	struct vcpu *vcpu;
446 
447 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
448 		return (NULL);
449 
450 	/* Some interrupt controllers may have a CPU limit */
451 	if (vcpuid >= vgic_max_cpu_count(vm->cookie))
452 		return (NULL);
453 
454 	vcpu = (struct vcpu *)
455 	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
456 	if (__predict_true(vcpu != NULL))
457 		return (vcpu);
458 
459 	sx_xlock(&vm->vcpus_init_lock);
460 	vcpu = vm->vcpu[vcpuid];
461 	if (vcpu == NULL && !vm->dying) {
462 		vcpu = vcpu_alloc(vm, vcpuid);
463 		vcpu_init(vcpu);
464 
465 		/*
466 		 * Ensure vCPU is fully created before updating pointer
467 		 * to permit unlocked reads above.
468 		 */
469 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
470 		    (uintptr_t)vcpu);
471 	}
472 	sx_xunlock(&vm->vcpus_init_lock);
473 	return (vcpu);
474 }
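
/*
 * The lockless fast path in vm_alloc_vcpu() relies on acquire/release
 * pairing: the writer fully constructs the vcpu and only then publishes
 * it with atomic_store_rel_ptr(), so a reader whose atomic_load_acq_ptr()
 * observes the non-NULL pointer is guaranteed to also observe the
 * initialized vcpu contents. A minimal sketch of the pattern:
 *
 *	writer				reader
 *	------				------
 *	vcpu = vcpu_alloc(vm, id);
 *	vcpu_init(vcpu);
 *	store_rel(&slot, vcpu);		vcpu = load_acq(&slot);
 *					if (vcpu != NULL)
 *						all fields are visible
 */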
475 
476 void
477 vm_slock_vcpus(struct vm *vm)
478 {
479 	sx_slock(&vm->vcpus_init_lock);
480 }
481 
482 void
483 vm_unlock_vcpus(struct vm *vm)
484 {
485 	sx_unlock(&vm->vcpus_init_lock);
486 }
487 
488 int
489 vm_create(const char *name, struct vm **retvm)
490 {
491 	struct vm *vm;
492 	struct vmspace *vmspace;
493 
494 	/*
495 	 * If vmm.ko could not be successfully initialized then don't attempt
496 	 * to create the virtual machine.
497 	 */
498 	if (!vmm_initialized)
499 		return (ENXIO);
500 
501 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
502 		return (EINVAL);
503 
	/* Allocate a 512 GiB (2^39 byte) guest physical address space. */
504 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
505 	if (vmspace == NULL)
506 		return (ENOMEM);
507 
508 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
509 	strcpy(vm->name, name);
510 	vm->vmspace = vmspace;
511 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
512 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
513 
514 	vm->sockets = 1;
515 	vm->cores = 1;			/* XXX backwards compatibility */
516 	vm->threads = 1;		/* XXX backwards compatibility */
517 	vm->maxcpus = vm_maxcpu;
518 
519 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
520 	    M_WAITOK | M_ZERO);
521 
522 	vm_init(vm, true);
523 
524 	*retvm = vm;
525 	return (0);
526 }
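
/*
 * A typical consumer sequence, as driven by the vmm device ioctls
 * (sketch only; locking and error handling omitted):
 *
 *	struct vm *vm;
 *	struct vcpu *vcpu;
 *
 *	vm_create("guest0", &vm);
 *	vcpu = vm_alloc_vcpu(vm, 0);
 *	vm_activate_cpu(vcpu);
 *	vm_run(vcpu);		(returns when userspace must handle an exit)
 *	vm_destroy(vm);
 */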
527 
528 void
529 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
530     uint16_t *threads, uint16_t *maxcpus)
531 {
532 	*sockets = vm->sockets;
533 	*cores = vm->cores;
534 	*threads = vm->threads;
535 	*maxcpus = vm->maxcpus;
536 }
537 
538 uint16_t
539 vm_get_maxcpus(struct vm *vm)
540 {
541 	return (vm->maxcpus);
542 }
543 
544 int
545 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
546     uint16_t threads, uint16_t maxcpus)
547 {
548 	/* Ignore maxcpus; it is fixed to vm_maxcpu when the VM is created. */
549 	if ((sockets * cores * threads) > vm->maxcpus)
550 		return (EINVAL);
551 	vm->sockets = sockets;
552 	vm->cores = cores;
553 	vm->threads = threads;
554 	return (0);
555 }
556 
557 static void
558 vm_cleanup(struct vm *vm, bool destroy)
559 {
560 	struct mem_map *mm;
561 	pmap_t pmap __diagused;
562 	int i;
563 
564 	if (destroy) {
565 		pmap = vmspace_pmap(vm->vmspace);
566 		sched_pin();
567 		PCPU_SET(curvmpmap, NULL);
568 		sched_unpin();
569 		CPU_FOREACH(i) {
570 			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
571 		}
572 	}
573 
574 	vgic_detach_from_vm(vm->cookie);
575 
576 	for (i = 0; i < vm->maxcpus; i++) {
577 		if (vm->vcpu[i] != NULL)
578 			vcpu_cleanup(vm->vcpu[i], destroy);
579 	}
580 
581 	vmmops_cleanup(vm->cookie);
582 
583 	/*
584 	 * System memory is removed from the guest address space only when
585 	 * the VM is destroyed. This is because the mapping remains the same
586 	 * across VM reset.
587 	 *
588 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
589 	 * so those mappings are removed on a VM reset.
590 	 */
591 	if (!destroy) {
592 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
593 			mm = &vm->mem_maps[i];
594 			if (!sysmem_mapping(vm, mm))
595 				vm_free_memmap(vm, i);
596 		}
597 	}
598 
599 	if (destroy) {
600 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
601 			vm_free_memseg(vm, i);
602 
603 		vmmops_vmspace_free(vm->vmspace);
604 		vm->vmspace = NULL;
605 
606 		for (i = 0; i < vm->maxcpus; i++)
607 			free(vm->vcpu[i], M_VMM);
608 		free(vm->vcpu, M_VMM);
609 		sx_destroy(&vm->vcpus_init_lock);
610 		sx_destroy(&vm->mem_segs_lock);
611 	}
612 }
613 
614 void
615 vm_destroy(struct vm *vm)
616 {
617 	vm_cleanup(vm, true);
618 	free(vm, M_VMM);
619 }
620 
621 int
622 vm_reinit(struct vm *vm)
623 {
624 	int error;
625 
626 	/*
627 	 * A virtual machine can be reset only if all vcpus are suspended.
628 	 */
629 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
630 		vm_cleanup(vm, false);
631 		vm_init(vm, false);
632 		error = 0;
633 	} else {
634 		error = EBUSY;
635 	}
636 
637 	return (error);
638 }
639 
640 const char *
641 vm_name(struct vm *vm)
642 {
643 	return (vm->name);
644 }
645 
646 void
647 vm_slock_memsegs(struct vm *vm)
648 {
649 	sx_slock(&vm->mem_segs_lock);
650 }
651 
652 void
653 vm_xlock_memsegs(struct vm *vm)
654 {
655 	sx_xlock(&vm->mem_segs_lock);
656 }
657 
658 void
659 vm_unlock_memsegs(struct vm *vm)
660 {
661 	sx_unlock(&vm->mem_segs_lock);
662 }
663 
664 /*
665  * Return 'true' if 'gpa' is allocated in the guest address space.
666  *
667  * This function is called in the context of a running vcpu which acts as
668  * an implicit lock on 'vm->mem_maps[]'.
669  */
670 bool
671 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
672 {
673 	struct vm *vm = vcpu->vm;
674 	struct mem_map *mm;
675 	int i;
676 
677 #ifdef INVARIANTS
678 	int hostcpu, state;
679 	state = vcpu_get_state(vcpu, &hostcpu);
680 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
681 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
682 #endif
683 
684 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
685 		mm = &vm->mem_maps[i];
686 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
687 			return (true);		/* 'gpa' is sysmem or devmem */
688 	}
689 
690 	return (false);
691 }
692 
693 int
694 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
695 {
696 	struct mem_seg *seg;
697 	vm_object_t obj;
698 
699 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
700 
701 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
702 		return (EINVAL);
703 
704 	if (len == 0 || (len & PAGE_MASK))
705 		return (EINVAL);
706 
707 	seg = &vm->mem_segs[ident];
708 	if (seg->object != NULL) {
709 		if (seg->len == len && seg->sysmem == sysmem)
710 			return (EEXIST);
711 		else
712 			return (EINVAL);
713 	}
714 
715 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
716 	if (obj == NULL)
717 		return (ENOMEM);
718 
719 	seg->len = len;
720 	seg->object = obj;
721 	seg->sysmem = sysmem;
722 	return (0);
723 }
724 
725 int
726 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
727     vm_object_t *objptr)
728 {
729 	struct mem_seg *seg;
730 
731 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
732 
733 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
734 		return (EINVAL);
735 
736 	seg = &vm->mem_segs[ident];
737 	if (len)
738 		*len = seg->len;
739 	if (sysmem)
740 		*sysmem = seg->sysmem;
741 	if (objptr)
742 		*objptr = seg->object;
743 	return (0);
744 }
745 
746 void
747 vm_free_memseg(struct vm *vm, int ident)
748 {
749 	struct mem_seg *seg;
750 
751 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
752 	    ("%s: invalid memseg ident %d", __func__, ident));
753 
754 	seg = &vm->mem_segs[ident];
755 	if (seg->object != NULL) {
756 		vm_object_deallocate(seg->object);
757 		bzero(seg, sizeof(struct mem_seg));
758 	}
759 }
760 
761 int
762 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
763     size_t len, int prot, int flags)
764 {
765 	struct mem_seg *seg;
766 	struct mem_map *m, *map;
767 	vm_ooffset_t last;
768 	int i, error;
769 
770 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
771 		return (EINVAL);
772 
773 	if (flags & ~VM_MEMMAP_F_WIRED)
774 		return (EINVAL);
775 
776 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
777 		return (EINVAL);
778 
779 	seg = &vm->mem_segs[segid];
780 	if (seg->object == NULL)
781 		return (EINVAL);
782 
783 	last = first + len;
784 	if (first < 0 || first >= last || last > seg->len)
785 		return (EINVAL);
786 
787 	if ((gpa | first | last) & PAGE_MASK)
788 		return (EINVAL);
789 
790 	map = NULL;
791 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
792 		m = &vm->mem_maps[i];
793 		if (m->len == 0) {
794 			map = m;
795 			break;
796 		}
797 	}
798 
799 	if (map == NULL)
800 		return (ENOSPC);
801 
802 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
803 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
804 	if (error != KERN_SUCCESS)
805 		return (EFAULT);
806 
807 	vm_object_reference(seg->object);
808 
809 	if (flags & VM_MEMMAP_F_WIRED) {
810 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
811 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
812 		if (error != KERN_SUCCESS) {
813 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
814 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
815 			    EFAULT);
816 		}
817 	}
818 
819 	map->gpa = gpa;
820 	map->len = len;
821 	map->segoff = first;
822 	map->segid = segid;
823 	map->prot = prot;
824 	map->flags = flags;
825 	return (0);
826 }
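
/*
 * Guest RAM is set up in two steps: allocate a backing segment, then map
 * (part of) it into the guest physical address space. A minimal sketch,
 * where the 1 GiB (0x40000000) size and base address are illustrative
 * values only:
 *
 *	vm_xlock_memsegs(vm);
 *	error = vm_alloc_memseg(vm, 0, 0x40000000, true);
 *	vm_unlock_memsegs(vm);
 *	if (error == 0)
 *		error = vm_mmap_memseg(vm, 0x40000000, 0, 0, 0x40000000,
 *		    VM_PROT_ALL, VM_MEMMAP_F_WIRED);
 */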
827 
828 int
829 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
830 {
831 	struct mem_map *m;
832 	int i;
833 
834 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
835 		m = &vm->mem_maps[i];
836 		if (m->gpa == gpa && m->len == len) {
837 			vm_free_memmap(vm, i);
838 			return (0);
839 		}
840 	}
841 
842 	return (EINVAL);
843 }
844 
845 int
846 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
847     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
848 {
849 	struct mem_map *mm, *mmnext;
850 	int i;
851 
852 	mmnext = NULL;
853 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
854 		mm = &vm->mem_maps[i];
855 		if (mm->len == 0 || mm->gpa < *gpa)
856 			continue;
857 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
858 			mmnext = mm;
859 	}
860 
861 	if (mmnext != NULL) {
862 		*gpa = mmnext->gpa;
863 		if (segid)
864 			*segid = mmnext->segid;
865 		if (segoff)
866 			*segoff = mmnext->segoff;
867 		if (len)
868 			*len = mmnext->len;
869 		if (prot)
870 			*prot = mmnext->prot;
871 		if (flags)
872 			*flags = mmnext->flags;
873 		return (0);
874 	} else {
875 		return (ENOENT);
876 	}
877 }
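
/*
 * vm_mmap_getnext() is a cursor-style iterator: it reports the mapping
 * with the lowest guest physical address that is >= *gpa. Walking every
 * mapping therefore looks like this (sketch):
 *
 *	vm_paddr_t gpa = 0;
 *	size_t len;
 *
 *	while (vm_mmap_getnext(vm, &gpa, NULL, NULL, &len, NULL,
 *	    NULL) == 0) {
 *		(examine [gpa, gpa + len))
 *		gpa += len;
 *	}
 */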
878 
879 static void
880 vm_free_memmap(struct vm *vm, int ident)
881 {
882 	struct mem_map *mm;
883 	int error __diagused;
884 
885 	mm = &vm->mem_maps[ident];
886 	if (mm->len) {
887 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
888 		    mm->gpa + mm->len);
889 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
890 		    __func__, error));
891 		bzero(mm, sizeof(struct mem_map));
892 	}
893 }
894 
895 static __inline bool
896 sysmem_mapping(struct vm *vm, struct mem_map *mm)
897 {
898 
899 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
900 		return (true);
901 	else
902 		return (false);
903 }
904 
905 vm_paddr_t
906 vmm_sysmem_maxaddr(struct vm *vm)
907 {
908 	struct mem_map *mm;
909 	vm_paddr_t maxaddr;
910 	int i;
911 
912 	maxaddr = 0;
913 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
914 		mm = &vm->mem_maps[i];
915 		if (sysmem_mapping(vm, mm)) {
916 			if (maxaddr < mm->gpa + mm->len)
917 				maxaddr = mm->gpa + mm->len;
918 		}
919 	}
920 	return (maxaddr);
921 }
922 
923 int
924 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
925     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
926 {
927 
928 	vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
929 	return (0);
930 }
931 
932 static int
933 vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
934 {
935 	*rval = 0;
936 	return (0);
937 }
938 
939 static int
940 vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
941 {
942 	*rval = *(uint64_t *)arg;
943 	return (0);
944 }
945 
946 static int
947 vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
948 {
949 	return (0);
950 }
951 
952 static const struct vmm_special_reg vmm_special_regs[] = {
953 #define	SPECIAL_REG(_reg, _read, _write)				\
954 	{								\
955 		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
956 		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
957 		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
958 		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
959 		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
960 		.esr_mask = ISS_MSR_REG_MASK,				\
961 		.reg_read = (_read),					\
962 		.reg_write = (_write),					\
963 		.arg = NULL,						\
964 	}
965 #define	ID_SPECIAL_REG(_reg, _name)					\
966 	{								\
967 		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
968 		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
969 		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
970 		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
971 		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
972 		.esr_mask = ISS_MSR_REG_MASK,				\
973 		.reg_read = vmm_reg_read_arg,				\
974 		.reg_write = vmm_reg_wi,				\
975 		.arg = &(vmm_arch_regs._name),				\
976 	}
977 
978 	/* ID registers */
979 	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
980 	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
981 	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
982 	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
983 	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),
984 
985 	/*
986 	 * All other ID registers are read as zero.
987 	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
988 	 */
989 	{
990 		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
991 		    (0 << ISS_MSR_OP1_SHIFT) |
992 		    (0 << ISS_MSR_CRn_SHIFT) |
993 		    (0 << ISS_MSR_CRm_SHIFT),
994 		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
995 		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
996 		.reg_read = vmm_reg_raz,
997 		.reg_write = vmm_reg_wi,
998 		.arg = NULL,
999 	},
1000 
1001 	/* Counter physical registers */
1002 	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
1003 	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
1004 	    vtimer_phys_cval_write),
1005 	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
1006 	    vtimer_phys_tval_write),
1007 	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
1008 #undef SPECIAL_REG
1009 };
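
/*
 * A trapped MSR/MRS access matches a table entry when
 * (ISS & esr_mask) == esr_iss; see vm_handle_reg_emul() below. The
 * catch-all entry above relies on this: its mask covers op0, op1, CRn
 * and only bit 3 of CRm, so it matches op0=3, op1=0, CRn=0, CRm={0..7},
 * i.e. the ID register space, while the specific ID register entries
 * placed before it take precedence because the table is searched in
 * order.
 */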
1010 
1011 void
1012 vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
1013     reg_read_t reg_read, reg_write_t reg_write, void *arg)
1014 {
1015 	int i;
1016 
1017 	for (i = 0; i < nitems(vm->special_reg); i++) {
1018 		if (vm->special_reg[i].esr_iss == 0 &&
1019 		    vm->special_reg[i].esr_mask == 0) {
1020 			vm->special_reg[i].esr_iss = iss;
1021 			vm->special_reg[i].esr_mask = mask;
1022 			vm->special_reg[i].reg_read = reg_read;
1023 			vm->special_reg[i].reg_write = reg_write;
1024 			vm->special_reg[i].arg = arg;
1025 			return;
1026 		}
1027 	}
1028 
1029 	panic("%s: No free special register slot", __func__);
1030 }
1031 
1032 void
1033 vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
1034 {
1035 	int i;
1036 
1037 	for (i = 0; i < nitems(vm->special_reg); i++) {
1038 		if (vm->special_reg[i].esr_iss == iss &&
1039 		    vm->special_reg[i].esr_mask == mask) {
1040 			memset(&vm->special_reg[i], 0,
1041 			    sizeof(vm->special_reg[i]));
1042 			return;
1043 		}
1044 	}
1045 
1046 	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
1047 	    mask);
1048 }
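
/*
 * Emulated devices claim system register traps with the two functions
 * above, typically bracketing attach and detach. A hypothetical device
 * claiming a single register encoding would do the following (sketch;
 * "iss" is built from the ISS_MSR_*_SHIFT macros and "my_read",
 * "my_write" and "softc" are placeholder names):
 *
 *	vm_register_reg_handler(vm, iss, ISS_MSR_REG_MASK,
 *	    my_read, my_write, softc);
 *	...
 *	vm_deregister_reg_handler(vm, iss, ISS_MSR_REG_MASK);
 */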
1049 
1050 static int
1051 vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
1052 {
1053 	struct vm *vm;
1054 	struct vm_exit *vme;
1055 	struct vre *vre;
1056 	int i, rv;
1057 
1058 	vm = vcpu->vm;
1059 	vme = &vcpu->exitinfo;
1060 	vre = &vme->u.reg_emul.vre;
1061 
1062 	for (i = 0; i < nitems(vm->special_reg); i++) {
1063 		if (vm->special_reg[i].esr_iss == 0 &&
1064 		    vm->special_reg[i].esr_mask == 0)
1065 			continue;
1066 
1067 		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
1068 		    vm->special_reg[i].esr_iss) {
1069 			rv = vmm_emulate_register(vcpu, vre,
1070 			    vm->special_reg[i].reg_read,
1071 			    vm->special_reg[i].reg_write,
1072 			    vm->special_reg[i].arg);
1073 			if (rv == 0) {
1074 				*retu = false;
1075 			}
1076 			return (rv);
1077 		}
1078 	}
1079 	for (i = 0; i < nitems(vmm_special_regs); i++) {
1080 		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
1081 		    vmm_special_regs[i].esr_iss) {
1082 			rv = vmm_emulate_register(vcpu, vre,
1083 			    vmm_special_regs[i].reg_read,
1084 			    vmm_special_regs[i].reg_write,
1085 			    vmm_special_regs[i].arg);
1086 			if (rv == 0) {
1087 				*retu = false;
1088 			}
1089 			return (rv);
1090 		}
1091 	}
1092 
1093 
1094 	*retu = true;
1095 	return (0);
1096 }
1097 
1098 void
1099 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
1100     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
1101 {
1102 	int i;
1103 
1104 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1105 		if (vm->mmio_region[i].start == 0 &&
1106 		    vm->mmio_region[i].end == 0) {
1107 			vm->mmio_region[i].start = start;
1108 			vm->mmio_region[i].end = start + size;
1109 			vm->mmio_region[i].read = mmio_read;
1110 			vm->mmio_region[i].write = mmio_write;
1111 			return;
1112 		}
1113 	}
1114 
1115 	panic("%s: No free MMIO region", __func__);
1116 }
1117 
1118 void
1119 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
1120 {
1121 	int i;
1122 
1123 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1124 		if (vm->mmio_region[i].start == start &&
1125 		    vm->mmio_region[i].end == start + size) {
1126 			memset(&vm->mmio_region[i], 0,
1127 			    sizeof(vm->mmio_region[i]));
1128 			return;
1129 		}
1130 	}
1131 
1132 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
1133 	    start + size);
1134 }
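
/*
 * MMIO emulation uses the same claim/release pattern for guest physical
 * address ranges (sketch; "base", "my_mmio_read" and "my_mmio_write" are
 * placeholder names):
 *
 *	vm_register_inst_handler(vm, base, PAGE_SIZE,
 *	    my_mmio_read, my_mmio_write);
 *
 * After this, a guest access faulting inside [base, base + PAGE_SIZE) is
 * routed to those callbacks by vm_handle_inst_emul() below.
 */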
1135 
1136 static int
1137 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
1138 {
1139 	struct vm *vm;
1140 	struct vm_exit *vme;
1141 	struct vie *vie;
1142 	struct hyp *hyp;
1143 	uint64_t fault_ipa;
1144 	struct vm_guest_paging *paging;
1145 	struct vmm_mmio_region *vmr;
1146 	int error, i;
1147 
1148 	vm = vcpu->vm;
1149 	hyp = vm->cookie;
1150 	if (!hyp->vgic_attached)
1151 		goto out_user;
1152 
1153 	vme = &vcpu->exitinfo;
1154 	vie = &vme->u.inst_emul.vie;
1155 	paging = &vme->u.inst_emul.paging;
1156 
1157 	fault_ipa = vme->u.inst_emul.gpa;
1158 
1159 	vmr = NULL;
1160 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1161 		if (vm->mmio_region[i].start <= fault_ipa &&
1162 		    vm->mmio_region[i].end > fault_ipa) {
1163 			vmr = &vm->mmio_region[i];
1164 			break;
1165 		}
1166 	}
1167 	if (vmr == NULL)
1168 		goto out_user;
1169 
1170 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
1171 	    vmr->read, vmr->write, retu);
1172 	return (error);
1173 
1174 out_user:
1175 	*retu = true;
1176 	return (0);
1177 }
1178 
1179 int
1180 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1181 {
1182 	int i;
1183 
1184 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1185 		return (EINVAL);
1186 
1187 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1188 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1189 		    vm->suspend, how);
1190 		return (EALREADY);
1191 	}
1192 
1193 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1194 
1195 	/*
1196 	 * Notify all active vcpus that they are now suspended.
1197 	 */
1198 	for (i = 0; i < vm->maxcpus; i++) {
1199 		if (CPU_ISSET(i, &vm->active_cpus))
1200 			vcpu_notify_event(vm_vcpu(vm, i));
1201 	}
1202 
1203 	return (0);
1204 }
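
/*
 * A suspend request propagates in three steps: vm_suspend() latches the
 * reason in vm->suspend and notifies every active vcpu; each vcpu leaves
 * the guest (vm->suspend is polled via evinfo.sptr in vm_run()) and
 * reports a VM_EXITCODE_SUSPENDED exit through vm_exit_suspended();
 * vm_handle_suspend() then parks each vcpu until all active vcpus have
 * arrived before returning to userspace.
 */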
1205 
1206 void
1207 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
1208 {
1209 	struct vm *vm = vcpu->vm;
1210 	struct vm_exit *vmexit;
1211 
1212 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1213 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1214 
1215 	vmexit = vm_exitinfo(vcpu);
1216 	vmexit->pc = pc;
1217 	vmexit->inst_length = 4;
1218 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1219 	vmexit->u.suspended.how = vm->suspend;
1220 }
1221 
1222 void
1223 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
1224 {
1225 	struct vm_exit *vmexit;
1226 
1227 	vmexit = vm_exitinfo(vcpu);
1228 	vmexit->pc = pc;
1229 	vmexit->inst_length = 4;
1230 	vmexit->exitcode = VM_EXITCODE_DEBUG;
1231 }
1232 
1233 int
1234 vm_activate_cpu(struct vcpu *vcpu)
1235 {
1236 	struct vm *vm = vcpu->vm;
1237 
1238 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1239 		return (EBUSY);
1240 
1241 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
1242 	return (0);
1243 
1244 }
1245 
1246 int
1247 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
1248 {
1249 	if (vcpu == NULL) {
1250 		vm->debug_cpus = vm->active_cpus;
1251 		for (int i = 0; i < vm->maxcpus; i++) {
1252 			if (CPU_ISSET(i, &vm->active_cpus))
1253 				vcpu_notify_event(vm_vcpu(vm, i));
1254 		}
1255 	} else {
1256 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1257 			return (EINVAL);
1258 
1259 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1260 		vcpu_notify_event(vcpu);
1261 	}
1262 	return (0);
1263 }
1264 
1265 int
1266 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
1267 {
1268 
1269 	if (vcpu == NULL) {
1270 		CPU_ZERO(&vm->debug_cpus);
1271 	} else {
1272 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
1273 			return (EINVAL);
1274 
1275 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1276 	}
1277 	return (0);
1278 }
1279 
1280 int
1281 vcpu_debugged(struct vcpu *vcpu)
1282 {
1283 
1284 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1285 }
1286 
1287 cpuset_t
1288 vm_active_cpus(struct vm *vm)
1289 {
1290 
1291 	return (vm->active_cpus);
1292 }
1293 
1294 cpuset_t
1295 vm_debug_cpus(struct vm *vm)
1296 {
1297 
1298 	return (vm->debug_cpus);
1299 }
1300 
1301 cpuset_t
1302 vm_suspended_cpus(struct vm *vm)
1303 {
1304 
1305 	return (vm->suspended_cpus);
1306 }
1307 
1308 
1309 void *
1310 vcpu_stats(struct vcpu *vcpu)
1311 {
1312 
1313 	return (vcpu->stats);
1314 }
1315 
1316 /*
1317  * This function is called to ensure that a vcpu "sees" a pending event
1318  * as soon as possible:
1319  * - If the vcpu thread is sleeping then it is woken up.
1320  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1321  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1322  */
1323 static void
1324 vcpu_notify_event_locked(struct vcpu *vcpu)
1325 {
1326 	int hostcpu;
1327 
1328 	hostcpu = vcpu->hostcpu;
1329 	if (vcpu->state == VCPU_RUNNING) {
1330 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1331 		if (hostcpu != curcpu) {
1332 			ipi_cpu(hostcpu, vmm_ipinum);
1333 		} else {
1334 			/*
1335 			 * If the 'vcpu' is running on 'curcpu' then it must
1336 			 * be sending a notification to itself (i.e. a self-IPI).
1337 			 * The pending event will be picked up when the vcpu
1338 			 * transitions back to guest context.
1339 			 */
1340 		}
1341 	} else {
1342 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1343 		    "with hostcpu %d", vcpu->state, hostcpu));
1344 		if (vcpu->state == VCPU_SLEEPING)
1345 			wakeup_one(vcpu);
1346 	}
1347 }
1348 
1349 void
1350 vcpu_notify_event(struct vcpu *vcpu)
1351 {
1352 	vcpu_lock(vcpu);
1353 	vcpu_notify_event_locked(vcpu);
1354 	vcpu_unlock(vcpu);
1355 }
1356 
1357 static void
1358 restore_guest_fpustate(struct vcpu *vcpu)
1359 {
1360 
1361 	/* flush host state to the pcb */
1362 	vfp_save_state(curthread, curthread->td_pcb);
1363 	/* Ensure the VFP state will be re-loaded when exiting the guest */
1364 	PCPU_SET(fpcurthread, NULL);
1365 
1366 	/* restore guest FPU state */
1367 	vfp_enable();
1368 	vfp_restore(vcpu->guestfpu);
1369 
1370 	/*
1371 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1372 	 * to trap any access to the FPU by the host.
1373 	 */
1374 	vfp_disable();
1375 }
1376 
1377 static void
1378 save_guest_fpustate(struct vcpu *vcpu)
1379 {
1380 	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
1381 	    CPACR_FPEN_TRAP_ALL1)
1382 		panic("VFP not enabled in host!");
1383 
1384 	/* save guest FPU state */
1385 	vfp_enable();
1386 	vfp_store(vcpu->guestfpu);
1387 	vfp_disable();
1388 
1389 	KASSERT(PCPU_GET(fpcurthread) == NULL,
1390 	    ("%s: fpcurthread set with guest registers", __func__));
1391 }

1392 static int
1393 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1394     bool from_idle)
1395 {
1396 	int error;
1397 
1398 	vcpu_assert_locked(vcpu);
1399 
1400 	/*
1401 	 * State transitions from the vmmdev_ioctl() must always begin from
1402 	 * the VCPU_IDLE state. This guarantees that there is only a single
1403 	 * ioctl() operating on a vcpu at any point.
1404 	 */
1405 	if (from_idle) {
1406 		while (vcpu->state != VCPU_IDLE) {
1407 			vcpu_notify_event_locked(vcpu);
1408 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1409 		}
1410 	} else {
1411 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1412 		    "vcpu idle state"));
1413 	}
1414 
1415 	if (vcpu->state == VCPU_RUNNING) {
1416 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1417 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1418 	} else {
1419 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1420 		    "vcpu that is not running", vcpu->hostcpu));
1421 	}
1422 
1423 	/*
1424 	 * The following state transitions are allowed:
1425 	 * IDLE -> FROZEN -> IDLE
1426 	 * FROZEN -> RUNNING -> FROZEN
1427 	 * FROZEN -> SLEEPING -> FROZEN
1428 	 */
1429 	switch (vcpu->state) {
1430 	case VCPU_IDLE:
1431 	case VCPU_RUNNING:
1432 	case VCPU_SLEEPING:
1433 		error = (newstate != VCPU_FROZEN);
1434 		break;
1435 	case VCPU_FROZEN:
1436 		error = (newstate == VCPU_FROZEN);
1437 		break;
1438 	default:
1439 		error = 1;
1440 		break;
1441 	}
1442 
1443 	if (error)
1444 		return (EBUSY);
1445 
1446 	vcpu->state = newstate;
1447 	if (newstate == VCPU_RUNNING)
1448 		vcpu->hostcpu = curcpu;
1449 	else
1450 		vcpu->hostcpu = NOCPU;
1451 
1452 	if (newstate == VCPU_IDLE)
1453 		wakeup(&vcpu->state);
1454 
1455 	return (0);
1456 }
1457 
1458 static void
1459 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1460 {
1461 	int error;
1462 
1463 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1464 		panic("Error %d setting state to %d", error, newstate);
1465 }
1466 
1467 static void
1468 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1469 {
1470 	int error;
1471 
1472 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1473 		panic("Error %d setting state to %d", error, newstate);
1474 }
1475 
1476 int
1477 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1478 {
1479 	if (type < 0 || type >= VM_CAP_MAX)
1480 		return (EINVAL);
1481 
1482 	return (vmmops_getcap(vcpu->cookie, type, retval));
1483 }
1484 
1485 int
1486 vm_set_capability(struct vcpu *vcpu, int type, int val)
1487 {
1488 	if (type < 0 || type >= VM_CAP_MAX)
1489 		return (EINVAL);
1490 
1491 	return (vmmops_setcap(vcpu->cookie, type, val));
1492 }
1493 
1494 struct vm *
1495 vcpu_vm(struct vcpu *vcpu)
1496 {
1497 	return (vcpu->vm);
1498 }
1499 
1500 int
1501 vcpu_vcpuid(struct vcpu *vcpu)
1502 {
1503 	return (vcpu->vcpuid);
1504 }
1505 
1506 void *
1507 vcpu_get_cookie(struct vcpu *vcpu)
1508 {
1509 	return (vcpu->cookie);
1510 }
1511 
1512 struct vcpu *
1513 vm_vcpu(struct vm *vm, int vcpuid)
1514 {
1515 	return (vm->vcpu[vcpuid]);
1516 }
1517 
1518 int
1519 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1520 {
1521 	int error;
1522 
1523 	vcpu_lock(vcpu);
1524 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1525 	vcpu_unlock(vcpu);
1526 
1527 	return (error);
1528 }
1529 
1530 enum vcpu_state
1531 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1532 {
1533 	enum vcpu_state state;
1534 
1535 	vcpu_lock(vcpu);
1536 	state = vcpu->state;
1537 	if (hostcpu != NULL)
1538 		*hostcpu = vcpu->hostcpu;
1539 	vcpu_unlock(vcpu);
1540 
1541 	return (state);
1542 }
1543 
1544 static void *
1545 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1546     void **cookie)
1547 {
1548 	int i, count, pageoff;
1549 	struct mem_map *mm;
1550 	vm_page_t m;
1551 
1552 	pageoff = gpa & PAGE_MASK;
1553 	if (len > PAGE_SIZE - pageoff)
1554 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1555 
1556 	count = 0;
1557 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1558 		mm = &vm->mem_maps[i];
1559 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1560 		    gpa < mm->gpa + mm->len) {
1561 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1562 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1563 			break;
1564 		}
1565 	}
1566 
1567 	if (count == 1) {
1568 		*cookie = m;
1569 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1570 	} else {
1571 		*cookie = NULL;
1572 		return (NULL);
1573 	}
1574 }
1575 
1576 void *
1577 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1578 	    void **cookie)
1579 {
1580 #ifdef INVARIANTS
1581 	/*
1582 	 * The current vcpu should be frozen to ensure 'vm->mem_maps[]'
1583 	 * stability.
1584 	 */
1585 	int state = vcpu_get_state(vcpu, NULL);
1586 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1587 	    __func__, state));
1588 #endif
1589 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1590 }
1591 
1592 void *
1593 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1594     void **cookie)
1595 {
1596 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1597 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1598 }
1599 
1600 void
1601 vm_gpa_release(void *cookie)
1602 {
1603 	vm_page_t m = cookie;
1604 
1605 	vm_page_unwire(m, PQ_ACTIVE);
1606 }
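
/*
 * The hold/release pair brackets transient host access to guest memory.
 * A sketch of reading four bytes from a guest physical address (within a
 * single page, on a frozen vcpu; error handling omitted):
 *
 *	uint32_t word;
 *	void *cookie, *hva;
 *
 *	hva = vm_gpa_hold(vcpu, gpa, sizeof(word), VM_PROT_READ, &cookie);
 *	if (hva != NULL) {
 *		memcpy(&word, hva, sizeof(word));
 *		vm_gpa_release(cookie);
 *	}
 */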
1607 
1608 int
1609 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1610 {
1611 
1612 	if (reg >= VM_REG_LAST)
1613 		return (EINVAL);
1614 
1615 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1616 }
1617 
1618 int
1619 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1620 {
1621 	int error;
1622 
1623 	if (reg >= VM_REG_LAST)
1624 		return (EINVAL);
1625 	error = vmmops_setreg(vcpu->cookie, reg, val);
1626 	if (error || reg != VM_REG_GUEST_PC)
1627 		return (error);
1628 
1629 	vcpu->nextpc = val;
1630 
1631 	return (0);
1632 }
1633 
1634 void *
1635 vm_get_cookie(struct vm *vm)
1636 {
1637 	return (vm->cookie);
1638 }
1639 
1640 int
1641 vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
1642 {
1643 	return (vmmops_exception(vcpu->cookie, esr, far));
1644 }
1645 
1646 int
1647 vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
1648 {
1649 	return (vgic_attach_to_vm(vm->cookie, descr));
1650 }
1651 
1652 int
1653 vm_assert_irq(struct vm *vm, uint32_t irq)
1654 {
1655 	return (vgic_inject_irq(vm->cookie, -1, irq, true));
1656 }
1657 
1658 int
1659 vm_deassert_irq(struct vm *vm, uint32_t irq)
1660 {
1661 	return (vgic_inject_irq(vm->cookie, -1, irq, false));
1662 }
1663 
1664 int
1665 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1666     int func)
1667 {
1668 	/* TODO: Should we raise an SError? */
1669 	return (vgic_inject_msi(vm->cookie, msg, addr));
1670 }
1671 
1672 static int
1673 vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1674 {
1675 	struct hypctx *hypctx;
1676 	int i;
1677 
1678 	hypctx = vcpu_get_cookie(vcpu);
1679 
1680 	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
1681 		return (1);
1682 
1683 	vme->exitcode = VM_EXITCODE_SMCCC;
1684 	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
1685 	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
1686 		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
1687 
1688 	*retu = true;
1689 	return (0);
1690 }
1691 
1692 static int
1693 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1694 {
1695 	vcpu_lock(vcpu);
1696 	while (1) {
1697 		if (vgic_has_pending_irq(vcpu->cookie))
1698 			break;
1699 
1700 		if (vcpu_should_yield(vcpu))
1701 			break;
1702 
1703 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1704 		/*
1705 		 * XXX msleep_spin() cannot be interrupted by signals so
1706 		 * wake up periodically to check pending signals.
1707 		 */
1708 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
1709 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1710 	}
1711 	vcpu_unlock(vcpu);
1712 
1713 	*retu = false;
1714 	return (0);
1715 }
1716 
1717 static int
1718 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1719 {
1720 	struct vm *vm = vcpu->vm;
1721 	struct vm_exit *vme;
1722 	struct vm_map *map;
1723 	uint64_t addr, esr;
1724 	pmap_t pmap;
1725 	int ftype, rv;
1726 
1727 	vme = &vcpu->exitinfo;
1728 
1729 	pmap = vmspace_pmap(vcpu->vm->vmspace);
1730 	addr = vme->u.paging.gpa;
1731 	esr = vme->u.paging.esr;
1732 
1733 	/* The page exists, but the page table needs to be updated. */
1734 	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
1735 		return (0);
1736 
1737 	switch (ESR_ELx_EXCEPTION(esr)) {
1738 	case EXCP_INSN_ABORT_L:
1739 	case EXCP_DATA_ABORT_L:
1740 		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
1741 		break;
1742 	default:
1743 		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
1744 	}
1745 
1746 	map = &vm->vmspace->vm_map;
1747 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
1748 	if (rv != KERN_SUCCESS)
1749 		return (EFAULT);
1750 
1751 	return (0);
1752 }
1753 
1754 static int
1755 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1756 {
1757 	struct vm *vm = vcpu->vm;
1758 	int error, i;
1759 	struct thread *td;
1760 
1761 	error = 0;
1762 	td = curthread;
1763 
1764 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1765 
1766 	/*
1767 	 * Wait until all 'active_cpus' have suspended themselves.
1768 	 *
1769 	 * While waiting, periodically check for a pending thread-suspension
1770 	 * request and honour it, so that stopping the hypervisor process
1771 	 * cannot deadlock against vcpus sleeping here.
1772 	 */
1773 	vcpu_lock(vcpu);
1774 	while (error == 0) {
1775 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
1776 			break;
1777 
1778 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1779 		msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1780 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1781 		if (td_ast_pending(td, TDA_SUSPEND)) {
1782 			vcpu_unlock(vcpu);
1783 			error = thread_check_susp(td, false);
1784 			vcpu_lock(vcpu);
1785 		}
1786 	}
1787 	vcpu_unlock(vcpu);
1788 
1789 	/*
1790 	 * Wakeup the other sleeping vcpus and return to userspace.
1791 	 */
1792 	for (i = 0; i < vm->maxcpus; i++) {
1793 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1794 			vcpu_notify_event(vm_vcpu(vm, i));
1795 		}
1796 	}
1797 
1798 	*retu = true;
1799 	return (error);
1800 }
1801 
1802 int
1803 vm_run(struct vcpu *vcpu)
1804 {
1805 	struct vm *vm = vcpu->vm;
1806 	struct vm_eventinfo evinfo;
1807 	int error, vcpuid;
1808 	struct vm_exit *vme;
1809 	bool retu;
1810 	pmap_t pmap;
1811 
1812 	vcpuid = vcpu->vcpuid;
1813 
1814 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1815 		return (EINVAL);
1816 
1817 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1818 		return (EINVAL);
1819 
1820 	pmap = vmspace_pmap(vm->vmspace);
1821 	vme = &vcpu->exitinfo;
1822 	evinfo.rptr = NULL;
1823 	evinfo.sptr = &vm->suspend;
1824 	evinfo.iptr = NULL;
1825 restart:
1826 	critical_enter();
1827 
1828 	restore_guest_fpustate(vcpu);
1829 
1830 	vcpu_require_state(vcpu, VCPU_RUNNING);
1831 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1832 	vcpu_require_state(vcpu, VCPU_FROZEN);
1833 
1834 	save_guest_fpustate(vcpu);
1835 
1836 	critical_exit();
1837 
1838 	if (error == 0) {
1839 		retu = false;
1840 		switch (vme->exitcode) {
1841 		case VM_EXITCODE_INST_EMUL:
1842 			vcpu->nextpc = vme->pc + vme->inst_length;
1843 			error = vm_handle_inst_emul(vcpu, &retu);
1844 			break;
1845 
1846 		case VM_EXITCODE_REG_EMUL:
1847 			vcpu->nextpc = vme->pc + vme->inst_length;
1848 			error = vm_handle_reg_emul(vcpu, &retu);
1849 			break;
1850 
1851 		case VM_EXITCODE_HVC:
1852 			/*
1853 			 * The HVC instruction saves the address of the
1854 			 * next instruction as the return address.
1855 			 */
1856 			vcpu->nextpc = vme->pc;
1857 			/*
1858 			 * The PSCI call can change the exit information in the
1859 			 * case of suspend/reset/poweroff/cpu off/cpu on.
1860 			 */
1861 			error = vm_handle_smccc_call(vcpu, vme, &retu);
1862 			break;
1863 
1864 		case VM_EXITCODE_WFI:
1865 			vcpu->nextpc = vme->pc + vme->inst_length;
1866 			error = vm_handle_wfi(vcpu, vme, &retu);
1867 			break;
1868 
1869 		case VM_EXITCODE_PAGING:
1870 			vcpu->nextpc = vme->pc;
1871 			error = vm_handle_paging(vcpu, &retu);
1872 			break;
1873 
1874 		case VM_EXITCODE_SUSPENDED:
1875 			vcpu->nextpc = vme->pc;
1876 			error = vm_handle_suspend(vcpu, &retu);
1877 			break;
1878 
1879 		default:
1880 			/* Handle in userland */
1881 			vcpu->nextpc = vme->pc;
1882 			retu = true;
1883 			break;
1884 		}
1885 	}
1886 
1887 	if (error == 0 && retu == false)
1888 		goto restart;
1889 
1890 	return (error);
1891 }
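
/*
 * Note that vm_run() returns to its caller only when the exit must be
 * completed in userspace (retu == true) or when an error occurs, so the
 * hypervisor process effectively runs a loop of roughly this shape
 * around the VM_RUN ioctl (sketch):
 *
 *	while (ioctl(vcpufd, VM_RUN, &vmrun) == 0) {
 *		(decode vmrun's exit reason, emulate it, then loop
 *		 to re-enter the guest)
 *	}
 */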
1892