xref: /freebsd/sys/arm64/vmm/vmm.c (revision 3e15b01d6914c927e37d1699645783acf286655c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/cpuset.h>
32 #include <sys/kernel.h>
33 #include <sys/linker.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/pcpu.h>
39 #include <sys/proc.h>
40 #include <sys/queue.h>
41 #include <sys/rwlock.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 
46 #include <vm/vm.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_page.h>
49 #include <vm/pmap.h>
50 #include <vm/vm_map.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_param.h>
53 
54 #include <machine/armreg.h>
55 #include <machine/cpu.h>
56 #include <machine/fpu.h>
57 #include <machine/machdep.h>
58 #include <machine/pcb.h>
59 #include <machine/smp.h>
60 #include <machine/vm.h>
61 #include <machine/vmparam.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <machine/vmm_instruction_emul.h>
65 
66 #include <dev/pci/pcireg.h>
67 
68 #include "vmm_ktr.h"
69 #include "vmm_stat.h"
70 #include "arm64.h"
71 #include "mmu.h"
72 
73 #include "io/vgic.h"
74 #include "io/vtimer.h"
75 
/*
 * Backend-independent state kept for one guest vcpu.
 *
 * The parenthesised letters on some members are initialization annotations;
 * (o), (i) and (x) are explained in the legend that precedes struct vm.
 * NOTE(review): the '(a)' tag on guestfpu is not covered by that legend --
 * confirm its meaning.
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* vcpu_alloc() starts at VCPU_IDLE */
	struct mtx	mtx;		/* spin mutex behind vcpu_lock() */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	int		vcpuid;		/* slot of this vcpu in vm->vcpu[] */
	void		*stats;		/* obtained from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* handed out by vm_exitinfo() */
	uint64_t	nextpc;		/* (x) next instruction to execute */
	struct vm	*vm;		/* (o) */
	void		*cookie;	/* (i) cpu-specific data */
	struct vfpstate	*guestfpu;	/* (a,i) guest fpu state */
};
89 
/*
 * Thin wrappers around the per-vcpu mutex ('mtx' in struct vcpu).  The mutex
 * is created as a spin mutex (MTX_SPIN), so lock/unlock use the *_spin
 * variants.
 */
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
96 
/*
 * A guest memory segment: a VM object of 'len' bytes that can be mapped into
 * the guest address space by vm_mmap_memseg().  A slot is considered unused
 * while 'object' is NULL.
 */
struct mem_seg {
	uint64_t	gpa;
	size_t		len;		/* segment size in bytes */
	bool		wired;
	bool		sysmem;		/* true for system (not device) memory */
	vm_object_t	object;		/* backing object, NULL when free */
};
/* Number of slots in vm->mem_segs[]. */
#define	VM_MAX_MEMSEGS	3
105 
/*
 * One mapping of (part of) a memory segment into the guest physical address
 * space.  A slot is considered unused while 'len' is zero.
 */
struct mem_map {
	vm_paddr_t	gpa;		/* guest physical start address */
	size_t		len;		/* mapping length, 0 when free */
	vm_ooffset_t	segoff;		/* offset into the backing segment */
	int		segid;		/* index into vm->mem_segs[] */
	int		prot;		/* VM_PROT_* bits */
	int		flags;		/* VM_MEMMAP_F_* bits */
};
/* Number of slots in vm->mem_maps[]. */
#define	VM_MAX_MEMMAPS	4
115 
/*
 * An in-kernel MMIO emulation window covering [start, end).  A slot is
 * treated as free while both 'start' and 'end' are zero.
 */
struct vmm_mmio_region {
	uint64_t start;			/* first guest address (inclusive) */
	uint64_t end;			/* last guest address (exclusive) */
	mem_region_read_t read;		/* handler for guest loads */
	mem_region_write_t write;	/* handler for guest stores */
};
/* Number of slots in vm->mmio_region[]. */
#define	VM_MAX_MMIO_REGIONS	4
123 
/*
 * Describes one emulated system register.  A trapped access matches when
 * (syndrome & esr_mask) == esr_iss; the read/write callbacks are then passed
 * to vmm_emulate_register() together with 'arg'.
 */
struct vmm_special_reg {
	uint32_t	esr_iss;	/* expected syndrome bits */
	uint32_t	esr_mask;	/* syndrome bits that are compared */
	reg_read_t	reg_read;
	reg_write_t	reg_write;
	void		*arg;		/* opaque argument for the callbacks */
};
/* Number of slots in vm->special_reg[]. */
#define	VM_MAX_SPECIAL_REGS	16
132 
/*
 * Per-virtual-machine state.
 *
 * Initialization legend used in the member comments:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	**vcpu;			/* (i) guest vcpus */
	struct vmm_mmio_region mmio_region[VM_MAX_MMIO_REGIONS];
						/* (o) guest MMIO regions */
	/* Per-VM register handlers; searched before vmm_special_regs[]. */
	struct vmm_special_reg special_reg[VM_MAX_SPECIAL_REGS];
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	struct sx	mem_segs_lock;		/* (o) */
	struct sx	vcpus_init_lock;	/* (o) */
};
162 
/*
 * Set by vmm_handler() once MOD_LOAD initialization succeeds; vm_create()
 * refuses to create VMs while this is false.
 */
static bool vmm_initialized = false;

/* Forward declaration; the definition is not part of this section. */
static int vm_handle_wfi(struct vcpu *vcpu,
			 struct vm_exit *vme, bool *retu);

/* malloc(9) type used for every allocation made in this file. */
static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

/* Root of the hw.vmm sysctl tree. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/* Exported read-only as hw.vmm.ipinum. */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");
178 
/*
 * One 64-bit slot per AArch64 ID register handled by vmm_regs_init().
 * The same layout is used both for the masks (vmm_arch_regs_masks) and for
 * the resulting values (vmm_arch_regs).
 */
struct vmm_regs {
	uint64_t	id_aa64afr0;
	uint64_t	id_aa64afr1;
	uint64_t	id_aa64dfr0;
	uint64_t	id_aa64dfr1;
	uint64_t	id_aa64isar0;
	uint64_t	id_aa64isar1;
	uint64_t	id_aa64isar2;
	uint64_t	id_aa64mmfr0;
	uint64_t	id_aa64mmfr1;
	uint64_t	id_aa64mmfr2;
	uint64_t	id_aa64pfr0;
	uint64_t	id_aa64pfr1;
};
193 
/*
 * Per-register masks handed to get_kernel_reg_masked() by vmm_regs_init().
 * Registers without an initializer here (afr0/1, dfr1, isar1/2, mmfr2, pfr1)
 * are implicitly zero.
 */
static const struct vmm_regs vmm_arch_regs_masks = {
	.id_aa64dfr0 =
	    ID_AA64DFR0_CTX_CMPs_MASK |
	    ID_AA64DFR0_WRPs_MASK |
	    ID_AA64DFR0_BRPs_MASK |
	    ID_AA64DFR0_PMUVer_3 |
	    ID_AA64DFR0_DebugVer_8,
	.id_aa64isar0 =
	    ID_AA64ISAR0_TLB_TLBIOSR |
	    ID_AA64ISAR0_SHA3_IMPL |
	    ID_AA64ISAR0_RDM_IMPL |
	    ID_AA64ISAR0_Atomic_IMPL |
	    ID_AA64ISAR0_CRC32_BASE |
	    ID_AA64ISAR0_SHA2_512 |
	    ID_AA64ISAR0_SHA1_BASE |
	    ID_AA64ISAR0_AES_PMULL,
	.id_aa64mmfr0 =
	    ID_AA64MMFR0_TGran4_IMPL |
	    ID_AA64MMFR0_TGran64_IMPL |
	    ID_AA64MMFR0_TGran16_IMPL |
	    ID_AA64MMFR0_ASIDBits_16 |
	    ID_AA64MMFR0_PARange_4P,
	.id_aa64mmfr1 =
	    ID_AA64MMFR1_SpecSEI_IMPL |
	    ID_AA64MMFR1_PAN_ATS1E1 |
	    ID_AA64MMFR1_HAFDBS_AF,
	.id_aa64pfr0 =
	    ID_AA64PFR0_GIC_CPUIF_NONE |
	    ID_AA64PFR0_AdvSIMD_HP |
	    ID_AA64PFR0_FP_HP |
	    ID_AA64PFR0_EL3_64 |
	    ID_AA64PFR0_EL2_64 |
	    ID_AA64PFR0_EL1_64 |
	    ID_AA64PFR0_EL0_64,
};
229 
/*
 * Host registers masked by vmm_arch_regs_masks.  Filled in by
 * vmm_regs_init() and read back through the ID_SPECIAL_REG() entries of
 * vmm_special_regs[].
 */
static struct vmm_regs vmm_arch_regs;

/*
 * Number of vcpu slots given to each new VM (copied to vm->maxcpus by
 * vm_create()).  Chosen in vmm_init(); exported as hw.vmm.maxcpu.
 */
u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");

/* Forward declarations for helpers used before their definition. */
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu);

/*
 * Upper limit on vm_maxcpu. We could increase this to 28 bits, but this
 * is a safe value for now.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
246 
247 static int
248 vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
249 {
250 #define	_FETCH_KERN_REG(reg, field) do {				\
251 	regs->field = vmm_arch_regs_masks.field;			\
252 	if (!get_kernel_reg_masked(reg, &regs->field, masks->field))	\
253 		regs->field = 0;					\
254 } while (0)
255 	_FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
256 	_FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
257 	_FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
258 	_FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
259 	_FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
260 	_FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
261 	_FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
262 	_FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
263 	_FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
264 	_FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
265 	_FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
266 	_FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
267 #undef _FETCH_KERN_REG
268 	return (0);
269 }
270 
271 static void
272 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
273 {
274 	vmmops_vcpu_cleanup(vcpu->cookie);
275 	vcpu->cookie = NULL;
276 	if (destroy) {
277 		vmm_stat_free(vcpu->stats);
278 		fpu_save_area_free(vcpu->guestfpu);
279 		vcpu_lock_destroy(vcpu);
280 	}
281 }
282 
283 static struct vcpu *
284 vcpu_alloc(struct vm *vm, int vcpu_id)
285 {
286 	struct vcpu *vcpu;
287 
288 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
289 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
290 
291 	vcpu = malloc(sizeof(*vcpu), M_VMM, M_WAITOK | M_ZERO);
292 	vcpu_lock_init(vcpu);
293 	vcpu->state = VCPU_IDLE;
294 	vcpu->hostcpu = NOCPU;
295 	vcpu->vcpuid = vcpu_id;
296 	vcpu->vm = vm;
297 	vcpu->guestfpu = fpu_save_area_alloc();
298 	vcpu->stats = vmm_stat_alloc();
299 	return (vcpu);
300 }
301 
302 static void
303 vcpu_init(struct vcpu *vcpu)
304 {
305 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
306 	MPASS(vcpu->cookie != NULL);
307 	fpu_save_area_reset(vcpu->guestfpu);
308 	vmm_stat_init(vcpu->stats);
309 }
310 
311 struct vm_exit *
312 vm_exitinfo(struct vcpu *vcpu)
313 {
314 	return (&vcpu->exitinfo);
315 }
316 
317 static int
318 vmm_init(void)
319 {
320 	int error;
321 
322 	vm_maxcpu = mp_ncpus;
323 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
324 
325 	if (vm_maxcpu > VM_MAXCPU) {
326 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
327 		vm_maxcpu = VM_MAXCPU;
328 	}
329 	if (vm_maxcpu == 0)
330 		vm_maxcpu = 1;
331 
332 	error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
333 	if (error != 0)
334 		return (error);
335 
336 	return (vmmops_modinit(0));
337 }
338 
339 static int
340 vmm_handler(module_t mod, int what, void *arg)
341 {
342 	int error;
343 
344 	switch (what) {
345 	case MOD_LOAD:
346 		/* TODO: if (vmm_is_hw_supported()) { */
347 		vmmdev_init();
348 		error = vmm_init();
349 		if (error == 0)
350 			vmm_initialized = true;
351 		break;
352 	case MOD_UNLOAD:
353 		/* TODO: if (vmm_is_hw_supported()) { */
354 		error = vmmdev_cleanup();
355 		if (error == 0 && vmm_initialized) {
356 			error = vmmops_modcleanup();
357 			if (error)
358 				vmm_initialized = false;
359 		}
360 		break;
361 	default:
362 		error = 0;
363 		break;
364 	}
365 	return (error);
366 }
367 
/* Module descriptor: name, event handler and (unused) handler argument. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - HYP initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
382 
383 static void
384 vm_init(struct vm *vm, bool create)
385 {
386 	int i;
387 
388 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
389 	MPASS(vm->cookie != NULL);
390 
391 	CPU_ZERO(&vm->active_cpus);
392 	CPU_ZERO(&vm->debug_cpus);
393 
394 	vm->suspend = 0;
395 	CPU_ZERO(&vm->suspended_cpus);
396 
397 	memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
398 	memset(vm->special_reg, 0, sizeof(vm->special_reg));
399 
400 	if (!create) {
401 		for (i = 0; i < vm->maxcpus; i++) {
402 			if (vm->vcpu[i] != NULL)
403 				vcpu_init(vm->vcpu[i]);
404 		}
405 	}
406 }
407 
408 struct vcpu *
409 vm_alloc_vcpu(struct vm *vm, int vcpuid)
410 {
411 	struct vcpu *vcpu;
412 
413 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
414 		return (NULL);
415 
416 	/* Some interrupt controllers may have a CPU limit */
417 	if (vcpuid >= vgic_max_cpu_count(vm->cookie))
418 		return (NULL);
419 
420 	vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);
421 	if (__predict_true(vcpu != NULL))
422 		return (vcpu);
423 
424 	sx_xlock(&vm->vcpus_init_lock);
425 	vcpu = vm->vcpu[vcpuid];
426 	if (vcpu == NULL/* && !vm->dying*/) {
427 		vcpu = vcpu_alloc(vm, vcpuid);
428 		vcpu_init(vcpu);
429 
430 		/*
431 		 * Ensure vCPU is fully created before updating pointer
432 		 * to permit unlocked reads above.
433 		 */
434 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
435 		    (uintptr_t)vcpu);
436 	}
437 	sx_xunlock(&vm->vcpus_init_lock);
438 	return (vcpu);
439 }
440 
441 void
442 vm_slock_vcpus(struct vm *vm)
443 {
444 	sx_slock(&vm->vcpus_init_lock);
445 }
446 
447 void
448 vm_unlock_vcpus(struct vm *vm)
449 {
450 	sx_unlock(&vm->vcpus_init_lock);
451 }
452 
453 int
454 vm_create(const char *name, struct vm **retvm)
455 {
456 	struct vm *vm;
457 	struct vmspace *vmspace;
458 
459 	/*
460 	 * If vmm.ko could not be successfully initialized then don't attempt
461 	 * to create the virtual machine.
462 	 */
463 	if (!vmm_initialized)
464 		return (ENXIO);
465 
466 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
467 		return (EINVAL);
468 
469 	vmspace = vmmops_vmspace_alloc(0, 1ul << 39);
470 	if (vmspace == NULL)
471 		return (ENOMEM);
472 
473 	vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
474 	strcpy(vm->name, name);
475 	vm->vmspace = vmspace;
476 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
477 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
478 
479 	vm->sockets = 1;
480 	vm->cores = 1;			/* XXX backwards compatibility */
481 	vm->threads = 1;		/* XXX backwards compatibility */
482 	vm->maxcpus = vm_maxcpu;
483 
484 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
485 	    M_WAITOK | M_ZERO);
486 
487 	vm_init(vm, true);
488 
489 	*retvm = vm;
490 	return (0);
491 }
492 
493 void
494 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
495     uint16_t *threads, uint16_t *maxcpus)
496 {
497 	*sockets = vm->sockets;
498 	*cores = vm->cores;
499 	*threads = vm->threads;
500 	*maxcpus = vm->maxcpus;
501 }
502 
503 uint16_t
504 vm_get_maxcpus(struct vm *vm)
505 {
506 	return (vm->maxcpus);
507 }
508 
509 int
510 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
511     uint16_t threads, uint16_t maxcpus)
512 {
513 	/* Ignore maxcpus. */
514 	if ((sockets * cores * threads) > vm->maxcpus)
515 		return (EINVAL);
516 	vm->sockets = sockets;
517 	vm->cores = cores;
518 	vm->threads = threads;
519 	return(0);
520 }
521 
522 static void
523 vm_cleanup(struct vm *vm, bool destroy)
524 {
525 	struct mem_map *mm;
526 	pmap_t pmap __diagused;
527 	int i;
528 
529 	if (destroy) {
530 		pmap = vmspace_pmap(vm->vmspace);
531 		sched_pin();
532 		PCPU_SET(curvmpmap, NULL);
533 		sched_unpin();
534 		CPU_FOREACH(i) {
535 			MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
536 		}
537 	}
538 
539 	vgic_detach_from_vm(vm->cookie);
540 
541 	for (i = 0; i < vm->maxcpus; i++) {
542 		if (vm->vcpu[i] != NULL)
543 			vcpu_cleanup(vm->vcpu[i], destroy);
544 	}
545 
546 	vmmops_cleanup(vm->cookie);
547 
548 	/*
549 	 * System memory is removed from the guest address space only when
550 	 * the VM is destroyed. This is because the mapping remains the same
551 	 * across VM reset.
552 	 *
553 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
554 	 * so those mappings are removed on a VM reset.
555 	 */
556 	if (!destroy) {
557 		for (i = 0; i < VM_MAX_MEMMAPS; i++) {
558 			mm = &vm->mem_maps[i];
559 			if (destroy || !sysmem_mapping(vm, mm))
560 				vm_free_memmap(vm, i);
561 		}
562 	}
563 
564 	if (destroy) {
565 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
566 			vm_free_memseg(vm, i);
567 
568 		vmmops_vmspace_free(vm->vmspace);
569 		vm->vmspace = NULL;
570 
571 		for (i = 0; i < vm->maxcpus; i++)
572 			free(vm->vcpu[i], M_VMM);
573 		free(vm->vcpu, M_VMM);
574 		sx_destroy(&vm->vcpus_init_lock);
575 		sx_destroy(&vm->mem_segs_lock);
576 	}
577 }
578 
579 void
580 vm_destroy(struct vm *vm)
581 {
582 	vm_cleanup(vm, true);
583 	free(vm, M_VMM);
584 }
585 
586 int
587 vm_reinit(struct vm *vm)
588 {
589 	int error;
590 
591 	/*
592 	 * A virtual machine can be reset only if all vcpus are suspended.
593 	 */
594 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
595 		vm_cleanup(vm, false);
596 		vm_init(vm, false);
597 		error = 0;
598 	} else {
599 		error = EBUSY;
600 	}
601 
602 	return (error);
603 }
604 
605 const char *
606 vm_name(struct vm *vm)
607 {
608 	return (vm->name);
609 }
610 
611 void
612 vm_slock_memsegs(struct vm *vm)
613 {
614 	sx_slock(&vm->mem_segs_lock);
615 }
616 
617 void
618 vm_xlock_memsegs(struct vm *vm)
619 {
620 	sx_xlock(&vm->mem_segs_lock);
621 }
622 
623 void
624 vm_unlock_memsegs(struct vm *vm)
625 {
626 	sx_unlock(&vm->mem_segs_lock);
627 }
628 
629 /*
630  * Return 'true' if 'gpa' is allocated in the guest address space.
631  *
632  * This function is called in the context of a running vcpu which acts as
633  * an implicit lock on 'vm->mem_maps[]'.
634  */
635 bool
636 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
637 {
638 	struct vm *vm = vcpu->vm;
639 	struct mem_map *mm;
640 	int i;
641 
642 #ifdef INVARIANTS
643 	int hostcpu, state;
644 	state = vcpu_get_state(vcpu, &hostcpu);
645 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
646 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
647 #endif
648 
649 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
650 		mm = &vm->mem_maps[i];
651 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
652 			return (true);		/* 'gpa' is sysmem or devmem */
653 	}
654 
655 	return (false);
656 }
657 
658 int
659 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
660 {
661 	struct mem_seg *seg;
662 	vm_object_t obj;
663 
664 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
665 
666 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
667 		return (EINVAL);
668 
669 	if (len == 0 || (len & PAGE_MASK))
670 		return (EINVAL);
671 
672 	seg = &vm->mem_segs[ident];
673 	if (seg->object != NULL) {
674 		if (seg->len == len && seg->sysmem == sysmem)
675 			return (EEXIST);
676 		else
677 			return (EINVAL);
678 	}
679 
680 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
681 	if (obj == NULL)
682 		return (ENOMEM);
683 
684 	seg->len = len;
685 	seg->object = obj;
686 	seg->sysmem = sysmem;
687 	return (0);
688 }
689 
690 int
691 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
692     vm_object_t *objptr)
693 {
694 	struct mem_seg *seg;
695 
696 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
697 
698 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
699 		return (EINVAL);
700 
701 	seg = &vm->mem_segs[ident];
702 	if (len)
703 		*len = seg->len;
704 	if (sysmem)
705 		*sysmem = seg->sysmem;
706 	if (objptr)
707 		*objptr = seg->object;
708 	return (0);
709 }
710 
711 void
712 vm_free_memseg(struct vm *vm, int ident)
713 {
714 	struct mem_seg *seg;
715 
716 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
717 	    ("%s: invalid memseg ident %d", __func__, ident));
718 
719 	seg = &vm->mem_segs[ident];
720 	if (seg->object != NULL) {
721 		vm_object_deallocate(seg->object);
722 		bzero(seg, sizeof(struct mem_seg));
723 	}
724 }
725 
726 int
727 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
728     size_t len, int prot, int flags)
729 {
730 	struct mem_seg *seg;
731 	struct mem_map *m, *map;
732 	vm_ooffset_t last;
733 	int i, error;
734 
735 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
736 		return (EINVAL);
737 
738 	if (flags & ~VM_MEMMAP_F_WIRED)
739 		return (EINVAL);
740 
741 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
742 		return (EINVAL);
743 
744 	seg = &vm->mem_segs[segid];
745 	if (seg->object == NULL)
746 		return (EINVAL);
747 
748 	last = first + len;
749 	if (first < 0 || first >= last || last > seg->len)
750 		return (EINVAL);
751 
752 	if ((gpa | first | last) & PAGE_MASK)
753 		return (EINVAL);
754 
755 	map = NULL;
756 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
757 		m = &vm->mem_maps[i];
758 		if (m->len == 0) {
759 			map = m;
760 			break;
761 		}
762 	}
763 
764 	if (map == NULL)
765 		return (ENOSPC);
766 
767 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
768 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
769 	if (error != KERN_SUCCESS)
770 		return (EFAULT);
771 
772 	vm_object_reference(seg->object);
773 
774 	if (flags & VM_MEMMAP_F_WIRED) {
775 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
776 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
777 		if (error != KERN_SUCCESS) {
778 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
779 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
780 			    EFAULT);
781 		}
782 	}
783 
784 	map->gpa = gpa;
785 	map->len = len;
786 	map->segoff = first;
787 	map->segid = segid;
788 	map->prot = prot;
789 	map->flags = flags;
790 	return (0);
791 }
792 
793 int
794 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
795 {
796 	struct mem_map *m;
797 	int i;
798 
799 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
800 		m = &vm->mem_maps[i];
801 		if (m->gpa == gpa && m->len == len) {
802 			vm_free_memmap(vm, i);
803 			return (0);
804 		}
805 	}
806 
807 	return (EINVAL);
808 }
809 
810 int
811 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
812     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
813 {
814 	struct mem_map *mm, *mmnext;
815 	int i;
816 
817 	mmnext = NULL;
818 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
819 		mm = &vm->mem_maps[i];
820 		if (mm->len == 0 || mm->gpa < *gpa)
821 			continue;
822 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
823 			mmnext = mm;
824 	}
825 
826 	if (mmnext != NULL) {
827 		*gpa = mmnext->gpa;
828 		if (segid)
829 			*segid = mmnext->segid;
830 		if (segoff)
831 			*segoff = mmnext->segoff;
832 		if (len)
833 			*len = mmnext->len;
834 		if (prot)
835 			*prot = mmnext->prot;
836 		if (flags)
837 			*flags = mmnext->flags;
838 		return (0);
839 	} else {
840 		return (ENOENT);
841 	}
842 }
843 
844 static void
845 vm_free_memmap(struct vm *vm, int ident)
846 {
847 	struct mem_map *mm;
848 	int error __diagused;
849 
850 	mm = &vm->mem_maps[ident];
851 	if (mm->len) {
852 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
853 		    mm->gpa + mm->len);
854 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
855 		    __func__, error));
856 		bzero(mm, sizeof(struct mem_map));
857 	}
858 }
859 
860 static __inline bool
861 sysmem_mapping(struct vm *vm, struct mem_map *mm)
862 {
863 
864 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
865 		return (true);
866 	else
867 		return (false);
868 }
869 
870 vm_paddr_t
871 vmm_sysmem_maxaddr(struct vm *vm)
872 {
873 	struct mem_map *mm;
874 	vm_paddr_t maxaddr;
875 	int i;
876 
877 	maxaddr = 0;
878 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
879 		mm = &vm->mem_maps[i];
880 		if (sysmem_mapping(vm, mm)) {
881 			if (maxaddr < mm->gpa + mm->len)
882 				maxaddr = mm->gpa + mm->len;
883 		}
884 	}
885 	return (maxaddr);
886 }
887 
888 int
889 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
890     uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
891 {
892 
893 	vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault);
894 	return (0);
895 }
896 
/*
 * Register read handler implementing "read as zero": stores 0 in '*rval'
 * and reports success.  'vcpu' and 'arg' are not used.
 */
static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	(void)vcpu;
	(void)arg;

	*rval = UINT64_C(0);
	return (0);
}
903 
/*
 * Register read handler that returns the 64-bit value 'arg' points to.
 * 'arg' must therefore reference a valid uint64_t.  Always returns 0.
 */
static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
	const uint64_t *src = arg;

	(void)vcpu;
	*rval = *src;
	return (0);
}
910 
/*
 * Register write handler implementing "write ignored": the value is
 * discarded and success is reported.
 */
static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
	(void)vcpu;
	(void)wval;
	(void)arg;

	return (0);
}
916 
/*
 * Built-in system register emulation table.  vm_handle_reg_emul() consults
 * it, in order, after the per-VM handlers in vm->special_reg[]; the first
 * entry whose (syndrome & esr_mask) equals esr_iss wins, so the specific ID
 * register entries must stay ahead of the catch-all RAZ entry.
 */
static const struct vmm_special_reg vmm_special_regs[] = {
/* Entry matching exactly one register, with explicit read/write handlers. */
#define	SPECIAL_REG(_reg, _read, _write)				\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = (_read),					\
		.reg_write = (_write),					\
		.arg = NULL,						\
	}
/*
 * Entry for an ID register: reads return the matching vmm_arch_regs field,
 * writes are ignored.
 * NOTE(review): unlike SPECIAL_REG this macro is not #undef'd after the
 * table -- confirm whether that is intentional.
 */
#define	ID_SPECIAL_REG(_reg, _name)					\
	{								\
		.esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |	\
		    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |		\
		    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |		\
		    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |		\
		    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),		\
		.esr_mask = ISS_MSR_REG_MASK,				\
		.reg_read = vmm_reg_read_arg,				\
		.reg_write = vmm_reg_wi,				\
		.arg = &(vmm_arch_regs._name),				\
	}

	/* ID registers */
	ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
	ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
	ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
	ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
	ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),

	/*
	 * All other ID registers are read as zero.
	 * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
	 * Only bit 3 of CRm takes part in the comparison and op2 is not
	 * compared at all, which is what lets one entry cover the range.
	 */
	{
		.esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
		    (0 << ISS_MSR_OP1_SHIFT) |
		    (0 << ISS_MSR_CRn_SHIFT) |
		    (0 << ISS_MSR_CRm_SHIFT),
		.esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
		    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
		.reg_read = vmm_reg_raz,
		.reg_write = vmm_reg_wi,
		.arg = NULL,
	},

	/* Counter physical registers */
	SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
	SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
	    vtimer_phys_cval_write),
	SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
	    vtimer_phys_tval_write),
	SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),
#undef SPECIAL_REG
};
975 
976 void
977 vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
978     reg_read_t reg_read, reg_write_t reg_write, void *arg)
979 {
980 	int i;
981 
982 	for (i = 0; i < nitems(vm->special_reg); i++) {
983 		if (vm->special_reg[i].esr_iss == 0 &&
984 		    vm->special_reg[i].esr_mask == 0) {
985 			vm->special_reg[i].esr_iss = iss;
986 			vm->special_reg[i].esr_mask = mask;
987 			vm->special_reg[i].reg_read = reg_read;
988 			vm->special_reg[i].reg_write = reg_write;
989 			vm->special_reg[i].arg = arg;
990 			return;
991 		}
992 	}
993 
994 	panic("%s: No free special register slot", __func__);
995 }
996 
997 void
998 vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
999 {
1000 	int i;
1001 
1002 	for (i = 0; i < nitems(vm->special_reg); i++) {
1003 		if (vm->special_reg[i].esr_iss == iss &&
1004 		    vm->special_reg[i].esr_mask == mask) {
1005 			memset(&vm->special_reg[i], 0,
1006 			    sizeof(vm->special_reg[i]));
1007 			return;
1008 		}
1009 	}
1010 
1011 	panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
1012 	    mask);
1013 }
1014 
1015 static int
1016 vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
1017 {
1018 	struct vm *vm;
1019 	struct vm_exit *vme;
1020 	struct vre *vre;
1021 	int i, rv;
1022 
1023 	vm = vcpu->vm;
1024 	vme = &vcpu->exitinfo;
1025 	vre = &vme->u.reg_emul.vre;
1026 
1027 	for (i = 0; i < nitems(vm->special_reg); i++) {
1028 		if (vm->special_reg[i].esr_iss == 0 &&
1029 		    vm->special_reg[i].esr_mask == 0)
1030 			continue;
1031 
1032 		if ((vre->inst_syndrome & vm->special_reg[i].esr_mask) ==
1033 		    vm->special_reg[i].esr_iss) {
1034 			rv = vmm_emulate_register(vcpu, vre,
1035 			    vm->special_reg[i].reg_read,
1036 			    vm->special_reg[i].reg_write,
1037 			    vm->special_reg[i].arg);
1038 			if (rv == 0) {
1039 				*retu = false;
1040 			}
1041 			return (rv);
1042 		}
1043 	}
1044 	for (i = 0; i < nitems(vmm_special_regs); i++) {
1045 		if ((vre->inst_syndrome & vmm_special_regs[i].esr_mask) ==
1046 		    vmm_special_regs[i].esr_iss) {
1047 			rv = vmm_emulate_register(vcpu, vre,
1048 			    vmm_special_regs[i].reg_read,
1049 			    vmm_special_regs[i].reg_write,
1050 			    vmm_special_regs[i].arg);
1051 			if (rv == 0) {
1052 				*retu = false;
1053 			}
1054 			return (rv);
1055 		}
1056 	}
1057 
1058 
1059 	*retu = true;
1060 	return (0);
1061 }
1062 
1063 void
1064 vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
1065     mem_region_read_t mmio_read, mem_region_write_t mmio_write)
1066 {
1067 	int i;
1068 
1069 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1070 		if (vm->mmio_region[i].start == 0 &&
1071 		    vm->mmio_region[i].end == 0) {
1072 			vm->mmio_region[i].start = start;
1073 			vm->mmio_region[i].end = start + size;
1074 			vm->mmio_region[i].read = mmio_read;
1075 			vm->mmio_region[i].write = mmio_write;
1076 			return;
1077 		}
1078 	}
1079 
1080 	panic("%s: No free MMIO region", __func__);
1081 }
1082 
1083 void
1084 vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
1085 {
1086 	int i;
1087 
1088 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1089 		if (vm->mmio_region[i].start == start &&
1090 		    vm->mmio_region[i].end == start + size) {
1091 			memset(&vm->mmio_region[i], 0,
1092 			    sizeof(vm->mmio_region[i]));
1093 			return;
1094 		}
1095 	}
1096 
1097 	panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
1098 	    start + size);
1099 }
1100 
1101 static int
1102 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
1103 {
1104 	struct vm *vm;
1105 	struct vm_exit *vme;
1106 	struct vie *vie;
1107 	struct hyp *hyp;
1108 	uint64_t fault_ipa;
1109 	struct vm_guest_paging *paging;
1110 	struct vmm_mmio_region *vmr;
1111 	int error, i;
1112 
1113 	vm = vcpu->vm;
1114 	hyp = vm->cookie;
1115 	if (!hyp->vgic_attached)
1116 		goto out_user;
1117 
1118 	vme = &vcpu->exitinfo;
1119 	vie = &vme->u.inst_emul.vie;
1120 	paging = &vme->u.inst_emul.paging;
1121 
1122 	fault_ipa = vme->u.inst_emul.gpa;
1123 
1124 	vmr = NULL;
1125 	for (i = 0; i < nitems(vm->mmio_region); i++) {
1126 		if (vm->mmio_region[i].start <= fault_ipa &&
1127 		    vm->mmio_region[i].end > fault_ipa) {
1128 			vmr = &vm->mmio_region[i];
1129 			break;
1130 		}
1131 	}
1132 	if (vmr == NULL)
1133 		goto out_user;
1134 
1135 	error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
1136 	    vmr->read, vmr->write, retu);
1137 	return (error);
1138 
1139 out_user:
1140 	*retu = true;
1141 	return (0);
1142 }
1143 
1144 int
1145 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1146 {
1147 	int i;
1148 
1149 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1150 		return (EINVAL);
1151 
1152 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1153 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1154 		    vm->suspend, how);
1155 		return (EALREADY);
1156 	}
1157 
1158 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1159 
1160 	/*
1161 	 * Notify all active vcpus that they are now suspended.
1162 	 */
1163 	for (i = 0; i < vm->maxcpus; i++) {
1164 		if (CPU_ISSET(i, &vm->active_cpus))
1165 			vcpu_notify_event(vm_vcpu(vm, i));
1166 	}
1167 
1168 	return (0);
1169 }
1170 
1171 void
1172 vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
1173 {
1174 	struct vm *vm = vcpu->vm;
1175 	struct vm_exit *vmexit;
1176 
1177 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1178 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1179 
1180 	vmexit = vm_exitinfo(vcpu);
1181 	vmexit->pc = pc;
1182 	vmexit->inst_length = 4;
1183 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1184 	vmexit->u.suspended.how = vm->suspend;
1185 }
1186 
1187 void
1188 vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
1189 {
1190 	struct vm_exit *vmexit;
1191 
1192 	vmexit = vm_exitinfo(vcpu);
1193 	vmexit->pc = pc;
1194 	vmexit->inst_length = 4;
1195 	vmexit->exitcode = VM_EXITCODE_DEBUG;
1196 }
1197 
1198 int
1199 vm_activate_cpu(struct vcpu *vcpu)
1200 {
1201 	struct vm *vm = vcpu->vm;
1202 
1203 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1204 		return (EBUSY);
1205 
1206 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
1207 	return (0);
1208 
1209 }
1210 
1211 int
1212 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
1213 {
1214 	if (vcpu == NULL) {
1215 		vm->debug_cpus = vm->active_cpus;
1216 		for (int i = 0; i < vm->maxcpus; i++) {
1217 			if (CPU_ISSET(i, &vm->active_cpus))
1218 				vcpu_notify_event(vm_vcpu(vm, i));
1219 		}
1220 	} else {
1221 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
1222 			return (EINVAL);
1223 
1224 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1225 		vcpu_notify_event(vcpu);
1226 	}
1227 	return (0);
1228 }
1229 
1230 int
1231 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
1232 {
1233 
1234 	if (vcpu == NULL) {
1235 		CPU_ZERO(&vm->debug_cpus);
1236 	} else {
1237 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
1238 			return (EINVAL);
1239 
1240 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
1241 	}
1242 	return (0);
1243 }
1244 
1245 int
1246 vcpu_debugged(struct vcpu *vcpu)
1247 {
1248 
1249 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
1250 }
1251 
1252 cpuset_t
1253 vm_active_cpus(struct vm *vm)
1254 {
1255 
1256 	return (vm->active_cpus);
1257 }
1258 
1259 cpuset_t
1260 vm_debug_cpus(struct vm *vm)
1261 {
1262 
1263 	return (vm->debug_cpus);
1264 }
1265 
1266 cpuset_t
1267 vm_suspended_cpus(struct vm *vm)
1268 {
1269 
1270 	return (vm->suspended_cpus);
1271 }
1272 
1273 
1274 void *
1275 vcpu_stats(struct vcpu *vcpu)
1276 {
1277 
1278 	return (vcpu->stats);
1279 }
1280 
1281 /*
1282  * This function is called to ensure that a vcpu "sees" a pending event
1283  * as soon as possible:
1284  * - If the vcpu thread is sleeping then it is woken up.
1285  * - If the vcpu is running on a different host_cpu then an IPI will be directed
1286  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1287  */
1288 static void
1289 vcpu_notify_event_locked(struct vcpu *vcpu)
1290 {
1291 	int hostcpu;
1292 
1293 	hostcpu = vcpu->hostcpu;
1294 	if (vcpu->state == VCPU_RUNNING) {
1295 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1296 		if (hostcpu != curcpu) {
1297 			ipi_cpu(hostcpu, vmm_ipinum);
1298 		} else {
1299 			/*
1300 			 * If the 'vcpu' is running on 'curcpu' then it must
1301 			 * be sending a notification to itself (e.g. SELF_IPI).
1302 			 * The pending event will be picked up when the vcpu
1303 			 * transitions back to guest context.
1304 			 */
1305 		}
1306 	} else {
1307 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1308 		    "with hostcpu %d", vcpu->state, hostcpu));
1309 		if (vcpu->state == VCPU_SLEEPING)
1310 			wakeup_one(vcpu);
1311 	}
1312 }
1313 
/*
 * Notify 'vcpu' of a pending event.  Takes the vcpu lock around the call
 * to vcpu_notify_event_locked().
 */
void
vcpu_notify_event(struct vcpu *vcpu)
{

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu);
	vcpu_unlock(vcpu);
}
1321 
1322 static void
1323 restore_guest_fpustate(struct vcpu *vcpu)
1324 {
1325 
1326 	/* flush host state to the pcb */
1327 	vfp_save_state(curthread, curthread->td_pcb);
1328 	/* Ensure the VFP state will be re-loaded when exiting the guest */
1329 	PCPU_SET(fpcurthread, NULL);
1330 
1331 	/* restore guest FPU state */
1332 	vfp_enable();
1333 	vfp_restore(vcpu->guestfpu);
1334 
1335 	/*
1336 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1337 	 * to trap any access to the FPU by the host.
1338 	 */
1339 	vfp_disable();
1340 }
1341 
1342 static void
1343 save_guest_fpustate(struct vcpu *vcpu)
1344 {
1345 	if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
1346 	    CPACR_FPEN_TRAP_ALL1)
1347 		panic("VFP not enabled in host!");
1348 
1349 	/* save guest FPU state */
1350 	vfp_enable();
1351 	vfp_store(vcpu->guestfpu);
1352 	vfp_disable();
1353 
1354 	KASSERT(PCPU_GET(fpcurthread) == NULL,
1355 	    ("%s: fpcurthread set with guest registers", __func__));
1356 }
1357 static int
1358 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1359     bool from_idle)
1360 {
1361 	int error;
1362 
1363 	vcpu_assert_locked(vcpu);
1364 
1365 	/*
1366 	 * State transitions from the vmmdev_ioctl() must always begin from
1367 	 * the VCPU_IDLE state. This guarantees that there is only a single
1368 	 * ioctl() operating on a vcpu at any point.
1369 	 */
1370 	if (from_idle) {
1371 		while (vcpu->state != VCPU_IDLE) {
1372 			vcpu_notify_event_locked(vcpu);
1373 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1374 		}
1375 	} else {
1376 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1377 		    "vcpu idle state"));
1378 	}
1379 
1380 	if (vcpu->state == VCPU_RUNNING) {
1381 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1382 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1383 	} else {
1384 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1385 		    "vcpu that is not running", vcpu->hostcpu));
1386 	}
1387 
1388 	/*
1389 	 * The following state transitions are allowed:
1390 	 * IDLE -> FROZEN -> IDLE
1391 	 * FROZEN -> RUNNING -> FROZEN
1392 	 * FROZEN -> SLEEPING -> FROZEN
1393 	 */
1394 	switch (vcpu->state) {
1395 	case VCPU_IDLE:
1396 	case VCPU_RUNNING:
1397 	case VCPU_SLEEPING:
1398 		error = (newstate != VCPU_FROZEN);
1399 		break;
1400 	case VCPU_FROZEN:
1401 		error = (newstate == VCPU_FROZEN);
1402 		break;
1403 	default:
1404 		error = 1;
1405 		break;
1406 	}
1407 
1408 	if (error)
1409 		return (EBUSY);
1410 
1411 	vcpu->state = newstate;
1412 	if (newstate == VCPU_RUNNING)
1413 		vcpu->hostcpu = curcpu;
1414 	else
1415 		vcpu->hostcpu = NOCPU;
1416 
1417 	if (newstate == VCPU_IDLE)
1418 		wakeup(&vcpu->state);
1419 
1420 	return (0);
1421 }
1422 
1423 static void
1424 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1425 {
1426 	int error;
1427 
1428 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1429 		panic("Error %d setting state to %d\n", error, newstate);
1430 }
1431 
1432 static void
1433 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1434 {
1435 	int error;
1436 
1437 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1438 		panic("Error %d setting state to %d", error, newstate);
1439 }
1440 
1441 int
1442 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
1443 {
1444 	if (type < 0 || type >= VM_CAP_MAX)
1445 		return (EINVAL);
1446 
1447 	return (vmmops_getcap(vcpu->cookie, type, retval));
1448 }
1449 
1450 int
1451 vm_set_capability(struct vcpu *vcpu, int type, int val)
1452 {
1453 	if (type < 0 || type >= VM_CAP_MAX)
1454 		return (EINVAL);
1455 
1456 	return (vmmops_setcap(vcpu->cookie, type, val));
1457 }
1458 
1459 struct vm *
1460 vcpu_vm(struct vcpu *vcpu)
1461 {
1462 	return (vcpu->vm);
1463 }
1464 
1465 int
1466 vcpu_vcpuid(struct vcpu *vcpu)
1467 {
1468 	return (vcpu->vcpuid);
1469 }
1470 
1471 void *
1472 vcpu_get_cookie(struct vcpu *vcpu)
1473 {
1474 	return (vcpu->cookie);
1475 }
1476 
1477 struct vcpu *
1478 vm_vcpu(struct vm *vm, int vcpuid)
1479 {
1480 	return (vm->vcpu[vcpuid]);
1481 }
1482 
1483 int
1484 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
1485 {
1486 	int error;
1487 
1488 	vcpu_lock(vcpu);
1489 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1490 	vcpu_unlock(vcpu);
1491 
1492 	return (error);
1493 }
1494 
1495 enum vcpu_state
1496 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
1497 {
1498 	enum vcpu_state state;
1499 
1500 	vcpu_lock(vcpu);
1501 	state = vcpu->state;
1502 	if (hostcpu != NULL)
1503 		*hostcpu = vcpu->hostcpu;
1504 	vcpu_unlock(vcpu);
1505 
1506 	return (state);
1507 }
1508 
1509 static void *
1510 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1511     void **cookie)
1512 {
1513 	int i, count, pageoff;
1514 	struct mem_map *mm;
1515 	vm_page_t m;
1516 
1517 	pageoff = gpa & PAGE_MASK;
1518 	if (len > PAGE_SIZE - pageoff)
1519 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1520 
1521 	count = 0;
1522 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1523 		mm = &vm->mem_maps[i];
1524 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
1525 		    gpa < mm->gpa + mm->len) {
1526 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1527 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1528 			break;
1529 		}
1530 	}
1531 
1532 	if (count == 1) {
1533 		*cookie = m;
1534 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1535 	} else {
1536 		*cookie = NULL;
1537 		return (NULL);
1538 	}
1539 }
1540 
1541 void *
1542 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1543 	    void **cookie)
1544 {
1545 #ifdef INVARIANTS
1546 	/*
1547 	 * The current vcpu should be frozen to ensure 'vm_memmap[]'
1548 	 * stability.
1549 	 */
1550 	int state = vcpu_get_state(vcpu, NULL);
1551 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1552 	    __func__, state));
1553 #endif
1554 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1555 }
1556 
1557 void *
1558 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1559     void **cookie)
1560 {
1561 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1562 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1563 }
1564 
1565 void
1566 vm_gpa_release(void *cookie)
1567 {
1568 	vm_page_t m = cookie;
1569 
1570 	vm_page_unwire(m, PQ_ACTIVE);
1571 }
1572 
1573 int
1574 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1575 {
1576 
1577 	if (reg >= VM_REG_LAST)
1578 		return (EINVAL);
1579 
1580 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1581 }
1582 
1583 int
1584 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1585 {
1586 	int error;
1587 
1588 	if (reg >= VM_REG_LAST)
1589 		return (EINVAL);
1590 	error = vmmops_setreg(vcpu->cookie, reg, val);
1591 	if (error || reg != VM_REG_GUEST_PC)
1592 		return (error);
1593 
1594 	vcpu->nextpc = val;
1595 
1596 	return (0);
1597 }
1598 
1599 void *
1600 vm_get_cookie(struct vm *vm)
1601 {
1602 	return (vm->cookie);
1603 }
1604 
1605 int
1606 vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
1607 {
1608 	return (vmmops_exception(vcpu->cookie, esr, far));
1609 }
1610 
1611 int
1612 vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
1613 {
1614 	return (vgic_attach_to_vm(vm->cookie, descr));
1615 }
1616 
1617 int
1618 vm_assert_irq(struct vm *vm, uint32_t irq)
1619 {
1620 	return (vgic_inject_irq(vm->cookie, -1, irq, true));
1621 }
1622 
1623 int
1624 vm_deassert_irq(struct vm *vm, uint32_t irq)
1625 {
1626 	return (vgic_inject_irq(vm->cookie, -1, irq, false));
1627 }
1628 
1629 int
1630 vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
1631     int func)
1632 {
1633 	/* TODO: Should we raise an SError? */
1634 	return (vgic_inject_msi(vm->cookie, msg, addr));
1635 }
1636 
1637 static int
1638 vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1639 {
1640 	struct hypctx *hypctx;
1641 	int i;
1642 
1643 	hypctx = vcpu_get_cookie(vcpu);
1644 
1645 	if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
1646 		return (1);
1647 
1648 	vme->exitcode = VM_EXITCODE_SMCCC;
1649 	vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
1650 	for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
1651 		vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];
1652 
1653 	*retu = true;
1654 	return (0);
1655 }
1656 
1657 static int
1658 vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
1659 {
1660 	vcpu_lock(vcpu);
1661 	while (1) {
1662 		if (vgic_has_pending_irq(vcpu->cookie))
1663 			break;
1664 
1665 		if (vcpu_should_yield(vcpu))
1666 			break;
1667 
1668 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1669 		/*
1670 		 * XXX msleep_spin() cannot be interrupted by signals so
1671 		 * wake up periodically to check pending signals.
1672 		 */
1673 		msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
1674 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1675 	}
1676 	vcpu_unlock(vcpu);
1677 
1678 	*retu = false;
1679 	return (0);
1680 }
1681 
1682 static int
1683 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1684 {
1685 	struct vm *vm = vcpu->vm;
1686 	struct vm_exit *vme;
1687 	struct vm_map *map;
1688 	uint64_t addr, esr;
1689 	pmap_t pmap;
1690 	int ftype, rv;
1691 
1692 	vme = &vcpu->exitinfo;
1693 
1694 	pmap = vmspace_pmap(vcpu->vm->vmspace);
1695 	addr = vme->u.paging.gpa;
1696 	esr = vme->u.paging.esr;
1697 
1698 	/* The page exists, but the page table needs to be updated. */
1699 	if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
1700 		return (0);
1701 
1702 	switch (ESR_ELx_EXCEPTION(esr)) {
1703 	case EXCP_INSN_ABORT_L:
1704 	case EXCP_DATA_ABORT_L:
1705 		ftype = VM_PROT_EXECUTE | VM_PROT_READ | VM_PROT_WRITE;
1706 		break;
1707 	default:
1708 		panic("%s: Invalid exception (esr = %lx)", __func__, esr);
1709 	}
1710 
1711 	map = &vm->vmspace->vm_map;
1712 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
1713 	if (rv != KERN_SUCCESS)
1714 		return (EFAULT);
1715 
1716 	return (0);
1717 }
1718 
1719 int
1720 vm_run(struct vcpu *vcpu)
1721 {
1722 	struct vm *vm = vcpu->vm;
1723 	struct vm_eventinfo evinfo;
1724 	int error, vcpuid;
1725 	struct vm_exit *vme;
1726 	bool retu;
1727 	pmap_t pmap;
1728 
1729 	vcpuid = vcpu->vcpuid;
1730 
1731 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1732 		return (EINVAL);
1733 
1734 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1735 		return (EINVAL);
1736 
1737 	pmap = vmspace_pmap(vm->vmspace);
1738 	vme = &vcpu->exitinfo;
1739 	evinfo.rptr = NULL;
1740 	evinfo.sptr = &vm->suspend;
1741 	evinfo.iptr = NULL;
1742 restart:
1743 	critical_enter();
1744 
1745 	restore_guest_fpustate(vcpu);
1746 
1747 	vcpu_require_state(vcpu, VCPU_RUNNING);
1748 	error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
1749 	vcpu_require_state(vcpu, VCPU_FROZEN);
1750 
1751 	save_guest_fpustate(vcpu);
1752 
1753 	critical_exit();
1754 
1755 	if (error == 0) {
1756 		retu = false;
1757 		switch (vme->exitcode) {
1758 		case VM_EXITCODE_INST_EMUL:
1759 			vcpu->nextpc = vme->pc + vme->inst_length;
1760 			error = vm_handle_inst_emul(vcpu, &retu);
1761 			break;
1762 
1763 		case VM_EXITCODE_REG_EMUL:
1764 			vcpu->nextpc = vme->pc + vme->inst_length;
1765 			error = vm_handle_reg_emul(vcpu, &retu);
1766 			break;
1767 
1768 		case VM_EXITCODE_HVC:
1769 			/*
1770 			 * The HVC instruction saves the address for the
1771 			 * next instruction as the return address.
1772 			 */
1773 			vcpu->nextpc = vme->pc;
1774 			/*
1775 			 * The PSCI call can change the exit information in the
1776 			 * case of suspend/reset/poweroff/cpu off/cpu on.
1777 			 */
1778 			error = vm_handle_smccc_call(vcpu, vme, &retu);
1779 			break;
1780 
1781 		case VM_EXITCODE_WFI:
1782 			vcpu->nextpc = vme->pc + vme->inst_length;
1783 			error = vm_handle_wfi(vcpu, vme, &retu);
1784 			break;
1785 
1786 		case VM_EXITCODE_PAGING:
1787 			vcpu->nextpc = vme->pc;
1788 			error = vm_handle_paging(vcpu, &retu);
1789 			break;
1790 
1791 		default:
1792 			/* Handle in userland */
1793 			vcpu->nextpc = vme->pc;
1794 			retu = true;
1795 			break;
1796 		}
1797 	}
1798 
1799 	if (error == 0 && retu == false)
1800 		goto restart;
1801 
1802 	return (error);
1803 }
1804