xref: /freebsd/sys/amd64/vmm/vmm.c (revision a466cc55373fc3cf86837f09da729535b57e69a1)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_bhyve_snapshot.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/sysctl.h>
41 #include <sys/malloc.h>
42 #include <sys/pcpu.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/proc.h>
46 #include <sys/rwlock.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sx.h>
50 #include <sys/vnode.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_param.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_object.h>
56 #include <vm/vm_page.h>
57 #include <vm/pmap.h>
58 #include <vm/vm_map.h>
59 #include <vm/vm_pager.h>
60 #include <vm/vm_kern.h>
61 #include <vm/vnode_pager.h>
62 #include <vm/swap_pager.h>
63 #include <vm/uma.h>
64 
65 #include <machine/cpu.h>
66 #include <machine/pcb.h>
67 #include <machine/smp.h>
68 #include <machine/md_var.h>
69 #include <x86/psl.h>
70 #include <x86/apicreg.h>
71 #include <x86/ifunc.h>
72 
73 #include <machine/vmm.h>
74 #include <machine/vmm_dev.h>
75 #include <machine/vmm_instruction_emul.h>
76 #include <machine/vmm_snapshot.h>
77 
78 #include "vmm_ioport.h"
79 #include "vmm_ktr.h"
80 #include "vmm_host.h"
81 #include "vmm_mem.h"
82 #include "vmm_util.h"
83 #include "vatpic.h"
84 #include "vatpit.h"
85 #include "vhpet.h"
86 #include "vioapic.h"
87 #include "vlapic.h"
88 #include "vpmtmr.h"
89 #include "vrtc.h"
90 #include "vmm_stat.h"
91 #include "vmm_lapic.h"
92 
93 #include "io/ppt.h"
94 #include "io/iommu.h"
95 
96 struct vlapic;
97 
98 /*
99  * Initialization:
100  * (a) allocated when vcpu is created
101  * (i) initialized when vcpu is created and when it is reinitialized
102  * (o) initialized the first time the vcpu is created
103  * (x) initialized before use
104  */
105 struct vcpu {
106 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
107 	enum vcpu_state	state;		/* (o) vcpu state */
108 	int		vcpuid;		/* (o) */
109 	int		hostcpu;	/* (o) vcpu's host cpu */
110 	int		reqidle;	/* (i) request vcpu to idle */
111 	struct vm	*vm;		/* (o) */
112 	void		*cookie;	/* (i) cpu-specific data */
113 	struct vlapic	*vlapic;	/* (i) APIC device model */
114 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
115 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
116 	int		nmi_pending;	/* (i) NMI pending */
117 	int		extint_pending;	/* (i) INTR pending */
118 	int	exception_pending;	/* (i) exception pending */
119 	int	exc_vector;		/* (x) exception collateral */
120 	int	exc_errcode_valid;
121 	uint32_t exc_errcode;
122 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
123 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
124 	void		*stats;		/* (a,i) statistics */
125 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
126 	cpuset_t	exitinfo_cpuset; /* (x) storage for vmexit handlers */
127 	uint64_t	nextrip;	/* (x) next instruction to execute */
128 	uint64_t	tsc_offset;	/* (o) TSC offsetting */
129 };
130 
131 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
132 #define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
133 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
134 #define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
135 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
136 
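/*
 * A mem_seg is a host memory object backing a chunk of guest memory;
 * a mem_map below maps a range of such a segment into the guest
 * physical address space at 'gpa' with the given protection and flags.
 */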
137 struct mem_seg {
138 	size_t	len;
139 	bool	sysmem;
140 	struct vm_object *object;
141 };
142 #define	VM_MAX_MEMSEGS	4
143 
144 struct mem_map {
145 	vm_paddr_t	gpa;
146 	size_t		len;
147 	vm_ooffset_t	segoff;
148 	int		segid;
149 	int		prot;
150 	int		flags;
151 };
152 #define	VM_MAX_MEMMAPS	8
153 
154 /*
155  * Initialization:
156  * (o) initialized the first time the VM is created
157  * (i) initialized when VM is created and when it is reinitialized
158  * (x) initialized before use
159  *
160  * Locking:
161  * [m] mem_segs_lock
162  * [r] rendezvous_mtx
163  * [v] reads require one frozen vcpu, writes require freezing all vcpus
164  */
165 struct vm {
166 	void		*cookie;		/* (i) cpu-specific data */
167 	void		*iommu;			/* (x) iommu-specific data */
168 	struct vhpet	*vhpet;			/* (i) virtual HPET */
169 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
170 	struct vatpic	*vatpic;		/* (i) virtual atpic */
171 	struct vatpit	*vatpit;		/* (i) virtual atpit */
172 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
173 	struct vrtc	*vrtc;			/* (o) virtual RTC */
174 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
175 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
176 	cpuset_t	startup_cpus;		/* (i) [r] waiting for startup */
177 	int		suspend;		/* (i) stop VM execution */
178 	bool		dying;			/* (o) is dying */
179 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
180 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
181 	cpuset_t	rendezvous_req_cpus;	/* (x) [r] rendezvous requested */
182 	cpuset_t	rendezvous_done_cpus;	/* (x) [r] rendezvous finished */
183 	void		*rendezvous_arg;	/* (x) [r] rendezvous func/arg */
184 	vm_rendezvous_func_t rendezvous_func;
185 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
186 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) [m+v] guest address space */
187 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */
188 	struct vmspace	*vmspace;		/* (o) guest's address space */
189 	char		name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
190 	struct vcpu	**vcpu;			/* (o) guest vcpus */
191 	/* The following describe the VM's CPU topology. */
192 	uint16_t	sockets;		/* (o) num of sockets */
193 	uint16_t	cores;			/* (o) num of cores/socket */
194 	uint16_t	threads;		/* (o) num of threads/core */
195 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
196 	struct sx	mem_segs_lock;		/* (o) */
197 	struct sx	vcpus_init_lock;	/* (o) */
198 };
199 
200 #define	VMM_CTR0(vcpu, format)						\
201 	VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format)
202 
203 #define	VMM_CTR1(vcpu, format, p1)					\
204 	VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1)
205 
206 #define	VMM_CTR2(vcpu, format, p1, p2)					\
207 	VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2)
208 
209 #define	VMM_CTR3(vcpu, format, p1, p2, p3)				\
210 	VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3)
211 
212 #define	VMM_CTR4(vcpu, format, p1, p2, p3, p4)				\
213 	VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4)
214 
215 static int vmm_initialized;
216 
217 static void	vmmops_panic(void);
218 
219 static void
220 vmmops_panic(void)
221 {
222 	panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()");
223 }
224 
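/*
 * Each vmm_ops entry point is resolved once, at load time, via an ifunc:
 * the VT-x implementation is selected on Intel hosts, the SVM one on AMD
 * hosts, and the panic stub is used when neither is supported.
 */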
225 #define	DEFINE_VMMOPS_IFUNC(ret_type, opname, args)			\
226     DEFINE_IFUNC(static, ret_type, vmmops_##opname, args)		\
227     {									\
228     	if (vmm_is_intel())						\
229     		return (vmm_ops_intel.opname);				\
230     	else if (vmm_is_svm())						\
231     		return (vmm_ops_amd.opname);				\
232     	else								\
233     		return ((ret_type (*)args)vmmops_panic);		\
234     }
235 
236 DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum))
237 DEFINE_VMMOPS_IFUNC(int, modcleanup, (void))
238 DEFINE_VMMOPS_IFUNC(void, modresume, (void))
239 DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap))
240 DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap,
241     struct vm_eventinfo *info))
242 DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi))
243 DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
244     int vcpu_id))
245 DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui))
246 DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval))
247 DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val))
248 DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc))
249 DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc))
250 DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval))
251 DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val))
252 DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min,
253     vm_offset_t max))
254 DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace))
255 DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui))
256 DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic))
257 #ifdef BHYVE_SNAPSHOT
258 DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui,
259     struct vm_snapshot_meta *meta))
260 DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now))
261 #endif
262 
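/*
 * Setting CR0.TS makes the next host FPU access trap (#NM), fencing off
 * the guest FPU state while it is loaded in hardware; clts() clears the
 * bit again.  See restore_guest_fpustate()/save_guest_fpustate() below.
 */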
263 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
264 #define	fpu_stop_emulating()	clts()
265 
266 SDT_PROVIDER_DEFINE(vmm);
267 
268 static MALLOC_DEFINE(M_VM, "vm", "vm");
269 
270 /* statistics */
271 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
272 
273 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
274     NULL);
275 
276 /*
277  * Halt the guest if all vcpus are executing a HLT instruction with
278  * interrupts disabled.
279  */
280 static int halt_detection_enabled = 1;
281 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
282     &halt_detection_enabled, 0,
283     "Halt VM if all vcpus execute HLT with interrupts disabled");
284 
285 static int vmm_ipinum;
286 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
287     "IPI vector used for vcpu notifications");
288 
289 static int trace_guest_exceptions;
290 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
291     &trace_guest_exceptions, 0,
292     "Trap into hypervisor on all guest exceptions and reflect them back");
293 
294 static int trap_wbinvd;
295 SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0,
296     "WBINVD triggers a VM-exit");
297 
298 u_int vm_maxcpu;
299 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
300     &vm_maxcpu, 0, "Maximum number of vCPUs");
301 
302 static void vm_free_memmap(struct vm *vm, int ident);
303 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
304 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
305 
306 /*
307  * Upper limit on vm_maxcpu.  Limited by the use of uint16_t types for
308  * CPU counts, by the range of VPID values available to VT-x, and by the
309  * capacity of cpuset_t masks.  The call to new_unrhdr() in vpid_init() in
310  * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below.
311  */
312 #define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)
313 
314 #ifdef KTR
315 static const char *
316 vcpu_state2str(enum vcpu_state state)
317 {
318 
319 	switch (state) {
320 	case VCPU_IDLE:
321 		return ("idle");
322 	case VCPU_FROZEN:
323 		return ("frozen");
324 	case VCPU_RUNNING:
325 		return ("running");
326 	case VCPU_SLEEPING:
327 		return ("sleeping");
328 	default:
329 		return ("unknown");
330 	}
331 }
332 #endif
333 
334 static void
335 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
336 {
337 	vmmops_vlapic_cleanup(vcpu->vlapic);
338 	vmmops_vcpu_cleanup(vcpu->cookie);
339 	vcpu->cookie = NULL;
340 	if (destroy) {
341 		vmm_stat_free(vcpu->stats);
342 		fpu_save_area_free(vcpu->guestfpu);
343 		vcpu_lock_destroy(vcpu);
344 		free(vcpu, M_VM);
345 	}
346 }
347 
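/*
 * Allocate the state that persists for the lifetime of the vcpu (the
 * "(a)" and "(o)" fields above).  vcpu_init() below (re)initializes the
 * "(i)" fields and is also run when the VM is reinitialized.
 */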
348 static struct vcpu *
349 vcpu_alloc(struct vm *vm, int vcpu_id)
350 {
351 	struct vcpu *vcpu;
352 
353 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
354 	    ("vcpu_alloc: invalid vcpu %d", vcpu_id));
355 
356 	vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO);
357 	vcpu_lock_init(vcpu);
358 	vcpu->state = VCPU_IDLE;
359 	vcpu->hostcpu = NOCPU;
360 	vcpu->vcpuid = vcpu_id;
361 	vcpu->vm = vm;
362 	vcpu->guestfpu = fpu_save_area_alloc();
363 	vcpu->stats = vmm_stat_alloc();
364 	vcpu->tsc_offset = 0;
365 	return (vcpu);
366 }
367 
368 static void
369 vcpu_init(struct vcpu *vcpu)
370 {
371 	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
372 	vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie);
373 	vm_set_x2apic_state(vcpu, X2APIC_DISABLED);
374 	vcpu->reqidle = 0;
375 	vcpu->exitintinfo = 0;
376 	vcpu->nmi_pending = 0;
377 	vcpu->extint_pending = 0;
378 	vcpu->exception_pending = 0;
379 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
380 	fpu_save_area_reset(vcpu->guestfpu);
381 	vmm_stat_init(vcpu->stats);
382 }
383 
384 int
385 vcpu_trace_exceptions(struct vcpu *vcpu)
386 {
387 
388 	return (trace_guest_exceptions);
389 }
390 
391 int
392 vcpu_trap_wbinvd(struct vcpu *vcpu)
393 {
394 	return (trap_wbinvd);
395 }
396 
397 struct vm_exit *
398 vm_exitinfo(struct vcpu *vcpu)
399 {
400 	return (&vcpu->exitinfo);
401 }
402 
403 cpuset_t *
404 vm_exitinfo_cpuset(struct vcpu *vcpu)
405 {
406 	return (&vcpu->exitinfo_cpuset);
407 }
408 
409 static int
410 vmm_init(void)
411 {
412 	int error;
413 
414 	if (!vmm_is_hw_supported())
415 		return (ENXIO);
416 
417 	vm_maxcpu = mp_ncpus;
418 	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
419 
420 	if (vm_maxcpu > VM_MAXCPU) {
421 		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
422 		vm_maxcpu = VM_MAXCPU;
423 	}
424 	if (vm_maxcpu == 0)
425 		vm_maxcpu = 1;
426 
427 	vmm_host_state_init();
428 
429 	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
430 	    &IDTVEC(justreturn));
431 	if (vmm_ipinum < 0)
432 		vmm_ipinum = IPI_AST;
433 
434 	error = vmm_mem_init();
435 	if (error)
436 		return (error);
437 
438 	vmm_resume_p = vmmops_modresume;
439 
440 	return (vmmops_modinit(vmm_ipinum));
441 }
442 
443 static int
444 vmm_handler(module_t mod, int what, void *arg)
445 {
446 	int error;
447 
448 	switch (what) {
449 	case MOD_LOAD:
450 		if (vmm_is_hw_supported()) {
451 			vmmdev_init();
452 			error = vmm_init();
453 			if (error == 0)
454 				vmm_initialized = 1;
455 		} else {
456 			error = ENXIO;
457 		}
458 		break;
459 	case MOD_UNLOAD:
460 		if (vmm_is_hw_supported()) {
461 			error = vmmdev_cleanup();
462 			if (error == 0) {
463 				vmm_resume_p = NULL;
464 				iommu_cleanup();
465 				if (vmm_ipinum != IPI_AST)
466 					lapic_ipi_free(vmm_ipinum);
467 				error = vmmops_modcleanup();
468 				/*
469 				 * If module cleanup failed, prevent new
470 				 * VMs from being created.
471 				 */
472 				if (error)
473 					vmm_initialized = 0;
474 			}
475 		} else {
476 			error = 0;
477 		}
478 		break;
479 	default:
480 		error = 0;
481 		break;
482 	}
483 	return (error);
484 }
485 
486 static moduledata_t vmm_kmod = {
487 	"vmm",
488 	vmm_handler,
489 	NULL
490 };
491 
492 /*
493  * vmm initialization has the following dependencies:
494  *
495  * - VT-x initialization requires smp_rendezvous() and therefore must happen
496  *   after SMP is fully functional (after SI_SUB_SMP).
497  */
498 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
499 MODULE_VERSION(vmm, 1);
500 
501 static void
502 vm_init(struct vm *vm, bool create)
503 {
504 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
505 	vm->iommu = NULL;
506 	vm->vioapic = vioapic_init(vm);
507 	vm->vhpet = vhpet_init(vm);
508 	vm->vatpic = vatpic_init(vm);
509 	vm->vatpit = vatpit_init(vm);
510 	vm->vpmtmr = vpmtmr_init(vm);
511 	if (create)
512 		vm->vrtc = vrtc_init(vm);
513 
514 	CPU_ZERO(&vm->active_cpus);
515 	CPU_ZERO(&vm->debug_cpus);
516 	CPU_ZERO(&vm->startup_cpus);
517 
518 	vm->suspend = 0;
519 	CPU_ZERO(&vm->suspended_cpus);
520 
521 	if (!create) {
522 		for (int i = 0; i < vm->maxcpus; i++) {
523 			if (vm->vcpu[i] != NULL)
524 				vcpu_init(vm->vcpu[i]);
525 		}
526 	}
527 }
528 
529 void
530 vm_disable_vcpu_creation(struct vm *vm)
531 {
532 	sx_xlock(&vm->vcpus_init_lock);
533 	vm->dying = true;
534 	sx_xunlock(&vm->vcpus_init_lock);
535 }
536 
537 struct vcpu *
538 vm_alloc_vcpu(struct vm *vm, int vcpuid)
539 {
540 	struct vcpu *vcpu;
541 
542 	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
543 		return (NULL);
544 
545 	vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);
546 	if (__predict_true(vcpu != NULL))
547 		return (vcpu);
548 
549 	sx_xlock(&vm->vcpus_init_lock);
550 	vcpu = vm->vcpu[vcpuid];
551 	if (vcpu == NULL && !vm->dying) {
552 		vcpu = vcpu_alloc(vm, vcpuid);
553 		vcpu_init(vcpu);
554 
555 		/*
556 		 * Ensure vCPU is fully created before updating pointer
557 		 * to permit unlocked reads above.
558 		 */
559 		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
560 		    (uintptr_t)vcpu);
561 	}
562 	sx_xunlock(&vm->vcpus_init_lock);
563 	return (vcpu);
564 }
565 
566 void
567 vm_slock_vcpus(struct vm *vm)
568 {
569 	sx_slock(&vm->vcpus_init_lock);
570 }
571 
572 void
573 vm_unlock_vcpus(struct vm *vm)
574 {
575 	sx_unlock(&vm->vcpus_init_lock);
576 }
577 
578 /*
579  * The default CPU topology is a single thread per package.
580  */
581 u_int cores_per_package = 1;
582 u_int threads_per_core = 1;
583 
584 int
585 vm_create(const char *name, struct vm **retvm)
586 {
587 	struct vm *vm;
588 	struct vmspace *vmspace;
589 
590 	/*
591 	 * If vmm.ko could not be successfully initialized then don't attempt
592 	 * to create the virtual machine.
593 	 */
594 	if (!vmm_initialized)
595 		return (ENXIO);
596 
597 	if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) ==
598 	    VM_MAX_NAMELEN + 1)
599 		return (EINVAL);
600 
601 	vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48);
602 	if (vmspace == NULL)
603 		return (ENOMEM);
604 
605 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
606 	strcpy(vm->name, name);
607 	vm->vmspace = vmspace;
608 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
609 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
610 	sx_init(&vm->vcpus_init_lock, "vm vcpus");
611 	vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK |
612 	    M_ZERO);
613 
614 	vm->sockets = 1;
615 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
616 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
617 	vm->maxcpus = vm_maxcpu;
618 
619 	vm_init(vm, true);
620 
621 	*retvm = vm;
622 	return (0);
623 }
624 
625 void
626 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
627     uint16_t *threads, uint16_t *maxcpus)
628 {
629 	*sockets = vm->sockets;
630 	*cores = vm->cores;
631 	*threads = vm->threads;
632 	*maxcpus = vm->maxcpus;
633 }
634 
635 uint16_t
636 vm_get_maxcpus(struct vm *vm)
637 {
638 	return (vm->maxcpus);
639 }
640 
641 int
642 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
643     uint16_t threads, uint16_t maxcpus __unused)
644 {
645 	/* Ignore maxcpus. */
646 	if ((sockets * cores * threads) > vm->maxcpus)
647 		return (EINVAL);
648 	vm->sockets = sockets;
649 	vm->cores = cores;
650 	vm->threads = threads;
651 	return (0);
652 }
653 
654 static void
655 vm_cleanup(struct vm *vm, bool destroy)
656 {
657 	struct mem_map *mm;
658 	int i;
659 
660 	if (destroy)
661 		vm_xlock_memsegs(vm);
662 
663 	ppt_unassign_all(vm);
664 
665 	if (vm->iommu != NULL)
666 		iommu_destroy_domain(vm->iommu);
667 
668 	if (destroy)
669 		vrtc_cleanup(vm->vrtc);
670 	else
671 		vrtc_reset(vm->vrtc);
672 	vpmtmr_cleanup(vm->vpmtmr);
673 	vatpit_cleanup(vm->vatpit);
674 	vhpet_cleanup(vm->vhpet);
675 	vatpic_cleanup(vm->vatpic);
676 	vioapic_cleanup(vm->vioapic);
677 
678 	for (i = 0; i < vm->maxcpus; i++) {
679 		if (vm->vcpu[i] != NULL)
680 			vcpu_cleanup(vm->vcpu[i], destroy);
681 	}
682 
683 	vmmops_cleanup(vm->cookie);
684 
685 	/*
686 	 * System memory is removed from the guest address space only when
687 	 * the VM is destroyed. This is because the mapping remains the same
688 	 * across VM reset.
689 	 *
690 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
691 	 * so those mappings are removed on a VM reset.
692 	 */
693 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
694 		mm = &vm->mem_maps[i];
695 		if (destroy || !sysmem_mapping(vm, mm))
696 			vm_free_memmap(vm, i);
697 	}
698 
699 	if (destroy) {
700 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
701 			vm_free_memseg(vm, i);
702 		vm_unlock_memsegs(vm);
703 
704 		vmmops_vmspace_free(vm->vmspace);
705 		vm->vmspace = NULL;
706 
707 		free(vm->vcpu, M_VM);
708 		sx_destroy(&vm->vcpus_init_lock);
709 		sx_destroy(&vm->mem_segs_lock);
710 		mtx_destroy(&vm->rendezvous_mtx);
711 	}
712 }
713 
714 void
715 vm_destroy(struct vm *vm)
716 {
717 	vm_cleanup(vm, true);
718 	free(vm, M_VM);
719 }
720 
721 int
722 vm_reinit(struct vm *vm)
723 {
724 	int error;
725 
726 	/*
727 	 * A virtual machine can be reset only if all vcpus are suspended.
728 	 */
729 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
730 		vm_cleanup(vm, false);
731 		vm_init(vm, false);
732 		error = 0;
733 	} else {
734 		error = EBUSY;
735 	}
736 
737 	return (error);
738 }
739 
740 const char *
741 vm_name(struct vm *vm)
742 {
743 	return (vm->name);
744 }
745 
746 void
747 vm_slock_memsegs(struct vm *vm)
748 {
749 	sx_slock(&vm->mem_segs_lock);
750 }
751 
752 void
753 vm_xlock_memsegs(struct vm *vm)
754 {
755 	sx_xlock(&vm->mem_segs_lock);
756 }
757 
758 void
759 vm_unlock_memsegs(struct vm *vm)
760 {
761 	sx_unlock(&vm->mem_segs_lock);
762 }
763 
764 int
765 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
766 {
767 	vm_object_t obj;
768 
769 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
770 		return (ENOMEM);
771 	else
772 		return (0);
773 }
774 
775 int
776 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
777 {
778 
779 	vmm_mmio_free(vm->vmspace, gpa, len);
780 	return (0);
781 }
782 
783 /*
784  * Return 'true' if 'gpa' is allocated in the guest address space.
785  *
786  * This function is called in the context of a running vcpu, which acts as
787  * an implicit lock on 'vm->mem_maps[]'.
788  */
789 bool
790 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
791 {
792 	struct vm *vm = vcpu->vm;
793 	struct mem_map *mm;
794 	int i;
795 
796 #ifdef INVARIANTS
797 	int hostcpu, state;
798 	state = vcpu_get_state(vcpu, &hostcpu);
799 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
800 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
801 #endif
802 
803 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
804 		mm = &vm->mem_maps[i];
805 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
806 			return (true);		/* 'gpa' is sysmem or devmem */
807 	}
808 
809 	if (ppt_is_mmio(vm, gpa))
810 		return (true);			/* 'gpa' is pci passthru mmio */
811 
812 	return (false);
813 }
814 
815 int
816 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
817 {
818 	struct mem_seg *seg;
819 	vm_object_t obj;
820 
821 	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
822 
823 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
824 		return (EINVAL);
825 
826 	if (len == 0 || (len & PAGE_MASK))
827 		return (EINVAL);
828 
829 	seg = &vm->mem_segs[ident];
830 	if (seg->object != NULL) {
831 		if (seg->len == len && seg->sysmem == sysmem)
832 			return (EEXIST);
833 		else
834 			return (EINVAL);
835 	}
836 
837 	obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT);
838 	if (obj == NULL)
839 		return (ENOMEM);
840 
841 	seg->len = len;
842 	seg->object = obj;
843 	seg->sysmem = sysmem;
844 	return (0);
845 }
846 
847 int
848 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
849     vm_object_t *objptr)
850 {
851 	struct mem_seg *seg;
852 
853 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
854 
855 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
856 		return (EINVAL);
857 
858 	seg = &vm->mem_segs[ident];
859 	if (len)
860 		*len = seg->len;
861 	if (sysmem)
862 		*sysmem = seg->sysmem;
863 	if (objptr)
864 		*objptr = seg->object;
865 	return (0);
866 }
867 
868 void
869 vm_free_memseg(struct vm *vm, int ident)
870 {
871 	struct mem_seg *seg;
872 
873 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
874 	    ("%s: invalid memseg ident %d", __func__, ident));
875 
876 	seg = &vm->mem_segs[ident];
877 	if (seg->object != NULL) {
878 		vm_object_deallocate(seg->object);
879 		bzero(seg, sizeof(struct mem_seg));
880 	}
881 }
882 
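/*
 * Map 'len' bytes of memory segment 'segid', starting at offset 'first'
 * within the segment, into the guest physical address space at 'gpa'.
 * The mapping is wired if VM_MEMMAP_F_WIRED is set in 'flags'.
 */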
883 int
884 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
885     size_t len, int prot, int flags)
886 {
887 	struct mem_seg *seg;
888 	struct mem_map *m, *map;
889 	vm_ooffset_t last;
890 	int i, error;
891 
892 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
893 		return (EINVAL);
894 
895 	if (flags & ~VM_MEMMAP_F_WIRED)
896 		return (EINVAL);
897 
898 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
899 		return (EINVAL);
900 
901 	seg = &vm->mem_segs[segid];
902 	if (seg->object == NULL)
903 		return (EINVAL);
904 
905 	last = first + len;
906 	if (first < 0 || first >= last || last > seg->len)
907 		return (EINVAL);
908 
909 	if ((gpa | first | last) & PAGE_MASK)
910 		return (EINVAL);
911 
912 	map = NULL;
913 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
914 		m = &vm->mem_maps[i];
915 		if (m->len == 0) {
916 			map = m;
917 			break;
918 		}
919 	}
920 
921 	if (map == NULL)
922 		return (ENOSPC);
923 
924 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
925 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
926 	if (error != KERN_SUCCESS)
927 		return (EFAULT);
928 
929 	vm_object_reference(seg->object);
930 
931 	if (flags & VM_MEMMAP_F_WIRED) {
932 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
933 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
934 		if (error != KERN_SUCCESS) {
935 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
936 			return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
937 			    EFAULT);
938 		}
939 	}
940 
941 	map->gpa = gpa;
942 	map->len = len;
943 	map->segoff = first;
944 	map->segid = segid;
945 	map->prot = prot;
946 	map->flags = flags;
947 	return (0);
948 }
949 
950 int
951 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
952 {
953 	struct mem_map *m;
954 	int i;
955 
956 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
957 		m = &vm->mem_maps[i];
958 		if (m->gpa == gpa && m->len == len &&
959 		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
960 			vm_free_memmap(vm, i);
961 			return (0);
962 		}
963 	}
964 
965 	return (EINVAL);
966 }
967 
968 int
969 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
970     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
971 {
972 	struct mem_map *mm, *mmnext;
973 	int i;
974 
975 	mmnext = NULL;
976 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
977 		mm = &vm->mem_maps[i];
978 		if (mm->len == 0 || mm->gpa < *gpa)
979 			continue;
980 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
981 			mmnext = mm;
982 	}
983 
984 	if (mmnext != NULL) {
985 		*gpa = mmnext->gpa;
986 		if (segid)
987 			*segid = mmnext->segid;
988 		if (segoff)
989 			*segoff = mmnext->segoff;
990 		if (len)
991 			*len = mmnext->len;
992 		if (prot)
993 			*prot = mmnext->prot;
994 		if (flags)
995 			*flags = mmnext->flags;
996 		return (0);
997 	} else {
998 		return (ENOENT);
999 	}
1000 }
1001 
1002 static void
1003 vm_free_memmap(struct vm *vm, int ident)
1004 {
1005 	struct mem_map *mm;
1006 	int error __diagused;
1007 
1008 	mm = &vm->mem_maps[ident];
1009 	if (mm->len) {
1010 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
1011 		    mm->gpa + mm->len);
1012 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
1013 		    __func__, error));
1014 		bzero(mm, sizeof(struct mem_map));
1015 	}
1016 }
1017 
1018 static __inline bool
1019 sysmem_mapping(struct vm *vm, struct mem_map *mm)
1020 {
1021 
1022 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
1023 		return (true);
1024 	else
1025 		return (false);
1026 }
1027 
1028 vm_paddr_t
1029 vmm_sysmem_maxaddr(struct vm *vm)
1030 {
1031 	struct mem_map *mm;
1032 	vm_paddr_t maxaddr;
1033 	int i;
1034 
1035 	maxaddr = 0;
1036 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1037 		mm = &vm->mem_maps[i];
1038 		if (sysmem_mapping(vm, mm)) {
1039 			if (maxaddr < mm->gpa + mm->len)
1040 				maxaddr = mm->gpa + mm->len;
1041 		}
1042 	}
1043 	return (maxaddr);
1044 }
1045 
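/*
 * Establish or tear down IOMMU mappings for all wired sysmem mappings of
 * the VM.  This is done when the first passthrough device is assigned to
 * the VM and when the last one is unassigned.
 */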
1046 static void
1047 vm_iommu_modify(struct vm *vm, bool map)
1048 {
1049 	int i, sz;
1050 	vm_paddr_t gpa, hpa;
1051 	struct mem_map *mm;
1052 	void *vp, *cookie, *host_domain;
1053 
1054 	sz = PAGE_SIZE;
1055 	host_domain = iommu_host_domain();
1056 
1057 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1058 		mm = &vm->mem_maps[i];
1059 		if (!sysmem_mapping(vm, mm))
1060 			continue;
1061 
1062 		if (map) {
1063 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1064 			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
1065 			    mm->gpa, mm->len, mm->flags));
1066 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1067 				continue;
1068 			mm->flags |= VM_MEMMAP_F_IOMMU;
1069 		} else {
1070 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1071 				continue;
1072 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
1073 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1074 			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
1075 			    mm->gpa, mm->len, mm->flags));
1076 		}
1077 
1078 		gpa = mm->gpa;
1079 		while (gpa < mm->gpa + mm->len) {
1080 			vp = vm_gpa_hold_global(vm, gpa, PAGE_SIZE,
1081 			    VM_PROT_WRITE, &cookie);
1082 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
1083 			    vm_name(vm), gpa));
1084 
1085 			vm_gpa_release(cookie);
1086 
1087 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
1088 			if (map) {
1089 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1090 			} else {
1091 				iommu_remove_mapping(vm->iommu, gpa, sz);
1092 			}
1093 
1094 			gpa += PAGE_SIZE;
1095 		}
1096 	}
1097 
1098 	/*
1099 	 * Invalidate the cached translations associated with the domain
1100 	 * from which pages were removed.
1101 	 */
1102 	if (map)
1103 		iommu_invalidate_tlb(host_domain);
1104 	else
1105 		iommu_invalidate_tlb(vm->iommu);
1106 }
1107 
1108 #define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), false)
1109 #define	vm_iommu_map(vm)	vm_iommu_modify((vm), true)
1110 
1111 int
1112 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
1113 {
1114 	int error;
1115 
1116 	error = ppt_unassign_device(vm, bus, slot, func);
1117 	if (error)
1118 		return (error);
1119 
1120 	if (ppt_assigned_devices(vm) == 0)
1121 		vm_iommu_unmap(vm);
1122 
1123 	return (0);
1124 }
1125 
1126 int
1127 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
1128 {
1129 	int error;
1130 	vm_paddr_t maxaddr;
1131 
1132 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1133 	if (ppt_assigned_devices(vm) == 0) {
1134 		KASSERT(vm->iommu == NULL,
1135 		    ("vm_assign_pptdev: iommu must be NULL"));
1136 		maxaddr = vmm_sysmem_maxaddr(vm);
1137 		vm->iommu = iommu_create_domain(maxaddr);
1138 		if (vm->iommu == NULL)
1139 			return (ENXIO);
1140 		vm_iommu_map(vm);
1141 	}
1142 
1143 	error = ppt_assign_device(vm, bus, slot, func);
1144 	return (error);
1145 }
1146 
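/*
 * Wire the host page backing guest physical address 'gpa' and return a
 * pointer to it in the direct map.  The caller releases the page with
 * vm_gpa_release() on the returned cookie.  The requested range may not
 * cross a page boundary.
 */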
1147 static void *
1148 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1149     void **cookie)
1150 {
1151 	int i, count, pageoff;
1152 	struct mem_map *mm;
1153 	vm_page_t m;
1154 
1155 	pageoff = gpa & PAGE_MASK;
1156 	if (len > PAGE_SIZE - pageoff)
1157 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
1158 
1159 	count = 0;
1160 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1161 		mm = &vm->mem_maps[i];
1162 		if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
1163 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
1164 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
1165 			break;
1166 		}
1167 	}
1168 
1169 	if (count == 1) {
1170 		*cookie = m;
1171 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
1172 	} else {
1173 		*cookie = NULL;
1174 		return (NULL);
1175 	}
1176 }
1177 
1178 void *
1179 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
1180     void **cookie)
1181 {
1182 #ifdef INVARIANTS
1183 	/*
1184 	 * The current vcpu should be frozen to ensure the stability of
1185 	 * the 'vm->mem_maps[]' array.
1186 	 */
1187 	int state = vcpu_get_state(vcpu, NULL);
1188 	KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
1189 	    __func__, state));
1190 #endif
1191 	return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
1192 }
1193 
1194 void *
1195 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
1196     void **cookie)
1197 {
1198 	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
1199 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
1200 }
1201 
1202 void
1203 vm_gpa_release(void *cookie)
1204 {
1205 	vm_page_t m = cookie;
1206 
1207 	vm_page_unwire(m, PQ_ACTIVE);
1208 }
1209 
1210 int
1211 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
1212 {
1213 
1214 	if (reg >= VM_REG_LAST)
1215 		return (EINVAL);
1216 
1217 	return (vmmops_getreg(vcpu->cookie, reg, retval));
1218 }
1219 
1220 int
1221 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
1222 {
1223 	int error;
1224 
1225 	if (reg >= VM_REG_LAST)
1226 		return (EINVAL);
1227 
1228 	error = vmmops_setreg(vcpu->cookie, reg, val);
1229 	if (error || reg != VM_REG_GUEST_RIP)
1230 		return (error);
1231 
1232 	/* Set 'nextrip' to match the value of %rip */
1233 	VMM_CTR1(vcpu, "Setting nextrip to %#lx", val);
1234 	vcpu->nextrip = val;
1235 	return (0);
1236 }
1237 
1238 static bool
1239 is_descriptor_table(int reg)
1240 {
1241 
1242 	switch (reg) {
1243 	case VM_REG_GUEST_IDTR:
1244 	case VM_REG_GUEST_GDTR:
1245 		return (true);
1246 	default:
1247 		return (false);
1248 	}
1249 }
1250 
1251 static bool
1252 is_segment_register(int reg)
1253 {
1254 
1255 	switch (reg) {
1256 	case VM_REG_GUEST_ES:
1257 	case VM_REG_GUEST_CS:
1258 	case VM_REG_GUEST_SS:
1259 	case VM_REG_GUEST_DS:
1260 	case VM_REG_GUEST_FS:
1261 	case VM_REG_GUEST_GS:
1262 	case VM_REG_GUEST_TR:
1263 	case VM_REG_GUEST_LDTR:
1264 		return (true);
1265 	default:
1266 		return (false);
1267 	}
1268 }
1269 
1270 int
1271 vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc)
1272 {
1273 
1274 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1275 		return (EINVAL);
1276 
1277 	return (vmmops_getdesc(vcpu->cookie, reg, desc));
1278 }
1279 
1280 int
1281 vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc)
1282 {
1283 
1284 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1285 		return (EINVAL);
1286 
1287 	return (vmmops_setdesc(vcpu->cookie, reg, desc));
1288 }
1289 
1290 static void
1291 restore_guest_fpustate(struct vcpu *vcpu)
1292 {
1293 
1294 	/* flush host state to the pcb */
1295 	fpuexit(curthread);
1296 
1297 	/* restore guest FPU state */
1298 	fpu_stop_emulating();
1299 	fpurestore(vcpu->guestfpu);
1300 
1301 	/* restore guest XCR0 if XSAVE is enabled in the host */
1302 	if (rcr4() & CR4_XSAVE)
1303 		load_xcr(0, vcpu->guest_xcr0);
1304 
1305 	/*
1306 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1307 	 * to trap any access to the FPU by the host.
1308 	 */
1309 	fpu_start_emulating();
1310 }
1311 
1312 static void
1313 save_guest_fpustate(struct vcpu *vcpu)
1314 {
1315 
1316 	if ((rcr0() & CR0_TS) == 0)
1317 		panic("fpu emulation not enabled in host!");
1318 
1319 	/* save guest XCR0 and restore host XCR0 */
1320 	if (rcr4() & CR4_XSAVE) {
1321 		vcpu->guest_xcr0 = rxcr(0);
1322 		load_xcr(0, vmm_get_host_xcr0());
1323 	}
1324 
1325 	/* save guest FPU state */
1326 	fpu_stop_emulating();
1327 	fpusave(vcpu->guestfpu);
1328 	fpu_start_emulating();
1329 }
1330 
1331 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1332 
1333 static int
1334 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
1335     bool from_idle)
1336 {
1337 	int error;
1338 
1339 	vcpu_assert_locked(vcpu);
1340 
1341 	/*
1342 	 * State transitions from the vmmdev_ioctl() must always begin from
1343 	 * the VCPU_IDLE state. This guarantees that there is only a single
1344 	 * ioctl() operating on a vcpu at any point.
1345 	 */
1346 	if (from_idle) {
1347 		while (vcpu->state != VCPU_IDLE) {
1348 			vcpu->reqidle = 1;
1349 			vcpu_notify_event_locked(vcpu, false);
1350 			VMM_CTR1(vcpu, "vcpu state change from %s to "
1351 			    "idle requested", vcpu_state2str(vcpu->state));
1352 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1353 		}
1354 	} else {
1355 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1356 		    "vcpu idle state"));
1357 	}
1358 
1359 	if (vcpu->state == VCPU_RUNNING) {
1360 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1361 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1362 	} else {
1363 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1364 		    "vcpu that is not running", vcpu->hostcpu));
1365 	}
1366 
1367 	/*
1368 	 * The following state transitions are allowed:
1369 	 * IDLE -> FROZEN -> IDLE
1370 	 * FROZEN -> RUNNING -> FROZEN
1371 	 * FROZEN -> SLEEPING -> FROZEN
1372 	 */
1373 	switch (vcpu->state) {
1374 	case VCPU_IDLE:
1375 	case VCPU_RUNNING:
1376 	case VCPU_SLEEPING:
1377 		error = (newstate != VCPU_FROZEN);
1378 		break;
1379 	case VCPU_FROZEN:
1380 		error = (newstate == VCPU_FROZEN);
1381 		break;
1382 	default:
1383 		error = 1;
1384 		break;
1385 	}
1386 
1387 	if (error)
1388 		return (EBUSY);
1389 
1390 	VMM_CTR2(vcpu, "vcpu state changed from %s to %s",
1391 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1392 
1393 	vcpu->state = newstate;
1394 	if (newstate == VCPU_RUNNING)
1395 		vcpu->hostcpu = curcpu;
1396 	else
1397 		vcpu->hostcpu = NOCPU;
1398 
1399 	if (newstate == VCPU_IDLE)
1400 		wakeup(&vcpu->state);
1401 
1402 	return (0);
1403 }
1404 
1405 static void
1406 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
1407 {
1408 	int error;
1409 
1410 	if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
1411 		panic("Error %d setting state to %d", error, newstate);
1412 }
1413 
1414 static void
1415 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1416 {
1417 	int error;
1418 
1419 	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1420 		panic("Error %d setting state to %d", error, newstate);
1421 }
1422 
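/*
 * Take part in a pending rendezvous: each targeted vcpu runs the
 * rendezvous function once and records itself in 'rendezvous_done_cpus';
 * the last vcpu to finish clears the request and wakes up the waiters.
 */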
1423 static int
1424 vm_handle_rendezvous(struct vcpu *vcpu)
1425 {
1426 	struct vm *vm = vcpu->vm;
1427 	struct thread *td;
1428 	int error, vcpuid;
1429 
1430 	error = 0;
1431 	vcpuid = vcpu->vcpuid;
1432 	td = curthread;
1433 	mtx_lock(&vm->rendezvous_mtx);
1434 	while (vm->rendezvous_func != NULL) {
1435 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1436 		CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus);
1437 
1438 		if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1439 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1440 			VMM_CTR0(vcpu, "Calling rendezvous func");
1441 			(*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
1442 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1443 		}
1444 		if (CPU_CMP(&vm->rendezvous_req_cpus,
1445 		    &vm->rendezvous_done_cpus) == 0) {
1446 			VMM_CTR0(vcpu, "Rendezvous completed");
1447 			CPU_ZERO(&vm->rendezvous_req_cpus);
1448 			vm->rendezvous_func = NULL;
1449 			wakeup(&vm->rendezvous_func);
1450 			break;
1451 		}
1452 		VMM_CTR0(vcpu, "Wait for rendezvous completion");
1453 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1454 		    "vmrndv", hz);
1455 		if (td_ast_pending(td, TDA_SUSPEND)) {
1456 			mtx_unlock(&vm->rendezvous_mtx);
1457 			error = thread_check_susp(td, true);
1458 			if (error != 0)
1459 				return (error);
1460 			mtx_lock(&vm->rendezvous_mtx);
1461 		}
1462 	}
1463 	mtx_unlock(&vm->rendezvous_mtx);
1464 	return (0);
1465 }
1466 
1467 /*
1468  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1469  */
1470 static int
1471 vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu)
1472 {
1473 	struct vm *vm = vcpu->vm;
1474 	const char *wmesg;
1475 	struct thread *td;
1476 	int error, t, vcpuid, vcpu_halted, vm_halted;
1477 
1478 	vcpuid = vcpu->vcpuid;
1479 	vcpu_halted = 0;
1480 	vm_halted = 0;
1481 	error = 0;
1482 	td = curthread;
1483 
1484 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1485 
1486 	vcpu_lock(vcpu);
1487 	while (1) {
1488 		/*
1489 		 * Do a final check for pending NMI or interrupts before
1490 		 * really putting this thread to sleep. Also check for
1491 		 * software events that would cause this vcpu to wake up.
1492 		 *
1493 		 * These interrupts/events could have happened after the
1494 		 * vcpu returned from vmmops_run() and before it acquired the
1495 		 * vcpu lock above.
1496 		 */
1497 		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
1498 			break;
1499 		if (vm_nmi_pending(vcpu))
1500 			break;
1501 		if (!intr_disabled) {
1502 			if (vm_extint_pending(vcpu) ||
1503 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1504 				break;
1505 			}
1506 		}
1507 
1508 		/* Don't go to sleep if the vcpu thread needs to yield */
1509 		if (vcpu_should_yield(vcpu))
1510 			break;
1511 
1512 		if (vcpu_debugged(vcpu))
1513 			break;
1514 
1515 		/*
1516 		 * Some Linux guests implement "halt" by having all vcpus
1517 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1518 		 * track of the vcpus that have entered this state. When all
1519 		 * vcpus enter the halted state the virtual machine is halted.
1520 		 */
1521 		if (intr_disabled) {
1522 			wmesg = "vmhalt";
1523 			VMM_CTR0(vcpu, "Halted");
1524 			if (!vcpu_halted && halt_detection_enabled) {
1525 				vcpu_halted = 1;
1526 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1527 			}
1528 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1529 				vm_halted = 1;
1530 				break;
1531 			}
1532 		} else {
1533 			wmesg = "vmidle";
1534 		}
1535 
1536 		t = ticks;
1537 		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1538 		/*
1539 		 * XXX msleep_spin() cannot be interrupted by signals, so
1540 		 * wake up periodically to check pending signals.
1541 		 */
1542 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1543 		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1544 		vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t);
1545 		if (td_ast_pending(td, TDA_SUSPEND)) {
1546 			vcpu_unlock(vcpu);
1547 			error = thread_check_susp(td, false);
1548 			if (error != 0) {
1549 				if (vcpu_halted) {
1550 					CPU_CLR_ATOMIC(vcpuid,
1551 					    &vm->halted_cpus);
1552 				}
1553 				return (error);
1554 			}
1555 			vcpu_lock(vcpu);
1556 		}
1557 	}
1558 
1559 	if (vcpu_halted)
1560 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1561 
1562 	vcpu_unlock(vcpu);
1563 
1564 	if (vm_halted)
1565 		vm_suspend(vm, VM_SUSPEND_HALT);
1566 
1567 	return (0);
1568 }
1569 
1570 static int
1571 vm_handle_paging(struct vcpu *vcpu, bool *retu)
1572 {
1573 	struct vm *vm = vcpu->vm;
1574 	int rv, ftype;
1575 	struct vm_map *map;
1576 	struct vm_exit *vme;
1577 
1578 	vme = &vcpu->exitinfo;
1579 
1580 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1581 	    __func__, vme->inst_length));
1582 
1583 	ftype = vme->u.paging.fault_type;
1584 	KASSERT(ftype == VM_PROT_READ ||
1585 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1586 	    ("vm_handle_paging: invalid fault_type %d", ftype));
1587 
1588 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1589 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1590 		    vme->u.paging.gpa, ftype);
1591 		if (rv == 0) {
1592 			VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx",
1593 			    ftype == VM_PROT_READ ? "accessed" : "dirty",
1594 			    vme->u.paging.gpa);
1595 			goto done;
1596 		}
1597 	}
1598 
1599 	map = &vm->vmspace->vm_map;
1600 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
1601 
1602 	VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, "
1603 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1604 
1605 	if (rv != KERN_SUCCESS)
1606 		return (EFAULT);
1607 done:
1608 	return (0);
1609 }
1610 
1611 static int
1612 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
1613 {
1614 	struct vie *vie;
1615 	struct vm_exit *vme;
1616 	uint64_t gla, gpa, cs_base;
1617 	struct vm_guest_paging *paging;
1618 	mem_region_read_t mread;
1619 	mem_region_write_t mwrite;
1620 	enum vm_cpu_mode cpu_mode;
1621 	int cs_d, error, fault;
1622 
1623 	vme = &vcpu->exitinfo;
1624 
1625 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1626 	    __func__, vme->inst_length));
1627 
1628 	gla = vme->u.inst_emul.gla;
1629 	gpa = vme->u.inst_emul.gpa;
1630 	cs_base = vme->u.inst_emul.cs_base;
1631 	cs_d = vme->u.inst_emul.cs_d;
1632 	vie = &vme->u.inst_emul.vie;
1633 	paging = &vme->u.inst_emul.paging;
1634 	cpu_mode = paging->cpu_mode;
1635 
1636 	VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa);
1637 
1638 	/* Fetch, decode and emulate the faulting instruction */
1639 	if (vie->num_valid == 0) {
1640 		error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base,
1641 		    VIE_INST_SIZE, vie, &fault);
1642 	} else {
1643 		/*
1644 		 * The instruction bytes have already been copied into 'vie'
1645 		 */
1646 		error = fault = 0;
1647 	}
1648 	if (error || fault)
1649 		return (error);
1650 
1651 	if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) {
1652 		VMM_CTR1(vcpu, "Error decoding instruction at %#lx",
1653 		    vme->rip + cs_base);
1654 		*retu = true;	    /* dump instruction bytes in userspace */
1655 		return (0);
1656 	}
1657 
1658 	/*
1659 	 * Update 'nextrip' based on the length of the emulated instruction.
1660 	 */
1661 	vme->inst_length = vie->num_processed;
1662 	vcpu->nextrip += vie->num_processed;
1663 	VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding",
1664 	    vcpu->nextrip);
1665 
1666 	/* return to userland unless this is an in-kernel emulated device */
1667 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1668 		mread = lapic_mmio_read;
1669 		mwrite = lapic_mmio_write;
1670 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1671 		mread = vioapic_mmio_read;
1672 		mwrite = vioapic_mmio_write;
1673 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1674 		mread = vhpet_mmio_read;
1675 		mwrite = vhpet_mmio_write;
1676 	} else {
1677 		*retu = true;
1678 		return (0);
1679 	}
1680 
1681 	error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite,
1682 	    retu);
1683 
1684 	return (error);
1685 }
1686 
1687 static int
1688 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
1689 {
1690 	struct vm *vm = vcpu->vm;
1691 	int error, i;
1692 	struct thread *td;
1693 
1694 	error = 0;
1695 	td = curthread;
1696 
1697 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
1698 
1699 	/*
1700 	 * Wait until all 'active_cpus' have suspended themselves.
1701 	 *
1702 	 * Since a VM may be suspended at any time, including when one or
1703 	 * more vcpus are doing a rendezvous, we need to call the rendezvous
1704 	 * handler while we are waiting, in order to prevent a deadlock.
1705 	 */
1706 	vcpu_lock(vcpu);
1707 	while (error == 0) {
1708 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1709 			VMM_CTR0(vcpu, "All vcpus suspended");
1710 			break;
1711 		}
1712 
1713 		if (vm->rendezvous_func == NULL) {
1714 			VMM_CTR0(vcpu, "Sleeping during suspend");
1715 			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1716 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1717 			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1718 			if (td_ast_pending(td, TDA_SUSPEND)) {
1719 				vcpu_unlock(vcpu);
1720 				error = thread_check_susp(td, false);
1721 				vcpu_lock(vcpu);
1722 			}
1723 		} else {
1724 			VMM_CTR0(vcpu, "Rendezvous during suspend");
1725 			vcpu_unlock(vcpu);
1726 			error = vm_handle_rendezvous(vcpu);
1727 			vcpu_lock(vcpu);
1728 		}
1729 	}
1730 	vcpu_unlock(vcpu);
1731 
1732 	/*
1733 	 * Wake up the other sleeping vcpus and return to userspace.
1734 	 */
1735 	for (i = 0; i < vm->maxcpus; i++) {
1736 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1737 			vcpu_notify_event(vm_vcpu(vm, i), false);
1738 		}
1739 	}
1740 
1741 	*retu = true;
1742 	return (error);
1743 }
1744 
1745 static int
1746 vm_handle_reqidle(struct vcpu *vcpu, bool *retu)
1747 {
1748 	vcpu_lock(vcpu);
1749 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1750 	vcpu->reqidle = 0;
1751 	vcpu_unlock(vcpu);
1752 	*retu = true;
1753 	return (0);
1754 }
1755 
1756 int
1757 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1758 {
1759 	int i;
1760 
1761 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1762 		return (EINVAL);
1763 
1764 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1765 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1766 		    vm->suspend, how);
1767 		return (EALREADY);
1768 	}
1769 
1770 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1771 
1772 	/*
1773 	 * Notify all active vcpus that they are now suspended.
1774 	 */
1775 	for (i = 0; i < vm->maxcpus; i++) {
1776 		if (CPU_ISSET(i, &vm->active_cpus))
1777 			vcpu_notify_event(vm_vcpu(vm, i), false);
1778 	}
1779 
1780 	return (0);
1781 }
1782 
1783 void
1784 vm_exit_suspended(struct vcpu *vcpu, uint64_t rip)
1785 {
1786 	struct vm *vm = vcpu->vm;
1787 	struct vm_exit *vmexit;
1788 
1789 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1790 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1791 
1792 	vmexit = vm_exitinfo(vcpu);
1793 	vmexit->rip = rip;
1794 	vmexit->inst_length = 0;
1795 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1796 	vmexit->u.suspended.how = vm->suspend;
1797 }
1798 
1799 void
1800 vm_exit_debug(struct vcpu *vcpu, uint64_t rip)
1801 {
1802 	struct vm_exit *vmexit;
1803 
1804 	vmexit = vm_exitinfo(vcpu);
1805 	vmexit->rip = rip;
1806 	vmexit->inst_length = 0;
1807 	vmexit->exitcode = VM_EXITCODE_DEBUG;
1808 }
1809 
1810 void
1811 vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip)
1812 {
1813 	struct vm_exit *vmexit;
1814 
1815 	vmexit = vm_exitinfo(vcpu);
1816 	vmexit->rip = rip;
1817 	vmexit->inst_length = 0;
1818 	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1819 	vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1);
1820 }
1821 
1822 void
1823 vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip)
1824 {
1825 	struct vm_exit *vmexit;
1826 
1827 	vmexit = vm_exitinfo(vcpu);
1828 	vmexit->rip = rip;
1829 	vmexit->inst_length = 0;
1830 	vmexit->exitcode = VM_EXITCODE_REQIDLE;
1831 	vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1);
1832 }
1833 
1834 void
1835 vm_exit_astpending(struct vcpu *vcpu, uint64_t rip)
1836 {
1837 	struct vm_exit *vmexit;
1838 
1839 	vmexit = vm_exitinfo(vcpu);
1840 	vmexit->rip = rip;
1841 	vmexit->inst_length = 0;
1842 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1843 	vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1);
1844 }
1845 
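/*
 * Run the vcpu: enter the guest via vmmops_run() and handle exits in the
 * kernel when possible, looping until an exit must be completed in
 * userspace ('retu') or an error occurs.
 */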
1846 int
1847 vm_run(struct vcpu *vcpu)
1848 {
1849 	struct vm *vm = vcpu->vm;
1850 	struct vm_eventinfo evinfo;
1851 	int error, vcpuid;
1852 	struct pcb *pcb;
1853 	uint64_t tscval;
1854 	struct vm_exit *vme;
1855 	bool retu, intr_disabled;
1856 	pmap_t pmap;
1857 
1858 	vcpuid = vcpu->vcpuid;
1859 
1860 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1861 		return (EINVAL);
1862 
1863 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1864 		return (EINVAL);
1865 
1866 	pmap = vmspace_pmap(vm->vmspace);
1867 	vme = &vcpu->exitinfo;
1868 	evinfo.rptr = &vm->rendezvous_req_cpus;
1869 	evinfo.sptr = &vm->suspend;
1870 	evinfo.iptr = &vcpu->reqidle;
1871 restart:
1872 	critical_enter();
1873 
1874 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1875 	    ("vm_run: absurd pm_active"));
1876 
1877 	tscval = rdtsc();
1878 
1879 	pcb = PCPU_GET(curpcb);
1880 	set_pcb_flags(pcb, PCB_FULL_IRET);
1881 
1882 	restore_guest_fpustate(vcpu);
1883 
1884 	vcpu_require_state(vcpu, VCPU_RUNNING);
1885 	error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo);
1886 	vcpu_require_state(vcpu, VCPU_FROZEN);
1887 
1888 	save_guest_fpustate(vcpu);
1889 
1890 	vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1891 
1892 	critical_exit();
1893 
1894 	if (error == 0) {
1895 		retu = false;
1896 		vcpu->nextrip = vme->rip + vme->inst_length;
1897 		switch (vme->exitcode) {
1898 		case VM_EXITCODE_REQIDLE:
1899 			error = vm_handle_reqidle(vcpu, &retu);
1900 			break;
1901 		case VM_EXITCODE_SUSPENDED:
1902 			error = vm_handle_suspend(vcpu, &retu);
1903 			break;
1904 		case VM_EXITCODE_IOAPIC_EOI:
1905 			vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector);
1906 			break;
1907 		case VM_EXITCODE_RENDEZVOUS:
1908 			error = vm_handle_rendezvous(vcpu);
1909 			break;
1910 		case VM_EXITCODE_HLT:
1911 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1912 			error = vm_handle_hlt(vcpu, intr_disabled, &retu);
1913 			break;
1914 		case VM_EXITCODE_PAGING:
1915 			error = vm_handle_paging(vcpu, &retu);
1916 			break;
1917 		case VM_EXITCODE_INST_EMUL:
1918 			error = vm_handle_inst_emul(vcpu, &retu);
1919 			break;
1920 		case VM_EXITCODE_INOUT:
1921 		case VM_EXITCODE_INOUT_STR:
1922 			error = vm_handle_inout(vcpu, vme, &retu);
1923 			break;
1924 		case VM_EXITCODE_MONITOR:
1925 		case VM_EXITCODE_MWAIT:
1926 		case VM_EXITCODE_VMINSN:
1927 			vm_inject_ud(vcpu);
1928 			break;
1929 		default:
1930 			retu = true;	/* handled in userland */
1931 			break;
1932 		}
1933 	}
1934 
1935 	/*
1936 	 * Handling VM_EXITCODE_INST_EMUL may access the local APIC, which can
1937 	 * transform the exit code into VM_EXITCODE_IPI.
1938 	 */
1939 	if (error == 0 && vme->exitcode == VM_EXITCODE_IPI)
1940 		error = vm_handle_ipi(vcpu, vme, &retu);
1941 
1942 	if (error == 0 && retu == false)
1943 		goto restart;
1944 
1945 	vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1);
1946 	VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode);
1947 
1948 	return (error);
1949 }
1950 
1951 int
1952 vm_restart_instruction(struct vcpu *vcpu)
1953 {
1954 	enum vcpu_state state;
1955 	uint64_t rip;
1956 	int error __diagused;
1957 
1958 	state = vcpu_get_state(vcpu, NULL);
1959 	if (state == VCPU_RUNNING) {
1960 		/*
1961 		 * When a vcpu is "running" the next instruction is determined
1962 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
1963 		 * Thus setting 'inst_length' to zero will cause the current
1964 		 * instruction to be restarted.
1965 		 */
1966 		vcpu->exitinfo.inst_length = 0;
1967 		VMM_CTR1(vcpu, "restarting instruction at %#lx by "
1968 		    "setting inst_length to zero", vcpu->exitinfo.rip);
1969 	} else if (state == VCPU_FROZEN) {
1970 		/*
1971 		 * When a vcpu is "frozen" it is outside the critical section
1972 		 * around vmmops_run() and 'nextrip' points to the next
1973 		 * instruction. Thus instruction restart is achieved by setting
1974 		 * 'nextrip' to the vcpu's %rip.
1975 		 */
1976 		error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip);
1977 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
1978 		VMM_CTR2(vcpu, "restarting instruction by updating "
1979 		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
1980 		vcpu->nextrip = rip;
1981 	} else {
1982 		panic("%s: invalid state %d", __func__, state);
1983 	}
1984 	return (0);
1985 }
1986 
1987 int
1988 vm_exit_intinfo(struct vcpu *vcpu, uint64_t info)
1989 {
1990 	int type, vector;
1991 
1992 	if (info & VM_INTINFO_VALID) {
1993 		type = info & VM_INTINFO_TYPE;
1994 		vector = info & 0xff;
1995 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1996 			return (EINVAL);
1997 		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1998 			return (EINVAL);
1999 		if (info & VM_INTINFO_RSVD)
2000 			return (EINVAL);
2001 	} else {
2002 		info = 0;
2003 	}
2004 	VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info);
2005 	vcpu->exitintinfo = info;
2006 	return (0);
2007 }
2008 
2009 enum exc_class {
2010 	EXC_BENIGN,
2011 	EXC_CONTRIBUTORY,
2012 	EXC_PAGEFAULT
2013 };
2014 
2015 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
2016 
2017 static enum exc_class
2018 exception_class(uint64_t info)
2019 {
2020 	int type, vector;
2021 
2022 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
2023 	type = info & VM_INTINFO_TYPE;
2024 	vector = info & 0xff;
2025 
2026 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2027 	switch (type) {
2028 	case VM_INTINFO_HWINTR:
2029 	case VM_INTINFO_SWINTR:
2030 	case VM_INTINFO_NMI:
2031 		return (EXC_BENIGN);
2032 	default:
2033 		/*
2034 		 * Hardware exception.
2035 		 *
2036 		 * SVM and VT-x use identical type values to represent NMI,
2037 		 * hardware interrupt and software interrupt.
2038 		 *
2039 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2040 		 * for exceptions except #BP and #OF. #BP and #OF use a type
2041 		 * value of '5' or '6'. Therefore we don't check for explicit
2042 		 * values of 'type' to classify 'intinfo' into a hardware
2043 		 * exception.
2044 		 */
2045 		break;
2046 	}
2047 
2048 	switch (vector) {
2049 	case IDT_PF:
2050 	case IDT_VE:
2051 		return (EXC_PAGEFAULT);
2052 	case IDT_DE:
2053 	case IDT_TS:
2054 	case IDT_NP:
2055 	case IDT_SS:
2056 	case IDT_GP:
2057 		return (EXC_CONTRIBUTORY);
2058 	default:
2059 		return (EXC_BENIGN);
2060 	}
2061 }
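
/*
 * Editorial example (not part of the original source): a minimal sketch of
 * how exception_class() maps 'intinfo' values onto the SDM classes used by
 * the double-fault rules below. The helper name is hypothetical and the
 * block is not compiled.
 */
#if 0
static void
exception_class_example(void)
{
	uint64_t pf, gp, nmi;

	pf = IDT_PF | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
	gp = IDT_GP | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
	nmi = IDT_NMI | VM_INTINFO_VALID | VM_INTINFO_NMI;

	KASSERT(exception_class(pf) == EXC_PAGEFAULT, ("#PF class"));
	KASSERT(exception_class(gp) == EXC_CONTRIBUTORY, ("#GP class"));
	KASSERT(exception_class(nmi) == EXC_BENIGN, ("NMI class"));
}
#endif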
2062 
2063 static int
2064 nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2,
2065     uint64_t *retinfo)
2066 {
2067 	enum exc_class exc1, exc2;
2068 	int type1, vector1;
2069 
2070 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
2071 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
2072 
2073 	/*
2074 	 * If an exception occurs while attempting to call the double-fault
2075 	 * handler, the processor enters shutdown mode (aka triple fault).
2076 	 */
2077 	type1 = info1 & VM_INTINFO_TYPE;
2078 	vector1 = info1 & 0xff;
2079 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
2080 		VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)",
2081 		    info1, info2);
2082 		vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT);
2083 		*retinfo = 0;
2084 		return (0);
2085 	}
2086 
2087 	/*
2088 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol 3
2089 	 */
2090 	exc1 = exception_class(info1);
2091 	exc2 = exception_class(info2);
2092 	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2093 	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2094 		/* Convert nested fault into a double fault. */
2095 		*retinfo = IDT_DF;
2096 		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2097 		*retinfo |= VM_INTINFO_DEL_ERRCODE;
2098 	} else {
2099 		/* Handle exceptions serially */
2100 		*retinfo = info2;
2101 	}
2102 	return (1);
2103 }
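
/*
 * Editorial example (not part of the original source): a sketch of the
 * double-fault escalation performed by nested_fault(). A #GP raised while a
 * #PF was being delivered is a (page fault, contributory) pair per Table 6-5
 * and collapses into #DF. The function name is hypothetical and the block is
 * not compiled.
 */
#if 0
static void
nested_fault_example(struct vcpu *vcpu)
{
	uint64_t info1, info2, retinfo;
	int valid;

	info1 = IDT_PF | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
	info2 = IDT_GP | VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
	valid = nested_fault(vcpu, info1, info2, &retinfo);
	KASSERT(valid == 1 && (retinfo & 0xff) == IDT_DF,
	    ("expected a #DF, got %#lx", retinfo));
}
#endif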
2104 
2105 static uint64_t
2106 vcpu_exception_intinfo(struct vcpu *vcpu)
2107 {
2108 	uint64_t info = 0;
2109 
2110 	if (vcpu->exception_pending) {
2111 		info = vcpu->exc_vector & 0xff;
2112 		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
2113 		if (vcpu->exc_errcode_valid) {
2114 			info |= VM_INTINFO_DEL_ERRCODE;
2115 			info |= (uint64_t)vcpu->exc_errcode << 32;
2116 		}
2117 	}
2118 	return (info);
2119 }
2120 
2121 int
2122 vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo)
2123 {
2124 	uint64_t info1, info2;
2125 	int valid;
2126 
2127 	info1 = vcpu->exitintinfo;
2128 	vcpu->exitintinfo = 0;
2129 
2130 	info2 = 0;
2131 	if (vcpu->exception_pending) {
2132 		info2 = vcpu_exception_intinfo(vcpu);
2133 		vcpu->exception_pending = 0;
2134 		VMM_CTR2(vcpu, "Exception %d delivered: %#lx",
2135 		    vcpu->exc_vector, info2);
2136 	}
2137 
2138 	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
2139 		valid = nested_fault(vcpu, info1, info2, retinfo);
2140 	} else if (info1 & VM_INTINFO_VALID) {
2141 		*retinfo = info1;
2142 		valid = 1;
2143 	} else if (info2 & VM_INTINFO_VALID) {
2144 		*retinfo = info2;
2145 		valid = 1;
2146 	} else {
2147 		valid = 0;
2148 	}
2149 
2150 	if (valid) {
2151 		VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), "
2152 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
2153 	}
2154 
2155 	return (valid);
2156 }
2157 
2158 int
2159 vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
2160 {
2161 	*info1 = vcpu->exitintinfo;
2162 	*info2 = vcpu_exception_intinfo(vcpu);
2163 	return (0);
2164 }
2165 
2166 int
2167 vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid,
2168     uint32_t errcode, int restart_instruction)
2169 {
2170 	uint64_t regval;
2171 	int error __diagused;
2172 
2173 	if (vector < 0 || vector >= 32)
2174 		return (EINVAL);
2175 
2176 	/*
2177 	 * A double fault exception should never be injected directly into
2178 	 * the guest. It is a derived exception that results from specific
2179 	 * combinations of nested faults.
2180 	 */
2181 	if (vector == IDT_DF)
2182 		return (EINVAL);
2183 
2184 	if (vcpu->exception_pending) {
2185 		VMM_CTR2(vcpu, "Unable to inject exception %d due to "
2186 		    "pending exception %d", vector, vcpu->exc_vector);
2187 		return (EBUSY);
2188 	}
2189 
2190 	if (errcode_valid) {
2191 		/*
2192 		 * Exceptions don't deliver an error code in real mode.
2193 		 */
2194 		error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &regval);
2195 		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2196 		if (!(regval & CR0_PE))
2197 			errcode_valid = 0;
2198 	}
2199 
2200 	/*
2201 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2202 	 *
2203 	 * Event blocking by "STI" or "MOV SS" is cleared after the guest
2204 	 * executes one instruction or incurs an exception.
2205 	 */
2206 	error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0);
2207 	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2208 	    __func__, error));
2209 
2210 	if (restart_instruction)
2211 		vm_restart_instruction(vcpu);
2212 
2213 	vcpu->exception_pending = 1;
2214 	vcpu->exc_vector = vector;
2215 	vcpu->exc_errcode = errcode;
2216 	vcpu->exc_errcode_valid = errcode_valid;
2217 	VMM_CTR1(vcpu, "Exception %d pending", vector);
2218 	return (0);
2219 }
2220 
2221 void
2222 vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode)
2223 {
2224 	int error __diagused, restart_instruction;
2225 
2226 	restart_instruction = 1;
2227 
2228 	error = vm_inject_exception(vcpu, vector, errcode_valid,
2229 	    errcode, restart_instruction);
2230 	KASSERT(error == 0, ("vm_inject_exception error %d", error));
2231 }
2232 
2233 void
2234 vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2)
2235 {
2236 	int error __diagused;
2237 
2238 	VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx",
2239 	    error_code, cr2);
2240 
2241 	error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2);
2242 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2243 
2244 	vm_inject_fault(vcpu, IDT_PF, 1, error_code);
2245 }
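
/*
 * Editorial sketch (not part of the original source): helpers such as
 * vm_inject_ud(), used by the MONITOR/MWAIT/VMINSN exit handling above, are
 * presumed to be thin wrappers around vm_inject_fault(). The bodies below
 * are assumptions shown for illustration only, not the canonical
 * definitions; #UD carries no error code while #GP does.
 */
#if 0
static void
example_inject_ud(struct vcpu *vcpu)
{
	vm_inject_fault(vcpu, IDT_UD, 0, 0);
}

static void
example_inject_gp(struct vcpu *vcpu)
{
	vm_inject_fault(vcpu, IDT_GP, 1, 0);	/* error code 0 */
}
#endif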
2246 
2247 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2248 
2249 int
2250 vm_inject_nmi(struct vcpu *vcpu)
2251 {
2252 
2253 	vcpu->nmi_pending = 1;
2254 	vcpu_notify_event(vcpu, false);
2255 	return (0);
2256 }
2257 
2258 int
2259 vm_nmi_pending(struct vcpu *vcpu)
2260 {
2261 	return (vcpu->nmi_pending);
2262 }
2263 
2264 void
2265 vm_nmi_clear(struct vcpu *vcpu)
2266 {
2267 	if (vcpu->nmi_pending == 0)
2268 		panic("vm_nmi_clear: inconsistent nmi_pending state");
2269 
2270 	vcpu->nmi_pending = 0;
2271 	vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1);
2272 }
2273 
2274 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2275 
2276 int
2277 vm_inject_extint(struct vcpu *vcpu)
2278 {
2279 
2280 	vcpu->extint_pending = 1;
2281 	vcpu_notify_event(vcpu, false);
2282 	return (0);
2283 }
2284 
2285 int
2286 vm_extint_pending(struct vcpu *vcpu)
2287 {
2288 	return (vcpu->extint_pending);
2289 }
2290 
2291 void
2292 vm_extint_clear(struct vcpu *vcpu)
2293 {
2294 	if (vcpu->extint_pending == 0)
2295 		panic("vm_extint_clear: inconsistent extint_pending state");
2296 
2297 	vcpu->extint_pending = 0;
2298 	vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1);
2299 }
2300 
2301 int
2302 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
2303 {
2304 	if (type < 0 || type >= VM_CAP_MAX)
2305 		return (EINVAL);
2306 
2307 	return (vmmops_getcap(vcpu->cookie, type, retval));
2308 }
2309 
2310 int
2311 vm_set_capability(struct vcpu *vcpu, int type, int val)
2312 {
2313 	if (type < 0 || type >= VM_CAP_MAX)
2314 		return (EINVAL);
2315 
2316 	return (vmmops_setcap(vcpu->cookie, type, val));
2317 }
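
/*
 * Editorial sketch (not part of the original source): capabilities are
 * integer knobs forwarded to the vmm backend. The helper below assumes the
 * VM_CAP_HALT_EXIT capability name and probes availability before enabling
 * it; the function name is hypothetical and the block is not compiled.
 */
#if 0
static int
example_enable_hlt_exit(struct vcpu *vcpu)
{
	int cap, error;

	error = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &cap);
	if (error != 0)
		return (error);
	return (vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1));
}
#endif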
2318 
2319 struct vm *
2320 vcpu_vm(struct vcpu *vcpu)
2321 {
2322 	return (vcpu->vm);
2323 }
2324 
2325 int
2326 vcpu_vcpuid(struct vcpu *vcpu)
2327 {
2328 	return (vcpu->vcpuid);
2329 }
2330 
2331 struct vcpu *
2332 vm_vcpu(struct vm *vm, int vcpuid)
2333 {
2334 	return (vm->vcpu[vcpuid]);
2335 }
2336 
2337 struct vlapic *
2338 vm_lapic(struct vcpu *vcpu)
2339 {
2340 	return (vcpu->vlapic);
2341 }
2342 
2343 struct vioapic *
2344 vm_ioapic(struct vm *vm)
2345 {
2346 
2347 	return (vm->vioapic);
2348 }
2349 
2350 struct vhpet *
2351 vm_hpet(struct vm *vm)
2352 {
2353 
2354 	return (vm->vhpet);
2355 }
2356 
2357 bool
2358 vmm_is_pptdev(int bus, int slot, int func)
2359 {
2360 	int b, f, i, n, s;
2361 	char *val, *cp, *cp2;
2362 	bool found;
2363 
2364 	/*
2365 	 * XXX
2366 	 * The length of an environment variable is limited to 128 bytes, which
2367 	 * puts an upper limit on the number of passthru devices that may be
2368 	 * specified using a single environment variable.
2369 	 *
2370 	 * Work around this by scanning multiple environment variable
2371 	 * names instead of a single one - yuck!
2372 	 */
2373 	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
2374 
2375 	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
2376 	found = false;
2377 	for (i = 0; names[i] != NULL && !found; i++) {
2378 		cp = val = kern_getenv(names[i]);
2379 		while (cp != NULL && *cp != '\0') {
2380 			if ((cp2 = strchr(cp, ' ')) != NULL)
2381 				*cp2 = '\0';
2382 
2383 			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
2384 			if (n == 3 && bus == b && slot == s && func == f) {
2385 				found = true;
2386 				break;
2387 			}
2388 
2389 			if (cp2 != NULL)
2390 				*cp2++ = ' ';
2391 
2392 			cp = cp2;
2393 		}
2394 		freeenv(val);
2395 	}
2396 	return (found);
2397 }
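
/*
 * Editorial example (not part of the original source): passthru devices are
 * typically listed as kernel environment variables, e.g. in
 * /boot/loader.conf:
 *
 *	pptdevs="2/0/0 5/0/0"
 *	pptdevs2="6/0/0"
 *
 * with which vmm_is_pptdev(2, 0, 0) above would return true.
 */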
2398 
2399 void *
2400 vm_iommu_domain(struct vm *vm)
2401 {
2402 
2403 	return (vm->iommu);
2404 }
2405 
2406 int
2407 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
2408 {
2409 	int error;
2410 
2411 	vcpu_lock(vcpu);
2412 	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
2413 	vcpu_unlock(vcpu);
2414 
2415 	return (error);
2416 }
2417 
2418 enum vcpu_state
2419 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
2420 {
2421 	enum vcpu_state state;
2422 
2423 	vcpu_lock(vcpu);
2424 	state = vcpu->state;
2425 	if (hostcpu != NULL)
2426 		*hostcpu = vcpu->hostcpu;
2427 	vcpu_unlock(vcpu);
2428 
2429 	return (state);
2430 }
2431 
2432 int
2433 vm_activate_cpu(struct vcpu *vcpu)
2434 {
2435 	struct vm *vm = vcpu->vm;
2436 
2437 	if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
2438 		return (EBUSY);
2439 
2440 	VMM_CTR0(vcpu, "activated");
2441 	CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
2442 	return (0);
2443 }
2444 
2445 int
2446 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
2447 {
2448 	if (vcpu == NULL) {
2449 		vm->debug_cpus = vm->active_cpus;
2450 		for (int i = 0; i < vm->maxcpus; i++) {
2451 			if (CPU_ISSET(i, &vm->active_cpus))
2452 				vcpu_notify_event(vm_vcpu(vm, i), false);
2453 		}
2454 	} else {
2455 		if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
2456 			return (EINVAL);
2457 
2458 		CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
2459 		vcpu_notify_event(vcpu, false);
2460 	}
2461 	return (0);
2462 }
2463 
2464 int
2465 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
2466 {
2467 
2468 	if (vcpu == NULL) {
2469 		CPU_ZERO(&vm->debug_cpus);
2470 	} else {
2471 		if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
2472 			return (EINVAL);
2473 
2474 		CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
2475 	}
2476 	return (0);
2477 }
2478 
2479 int
2480 vcpu_debugged(struct vcpu *vcpu)
2481 {
2482 
2483 	return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
2484 }
2485 
2486 cpuset_t
2487 vm_active_cpus(struct vm *vm)
2488 {
2489 
2490 	return (vm->active_cpus);
2491 }
2492 
2493 cpuset_t
2494 vm_debug_cpus(struct vm *vm)
2495 {
2496 
2497 	return (vm->debug_cpus);
2498 }
2499 
2500 cpuset_t
2501 vm_suspended_cpus(struct vm *vm)
2502 {
2503 
2504 	return (vm->suspended_cpus);
2505 }
2506 
2507 /*
2508  * Returns the subset of vCPUs in tostart that are awaiting startup.
2509  * These vCPUs are also marked as no longer awaiting startup.
2510  */
2511 cpuset_t
2512 vm_start_cpus(struct vm *vm, const cpuset_t *tostart)
2513 {
2514 	cpuset_t set;
2515 
2516 	mtx_lock(&vm->rendezvous_mtx);
2517 	CPU_AND(&set, &vm->startup_cpus, tostart);
2518 	CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set);
2519 	mtx_unlock(&vm->rendezvous_mtx);
2520 	return (set);
2521 }
2522 
2523 void
2524 vm_await_start(struct vm *vm, const cpuset_t *waiting)
2525 {
2526 	mtx_lock(&vm->rendezvous_mtx);
2527 	CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting);
2528 	mtx_unlock(&vm->rendezvous_mtx);
2529 }
2530 
2531 void *
2532 vcpu_stats(struct vcpu *vcpu)
2533 {
2534 
2535 	return (vcpu->stats);
2536 }
2537 
2538 int
2539 vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state)
2540 {
2541 	*state = vcpu->x2apic_state;
2542 
2543 	return (0);
2544 }
2545 
2546 int
2547 vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state)
2548 {
2549 	if (state >= X2APIC_STATE_LAST)
2550 		return (EINVAL);
2551 
2552 	vcpu->x2apic_state = state;
2553 
2554 	vlapic_set_x2apic_state(vcpu, state);
2555 
2556 	return (0);
2557 }
2558 
2559 /*
2560  * This function is called to ensure that a vcpu "sees" a pending event
2561  * as soon as possible:
2562  * - If the vcpu thread is sleeping then it is woken up.
2563  * - If the vcpu is running on a different host_cpu then an IPI will be directed
2564  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
2565  */
2566 static void
2567 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
2568 {
2569 	int hostcpu;
2570 
2571 	hostcpu = vcpu->hostcpu;
2572 	if (vcpu->state == VCPU_RUNNING) {
2573 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2574 		if (hostcpu != curcpu) {
2575 			if (lapic_intr) {
2576 				vlapic_post_intr(vcpu->vlapic, hostcpu,
2577 				    vmm_ipinum);
2578 			} else {
2579 				ipi_cpu(hostcpu, vmm_ipinum);
2580 			}
2581 		} else {
2582 			/*
2583 			 * If the 'vcpu' is running on 'curcpu' then it must
2584 			 * be sending a notification to itself (e.g. SELF_IPI).
2585 			 * The pending event will be picked up when the vcpu
2586 			 * transitions back to guest context.
2587 			 */
2588 		}
2589 	} else {
2590 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2591 		    "with hostcpu %d", vcpu->state, hostcpu));
2592 		if (vcpu->state == VCPU_SLEEPING)
2593 			wakeup_one(vcpu);
2594 	}
2595 }
2596 
2597 void
2598 vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr)
2599 {
2600 	vcpu_lock(vcpu);
2601 	vcpu_notify_event_locked(vcpu, lapic_intr);
2602 	vcpu_unlock(vcpu);
2603 }
2604 
2605 struct vmspace *
2606 vm_get_vmspace(struct vm *vm)
2607 {
2608 
2609 	return (vm->vmspace);
2610 }
2611 
2612 int
2613 vm_apicid2vcpuid(struct vm *vm, int apicid)
2614 {
2615 	/*
2616 	 * XXX apic id is assumed to be numerically identical to vcpu id
2617 	 */
2618 	return (apicid);
2619 }
2620 
2621 int
2622 vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest,
2623     vm_rendezvous_func_t func, void *arg)
2624 {
2625 	struct vm *vm = vcpu->vm;
2626 	int error, i;
2627 
2628 	/*
2629 	 * Enforce that this function is called without any locks
2630 	 */
2631 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2632 
2633 restart:
2634 	mtx_lock(&vm->rendezvous_mtx);
2635 	if (vm->rendezvous_func != NULL) {
2636 		/*
2637 		 * If a rendezvous is already in progress then we need to
2638 		 * call the rendezvous handler in case this 'vcpu' is one
2639 		 * of the targets of the rendezvous.
2640 		 */
2641 		VMM_CTR0(vcpu, "Rendezvous already in progress");
2642 		mtx_unlock(&vm->rendezvous_mtx);
2643 		error = vm_handle_rendezvous(vcpu);
2644 		if (error != 0)
2645 			return (error);
2646 		goto restart;
2647 	}
2648 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2649 	    "rendezvous is still in progress"));
2650 
2651 	VMM_CTR0(vcpu, "Initiating rendezvous");
2652 	vm->rendezvous_req_cpus = dest;
2653 	CPU_ZERO(&vm->rendezvous_done_cpus);
2654 	vm->rendezvous_arg = arg;
2655 	vm->rendezvous_func = func;
2656 	mtx_unlock(&vm->rendezvous_mtx);
2657 
2658 	/*
2659 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2660 	 * vcpus so they handle the rendezvous as soon as possible.
2661 	 */
2662 	for (i = 0; i < vm->maxcpus; i++) {
2663 		if (CPU_ISSET(i, &dest))
2664 			vcpu_notify_event(vm_vcpu(vm, i), false);
2665 	}
2666 
2667 	return (vm_handle_rendezvous(vcpu));
2668 }
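
/*
 * Editorial sketch (not part of the original source): a caller-side view of
 * vm_smp_rendezvous(). The callback signature is assumed to match
 * vm_rendezvous_func_t as used in this file (a per-vcpu callback plus an
 * opaque argument); both function names are hypothetical and the block is
 * not compiled.
 */
#if 0
static void
example_rendezvous_cb(struct vcpu *vcpu, void *arg)
{
	/* Runs once on every vcpu in the destination set. */
}

static int
example_broadcast(struct vcpu *self)
{
	cpuset_t dest;

	/* Target every active vcpu, including the caller. */
	dest = vm_active_cpus(vcpu_vm(self));
	return (vm_smp_rendezvous(self, dest, example_rendezvous_cb, NULL));
}
#endif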
2669 
2670 struct vatpic *
2671 vm_atpic(struct vm *vm)
2672 {
2673 	return (vm->vatpic);
2674 }
2675 
2676 struct vatpit *
2677 vm_atpit(struct vm *vm)
2678 {
2679 	return (vm->vatpit);
2680 }
2681 
2682 struct vpmtmr *
2683 vm_pmtmr(struct vm *vm)
2684 {
2685 
2686 	return (vm->vpmtmr);
2687 }
2688 
2689 struct vrtc *
2690 vm_rtc(struct vm *vm)
2691 {
2692 
2693 	return (vm->vrtc);
2694 }
2695 
2696 enum vm_reg_name
2697 vm_segment_name(int seg)
2698 {
2699 	static enum vm_reg_name seg_names[] = {
2700 		VM_REG_GUEST_ES,
2701 		VM_REG_GUEST_CS,
2702 		VM_REG_GUEST_SS,
2703 		VM_REG_GUEST_DS,
2704 		VM_REG_GUEST_FS,
2705 		VM_REG_GUEST_GS
2706 	};
2707 
2708 	KASSERT(seg >= 0 && seg < nitems(seg_names),
2709 	    ("%s: invalid segment encoding %d", __func__, seg));
2710 	return (seg_names[seg]);
2711 }
2712 
2713 void
2714 vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo)
2715 {
2716 	int idx;
2717 
2718 	for (idx = 0; idx < num_copyinfo; idx++) {
2719 		if (copyinfo[idx].cookie != NULL)
2720 			vm_gpa_release(copyinfo[idx].cookie);
2721 	}
2722 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2723 }
2724 
2725 int
2726 vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
2727     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2728     int num_copyinfo, int *fault)
2729 {
2730 	int error, idx, nused;
2731 	size_t n, off, remaining;
2732 	void *hva, *cookie;
2733 	uint64_t gpa;
2734 
2735 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2736 
2737 	nused = 0;
2738 	remaining = len;
2739 	while (remaining > 0) {
2740 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2741 		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
2742 		if (error || *fault)
2743 			return (error);
2744 		off = gpa & PAGE_MASK;
2745 		n = min(remaining, PAGE_SIZE - off);
2746 		copyinfo[nused].gpa = gpa;
2747 		copyinfo[nused].len = n;
2748 		remaining -= n;
2749 		gla += n;
2750 		nused++;
2751 	}
2752 
2753 	for (idx = 0; idx < nused; idx++) {
2754 		hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa,
2755 		    copyinfo[idx].len, prot, &cookie);
2756 		if (hva == NULL)
2757 			break;
2758 		copyinfo[idx].hva = hva;
2759 		copyinfo[idx].cookie = cookie;
2760 	}
2761 
2762 	if (idx != nused) {
2763 		vm_copy_teardown(copyinfo, num_copyinfo);
2764 		return (EFAULT);
2765 	} else {
2766 		*fault = 0;
2767 		return (0);
2768 	}
2769 }
2770 
2771 void
2772 vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len)
2773 {
2774 	char *dst;
2775 	int idx;
2776 
2777 	dst = kaddr;
2778 	idx = 0;
2779 	while (len > 0) {
2780 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2781 		len -= copyinfo[idx].len;
2782 		dst += copyinfo[idx].len;
2783 		idx++;
2784 	}
2785 }
2786 
2787 void
2788 vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len)
2789 {
2790 	const char *src;
2791 	int idx;
2792 
2793 	src = kaddr;
2794 	idx = 0;
2795 	while (len > 0) {
2796 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2797 		len -= copyinfo[idx].len;
2798 		src += copyinfo[idx].len;
2799 		idx++;
2800 	}
2801 }
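
/*
 * Editorial sketch (not part of the original source): the typical pattern
 * for reading guest memory by linear address, as used by instruction
 * emulation. Two vm_copyinfo entries cover a buffer that may straddle one
 * page boundary. The function name is hypothetical and PROT_READ is assumed
 * to be the protection value expected by vm_copy_setup(); the block is not
 * compiled.
 */
#if 0
static int
example_read_gla(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len, int *fault)
{
	struct vm_copyinfo copyinfo[2];
	int error;

	error = vm_copy_setup(vcpu, paging, gla, len, PROT_READ,
	    copyinfo, nitems(copyinfo), fault);
	if (error == 0 && *fault == 0) {
		vm_copyin(copyinfo, buf, len);
		vm_copy_teardown(copyinfo, nitems(copyinfo));
	}
	return (error);
}
#endif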
2802 
2803 /*
2804  * Return the amount of in-use and wired memory for the VM. Since
2805  * these are global stats, only return the values for vCPU 0.
2806  */
2807 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2808 VMM_STAT_DECLARE(VMM_MEM_WIRED);
2809 
2810 static void
2811 vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
2812 {
2813 
2814 	if (vcpu->vcpuid == 0) {
2815 		vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE *
2816 		    vmspace_resident_count(vcpu->vm->vmspace));
2817 	}
2818 }
2819 
2820 static void
2821 vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
2822 {
2823 
2824 	if (vcpu->vcpuid == 0) {
2825 		vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE *
2826 		    pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace)));
2827 	}
2828 }
2829 
2830 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2831 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
2832 
2833 #ifdef BHYVE_SNAPSHOT
2834 static int
2835 vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
2836 {
2837 	uint64_t tsc, now;
2838 	int ret;
2839 	int ret = 0;
2840 	uint16_t i, maxcpus;
2841 
2842 	now = rdtsc();
2843 	maxcpus = vm_get_maxcpus(vm);
2844 	for (i = 0; i < maxcpus; i++) {
2845 		vcpu = vm->vcpu[i];
2846 		if (vcpu == NULL)
2847 			continue;
2848 
2849 		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
2850 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
2851 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
2852 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
2853 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
2854 		SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
2855 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
2856 		SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
2857 
2858 		/*
2859 		 * Save the absolute TSC value by adding now to tsc_offset.
2860 		 *
2861 		 * It will be turned back into an actual offset when the
2862 		 * TSC restore function is called.
2863 		 */
2864 		tsc = now + vcpu->tsc_offset;
2865 		SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done);
2866 		if (meta->op == VM_SNAPSHOT_RESTORE)
2867 			vcpu->tsc_offset = tsc;
2868 	}
2869 
2870 done:
2871 	return (ret);
2872 }
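
/*
 * Editorial note (not part of the original source): worked example of the
 * TSC save/restore arithmetic above and in vm_restore_time() below.
 *
 *	at snapshot:	saved      = now_save + tsc_offset
 *	at restore:	tsc_offset = saved - now_restore
 *
 * A guest rdtsc after restore therefore reads now_restore + tsc_offset ==
 * saved, i.e. the guest TSC resumes from the value it had when the snapshot
 * was taken.
 */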
2873 
2874 static int
2875 vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
2876 {
2877 	int ret;
2878 
2879 	ret = vm_snapshot_vcpus(vm, meta);
2880 	if (ret != 0)
2881 		goto done;
2882 
2883 	SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done);
2884 done:
2885 	return (ret);
2886 }
2887 
2888 static int
2889 vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta)
2890 {
2891 	int error;
2892 	struct vcpu *vcpu;
2893 	uint16_t i, maxcpus;
2894 
2895 	error = 0;
2896 
2897 	maxcpus = vm_get_maxcpus(vm);
2898 	for (i = 0; i < maxcpus; i++) {
2899 		vcpu = vm->vcpu[i];
2900 		if (vcpu == NULL)
2901 			continue;
2902 
2903 		error = vmmops_vcpu_snapshot(vcpu->cookie, meta);
2904 		if (error != 0) {
2905 			printf("%s: failed to snapshot vmcs/vmcb data for "
2906 			       "vCPU: %d; error: %d\n", __func__, i, error);
2907 			goto done;
2908 		}
2909 	}
2910 
2911 done:
2912 	return (error);
2913 }
2914 
2915 /*
2916  * Save kernel-side structures to user-space for snapshotting.
2917  */
2918 int
2919 vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
2920 {
2921 	int ret = 0;
2922 
2923 	switch (meta->dev_req) {
2924 	case STRUCT_VMCX:
2925 		ret = vm_snapshot_vcpu(vm, meta);
2926 		break;
2927 	case STRUCT_VM:
2928 		ret = vm_snapshot_vm(vm, meta);
2929 		break;
2930 	case STRUCT_VIOAPIC:
2931 		ret = vioapic_snapshot(vm_ioapic(vm), meta);
2932 		break;
2933 	case STRUCT_VLAPIC:
2934 		ret = vlapic_snapshot(vm, meta);
2935 		break;
2936 	case STRUCT_VHPET:
2937 		ret = vhpet_snapshot(vm_hpet(vm), meta);
2938 		break;
2939 	case STRUCT_VATPIC:
2940 		ret = vatpic_snapshot(vm_atpic(vm), meta);
2941 		break;
2942 	case STRUCT_VATPIT:
2943 		ret = vatpit_snapshot(vm_atpit(vm), meta);
2944 		break;
2945 	case STRUCT_VPMTMR:
2946 		ret = vpmtmr_snapshot(vm_pmtmr(vm), meta);
2947 		break;
2948 	case STRUCT_VRTC:
2949 		ret = vrtc_snapshot(vm_rtc(vm), meta);
2950 		break;
2951 	default:
2952 		printf("%s: failed to find the requested type %#x\n",
2953 		       __func__, meta->dev_req);
2954 		ret = (EINVAL);
2955 	}
2956 	return (ret);
2957 }
2958 
2959 void
2960 vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset)
2961 {
2962 	vcpu->tsc_offset = offset;
2963 }
2964 
2965 int
2966 vm_restore_time(struct vm *vm)
2967 {
2968 	int error;
2969 	uint64_t now;
2970 	struct vcpu *vcpu;
2971 	uint16_t i, maxcpus;
2972 
2973 	now = rdtsc();
2974 
2975 	error = vhpet_restore_time(vm_hpet(vm));
2976 	if (error)
2977 		return (error);
2978 
2979 	maxcpus = vm_get_maxcpus(vm);
2980 	for (i = 0; i < maxcpus; i++) {
2981 		vcpu = vm->vcpu[i];
2982 		if (vcpu == NULL)
2983 			continue;
2984 
2985 		error = vmmops_restore_tsc(vcpu->cookie,
2986 		    vcpu->tsc_offset - now);
2987 		if (error)
2988 			return (error);
2989 	}
2990 
2991 	return (0);
2992 }
2993 #endif
2994