xref: /freebsd/sys/amd64/vmm/vmm.c (revision ce3adf4362fcca6a43e500b2531f0038adbfbd21)
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include <machine/vmm_dev.h>
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))

#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm)	(ops != NULL ? (*ops->vminit)(vm) : NULL)
#define	VMRUN(vmi, vcpu, rip) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm)			\
	(ops != NULL ?							\
	(*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) :	\
	ENXIO)
#define	VMMMAP_GET(vmi, gpa) \
	(ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		if (ppt_num_devices() > 0)
			iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
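
/*
 * Illustrative note (not part of the original file): the module declared
 * above is typically loaded at boot from loader.conf, for example
 *
 *	vmm_load="YES"
 *	pptdevs="2/0/0 3/0/0"	# optional PCI passthru devices (bus/slot/func)
 *
 * or at runtime with 'kldload vmm'.  The "pptdevs" tunable format shown here
 * is the one parsed by vmm_is_pptdev() later in this file.
 */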

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	vm_paddr_t maxaddr;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	maxaddr = vmm_mem_maxaddr();
	vm->iommu = iommu_create_domain(maxaddr);
	vm_activate_cpu(vm, BSP);

	*retvm = vm;
	return (0);
}
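
/*
 * Illustrative sketch (hypothetical, not part of the original file): the
 * expected call order for a consumer of the API above, such as the vmmdev
 * ioctl layer.  Guarded out of the build; names and sizes are examples only.
 */
#if 0
static int
vm_lifecycle_example(void)
{
	struct vm *vm;
	struct vm_run vmrun;
	int error;

	error = vm_create("example", &vm);
	if (error)
		return (error);

	/* Back the first 64MB of guest physical address space with memory. */
	error = vm_malloc(vm, 0, 64 * 1024 * 1024);
	if (error == 0) {
		bzero(&vmrun, sizeof(vmrun));
		vmrun.cpuid = 0;		/* the BSP activated above */
		vmrun.rip = 0;			/* illustrative guest entry point */
		error = vm_run(vm, &vmrun);	/* returns on the next vm exit */
	}

	vm_destroy(vm);
	return (error);
}
#endif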

static void
vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
{
	size_t len;
	vm_paddr_t hpa;
	void *host_domain;

	host_domain = iommu_host_domain();

	len = 0;
	while (len < seg->len) {
		hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
		if (hpa == (vm_paddr_t)-1) {
			panic("vm_free_mem_seg: cannot free hpa "
			      "associated with gpa 0x%016lx", seg->gpa + len);
		}

		/*
		 * Remove the 'gpa' to 'hpa' mapping in the VM's domain and
		 * re-establish the 1:1 mapping for 'hpa' in 'host_domain'.
		 */
		iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
		iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);

		vmm_mem_free(hpa, PAGE_SIZE);

		len += PAGE_SIZE;
	}

	/*
	 * Invalidate cached translations associated with 'vm->iommu' since
	 * we have now moved some pages from it.
	 */
	iommu_invalidate_tlb(vm->iommu);

	bzero(seg, sizeof(struct vm_memory_segment));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	iommu_destroy_domain(vm->iommu);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	const boolean_t spok = TRUE;	/* superpage mappings are ok */

	return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
			   VM_PROT_RW, spok));
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	const boolean_t spok = TRUE;	/* superpage mappings are ok */

	return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
			   VM_PROT_NONE, spok));
}

/*
 * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
 */
static boolean_t
vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	if (gpa & PAGE_MASK)
		panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (FALSE);
	}

	return (TRUE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int error, available, allocated;
	struct vm_memory_segment *seg;
	vm_paddr_t g, hpa;
	void *host_domain;

	const boolean_t spok = TRUE;	/* superpage mappings are ok */

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_gpa_available(vm, g))
			available++;
		else
			allocated++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	host_domain = iommu_host_domain();

	seg = &vm->mem_segs[vm->num_mem_segs];

	error = 0;
	seg->gpa = gpa;
	seg->len = 0;
	while (seg->len < len) {
		hpa = vmm_mem_alloc(PAGE_SIZE);
		if (hpa == 0) {
			error = ENOMEM;
			break;
		}

		error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
				   VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
		if (error)
			break;

		/*
		 * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
		 * Map 'gpa + seg->len' to 'hpa' in the VM's domain.
		 */
		iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
		iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);

		seg->len += PAGE_SIZE;
	}

	if (error) {
		vm_free_mem_seg(vm, seg);
		return (error);
	}

	/*
	 * Invalidate cached translations associated with 'host_domain' since
	 * we have now moved some pages from it.
	 */
	iommu_invalidate_tlb(host_domain);

	vm->num_mem_segs++;

	return (0);
}

vm_paddr_t
vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	vm_paddr_t nextpage;

	nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
	if (len > nextpage - gpa)
		panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	return (VMMMAP_GET(vm->cookie, gpa));
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			*seg = vm->mem_segs[i];
			return (0);
		}
	}
	return (-1);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid, sleepticks, t;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	vme = &vmrun->vm_exit;
	rip = vmrun->rip;
restart:
	critical_enter();

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip);
	vcpu->hostcpu = NOCPU;

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	/* copy the exit information */
	bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));

	critical_exit();

	/*
	 * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
	 * is ready to run.
	 */
	if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
		vcpu_lock(vcpu);

		/*
		 * Figure out the number of host ticks until the next apic
		 * timer interrupt in the guest.
		 */
		sleepticks = lapic_timer_tick(vm, vcpuid);

		/*
		 * If the guest local apic timer is disabled then sleep for
		 * a long time but not forever.
		 */
		if (sleepticks < 0)
			sleepticks = hz;

		/*
		 * Do a final check for pending NMI or interrupts before
		 * really putting this thread to sleep.
		 *
		 * These interrupts could have happened any time after we
		 * returned from VMRUN() and before we grabbed the vcpu lock.
		 */
		if (!vm_nmi_pending(vm, vcpuid) &&
		    lapic_pending_intr(vm, vcpuid) < 0) {
			if (sleepticks <= 0)
				panic("invalid sleepticks %d", sleepticks);
			t = ticks;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
			vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
		}

		vcpu_unlock(vcpu);

		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
		int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> RUNNING -> IDLE
	 * IDLE -> CANNOT_RUN -> IDLE
	 */
	if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
	    (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
		error = 0;
		vcpu->state = state;
	} else {
		error = EBUSY;
	}

	vcpu_unlock(vcpu);

	return (error);
}
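
/*
 * Usage sketch (hypothetical, not in the original file): callers are expected
 * to bracket exclusive use of a vcpu with the transitions documented above,
 * roughly:
 *
 *	if (vcpu_set_state(vm, vcpuid, VCPU_RUNNING) == 0) {
 *		error = vm_run(vm, &vmrun);
 *		(void)vcpu_set_state(vm, vcpuid, VCPU_IDLE);
 *	}
 */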

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		/*
		 * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
		 * the host thread must be sleeping waiting for an event to
		 * kick the vcpu out of 'hlt'.
		 *
		 * XXX this is racy because the condition exists right before
		 * and after calling VMRUN() in vm_run(). The wakeup() is
		 * benign in this case.
		 */
		if (vcpu->state == VCPU_RUNNING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}
1001