xref: /freebsd/sys/amd64/vmm/intel/vmx.c (revision 0572ccaa4543b0abef8ef81e384c1d04de9f3da1)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51 
52 #include <machine/vmm.h>
53 #include <machine/vmm_dev.h>
54 #include "vmm_host.h"
55 #include "vmm_ioport.h"
56 #include "vmm_ipi.h"
57 #include "vmm_msr.h"
58 #include "vmm_ktr.h"
59 #include "vmm_stat.h"
60 #include "vatpic.h"
61 #include "vlapic.h"
62 #include "vlapic_priv.h"
63 
64 #include "vmx_msr.h"
65 #include "ept.h"
66 #include "vmx_cpufunc.h"
67 #include "vmx.h"
68 #include "x86.h"
69 #include "vmx_controls.h"
70 
71 #define	PINBASED_CTLS_ONE_SETTING					\
72 	(PINBASED_EXTINT_EXITING	|				\
73 	 PINBASED_NMI_EXITING		|				\
74 	 PINBASED_VIRTUAL_NMI)
75 #define	PINBASED_CTLS_ZERO_SETTING	0
76 
77 #define PROCBASED_CTLS_WINDOW_SETTING					\
78 	(PROCBASED_INT_WINDOW_EXITING	|				\
79 	 PROCBASED_NMI_WINDOW_EXITING)
80 
81 #define	PROCBASED_CTLS_ONE_SETTING 					\
82 	(PROCBASED_SECONDARY_CONTROLS	|				\
83 	 PROCBASED_IO_EXITING		|				\
84 	 PROCBASED_MSR_BITMAPS		|				\
85 	 PROCBASED_CTLS_WINDOW_SETTING)
86 #define	PROCBASED_CTLS_ZERO_SETTING	\
87 	(PROCBASED_CR3_LOAD_EXITING |	\
88 	PROCBASED_CR3_STORE_EXITING |	\
89 	PROCBASED_IO_BITMAPS)
90 
91 #define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
92 #define	PROCBASED_CTLS2_ZERO_SETTING	0
93 
94 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
95 	(VM_EXIT_HOST_LMA			|			\
96 	VM_EXIT_SAVE_EFER			|			\
97 	VM_EXIT_LOAD_EFER)
98 
99 #define	VM_EXIT_CTLS_ONE_SETTING					\
100 	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
101 	VM_EXIT_ACKNOWLEDGE_INTERRUPT		|			\
102 	VM_EXIT_SAVE_PAT			|			\
103 	VM_EXIT_LOAD_PAT)
104 #define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
105 
106 #define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
107 
108 #define	VM_ENTRY_CTLS_ONE_SETTING					\
109 	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
110 	VM_ENTRY_LOAD_PAT)
111 #define	VM_ENTRY_CTLS_ZERO_SETTING					\
112 	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
113 	VM_ENTRY_INTO_SMM			|			\
114 	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
115 
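/*
 * Open up guest access to an MSR in the MSR bitmap so that accesses to it
 * do not cause VM exits: guest_msr_rw() permits both reads and writes,
 * guest_msr_ro() permits reads only.
 */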
116 #define	guest_msr_rw(vmx, msr) \
117 	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
118 
119 #define	guest_msr_ro(vmx, msr) \
120 	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
121 
122 #define	HANDLED		1
123 #define	UNHANDLED	0
124 
125 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
126 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
127 
128 SYSCTL_DECL(_hw_vmm);
129 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
130 
131 int vmxon_enabled[MAXCPU];
132 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
133 
134 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
135 static uint32_t exit_ctls, entry_ctls;
136 
137 static uint64_t cr0_ones_mask, cr0_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
139 	     &cr0_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
141 	     &cr0_zeros_mask, 0, NULL);
142 
143 static uint64_t cr4_ones_mask, cr4_zeros_mask;
144 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
145 	     &cr4_ones_mask, 0, NULL);
146 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
147 	     &cr4_zeros_mask, 0, NULL);
148 
149 static int vmx_no_patmsr;
150 
151 static int vmx_initialized;
152 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
153 	   &vmx_initialized, 0, "Intel VMX initialized");
154 
155 /*
156  * Optional capabilities
157  */
158 static int cap_halt_exit;
159 static int cap_pause_exit;
160 static int cap_unrestricted_guest;
161 static int cap_monitor_trap;
162 static int cap_invpcid;
163 
164 static int virtual_interrupt_delivery;
165 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
166     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
167 
168 static int posted_interrupts;
169 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
170     &posted_interrupts, 0, "APICv posted interrupt support");
171 
172 static int pirvec;
173 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
174     &pirvec, 0, "APICv posted interrupt vector");
175 
176 static struct unrhdr *vpid_unr;
177 static u_int vpid_alloc_failed;
178 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
179 	    &vpid_alloc_failed, 0, NULL);
180 
181 /*
182  * Use the last page below 4GB as the APIC access address. This address is
183  * occupied by the boot firmware so it is guaranteed that it will not conflict
184  * with a page in system memory.
185  */
186 #define	APIC_ACCESS_ADDRESS	0xFFFFF000
187 
188 static void vmx_inject_pir(struct vlapic *vlapic);
189 
190 #ifdef KTR
191 static const char *
192 exit_reason_to_str(int reason)
193 {
194 	static char reasonbuf[32];
195 
196 	switch (reason) {
197 	case EXIT_REASON_EXCEPTION:
198 		return "exception";
199 	case EXIT_REASON_EXT_INTR:
200 		return "extint";
201 	case EXIT_REASON_TRIPLE_FAULT:
202 		return "triplefault";
203 	case EXIT_REASON_INIT:
204 		return "init";
205 	case EXIT_REASON_SIPI:
206 		return "sipi";
207 	case EXIT_REASON_IO_SMI:
208 		return "iosmi";
209 	case EXIT_REASON_SMI:
210 		return "smi";
211 	case EXIT_REASON_INTR_WINDOW:
212 		return "intrwindow";
213 	case EXIT_REASON_NMI_WINDOW:
214 		return "nmiwindow";
215 	case EXIT_REASON_TASK_SWITCH:
216 		return "taskswitch";
217 	case EXIT_REASON_CPUID:
218 		return "cpuid";
219 	case EXIT_REASON_GETSEC:
220 		return "getsec";
221 	case EXIT_REASON_HLT:
222 		return "hlt";
223 	case EXIT_REASON_INVD:
224 		return "invd";
225 	case EXIT_REASON_INVLPG:
226 		return "invlpg";
227 	case EXIT_REASON_RDPMC:
228 		return "rdpmc";
229 	case EXIT_REASON_RDTSC:
230 		return "rdtsc";
231 	case EXIT_REASON_RSM:
232 		return "rsm";
233 	case EXIT_REASON_VMCALL:
234 		return "vmcall";
235 	case EXIT_REASON_VMCLEAR:
236 		return "vmclear";
237 	case EXIT_REASON_VMLAUNCH:
238 		return "vmlaunch";
239 	case EXIT_REASON_VMPTRLD:
240 		return "vmptrld";
241 	case EXIT_REASON_VMPTRST:
242 		return "vmptrst";
243 	case EXIT_REASON_VMREAD:
244 		return "vmread";
245 	case EXIT_REASON_VMRESUME:
246 		return "vmresume";
247 	case EXIT_REASON_VMWRITE:
248 		return "vmwrite";
249 	case EXIT_REASON_VMXOFF:
250 		return "vmxoff";
251 	case EXIT_REASON_VMXON:
252 		return "vmxon";
253 	case EXIT_REASON_CR_ACCESS:
254 		return "craccess";
255 	case EXIT_REASON_DR_ACCESS:
256 		return "draccess";
257 	case EXIT_REASON_INOUT:
258 		return "inout";
259 	case EXIT_REASON_RDMSR:
260 		return "rdmsr";
261 	case EXIT_REASON_WRMSR:
262 		return "wrmsr";
263 	case EXIT_REASON_INVAL_VMCS:
264 		return "invalvmcs";
265 	case EXIT_REASON_INVAL_MSR:
266 		return "invalmsr";
267 	case EXIT_REASON_MWAIT:
268 		return "mwait";
269 	case EXIT_REASON_MTF:
270 		return "mtf";
271 	case EXIT_REASON_MONITOR:
272 		return "monitor";
273 	case EXIT_REASON_PAUSE:
274 		return "pause";
275 	case EXIT_REASON_MCE:
276 		return "mce";
277 	case EXIT_REASON_TPR:
278 		return "tpr";
279 	case EXIT_REASON_APIC_ACCESS:
280 		return "apic-access";
281 	case EXIT_REASON_GDTR_IDTR:
282 		return "gdtridtr";
283 	case EXIT_REASON_LDTR_TR:
284 		return "ldtrtr";
285 	case EXIT_REASON_EPT_FAULT:
286 		return "eptfault";
287 	case EXIT_REASON_EPT_MISCONFIG:
288 		return "eptmisconfig";
289 	case EXIT_REASON_INVEPT:
290 		return "invept";
291 	case EXIT_REASON_RDTSCP:
292 		return "rdtscp";
293 	case EXIT_REASON_VMX_PREEMPT:
294 		return "vmxpreempt";
295 	case EXIT_REASON_INVVPID:
296 		return "invvpid";
297 	case EXIT_REASON_WBINVD:
298 		return "wbinvd";
299 	case EXIT_REASON_XSETBV:
300 		return "xsetbv";
301 	case EXIT_REASON_APIC_WRITE:
302 		return "apic-write";
303 	default:
304 		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
305 		return (reasonbuf);
306 	}
307 }
308 #endif	/* KTR */
309 
310 static int
311 vmx_allow_x2apic_msrs(struct vmx *vmx)
312 {
313 	int i, error;
314 
315 	error = 0;
316 
317 	/*
318 	 * Allow readonly access to the following x2APIC MSRs from the guest.
319 	 */
320 	error += guest_msr_ro(vmx, MSR_APIC_ID);
321 	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
322 	error += guest_msr_ro(vmx, MSR_APIC_LDR);
323 	error += guest_msr_ro(vmx, MSR_APIC_SVR);
324 
325 	for (i = 0; i < 8; i++)
326 		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
327 
328 	for (i = 0; i < 8; i++)
329 		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
330 
331 	for (i = 0; i < 8; i++)
332 		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
333 
334 	error += guest_msr_ro(vmx, MSR_APIC_ESR);
335 	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
336 	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
337 	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
338 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
339 	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
340 	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
341 	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
342 	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
343 	error += guest_msr_ro(vmx, MSR_APIC_ICR);
344 
345 	/*
346 	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
347 	 *
348 	 * These registers get special treatment described in the section
349 	 * "Virtualizing MSR-Based APIC Accesses".
350 	 */
351 	error += guest_msr_rw(vmx, MSR_APIC_TPR);
352 	error += guest_msr_rw(vmx, MSR_APIC_EOI);
353 	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
354 
355 	return (error);
356 }
357 
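/*
 * Adjust a CR0/CR4 value so that it satisfies the fixed-bit requirements
 * for VMX operation: bits in the "ones" mask are forced on and bits in the
 * "zeros" mask are forced off. The masks are derived from the
 * MSR_VMX_CR{0,4}_FIXED{0,1} MSRs in vmx_init().
 */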
358 u_long
359 vmx_fix_cr0(u_long cr0)
360 {
361 
362 	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
363 }
364 
365 u_long
366 vmx_fix_cr4(u_long cr4)
367 {
368 
369 	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
370 }
371 
372 static void
373 vpid_free(int vpid)
374 {
375 	if (vpid < 0 || vpid > 0xffff)
376 		panic("vpid_free: invalid vpid %d", vpid);
377 
378 	/*
379 	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
380 	 * the unit number allocator.
381 	 */
382 
383 	if (vpid > VM_MAXCPU)
384 		free_unr(vpid_unr, vpid);
385 }
386 
387 static void
388 vpid_alloc(uint16_t *vpid, int num)
389 {
390 	int i, x;
391 
392 	if (num <= 0 || num > VM_MAXCPU)
393 		panic("invalid number of vpids requested: %d", num);
394 
395 	/*
396 	 * If the "enable vpid" execution control is not enabled then the
397 	 * VPID is required to be 0 for all vcpus.
398 	 */
399 	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
400 		for (i = 0; i < num; i++)
401 			vpid[i] = 0;
402 		return;
403 	}
404 
405 	/*
406 	 * Allocate a unique VPID for each vcpu from the unit number allocator.
407 	 */
408 	for (i = 0; i < num; i++) {
409 		x = alloc_unr(vpid_unr);
410 		if (x == -1)
411 			break;
412 		else
413 			vpid[i] = x;
414 	}
415 
416 	if (i < num) {
417 		atomic_add_int(&vpid_alloc_failed, 1);
418 
419 		/*
420 		 * If the unit number allocator does not have enough unique
421 		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
422 		 *
423 		 * These VPIDs are not unique across VMs but this does not
424 		 * affect correctness because the combined mappings are also
425 		 * tagged with the EP4TA which is unique for each VM.
426 		 *
427 		 * It is still sub-optimal because the invvpid will invalidate
428 		 * combined mappings for a particular VPID across all EP4TAs.
429 		 */
430 		while (i-- > 0)
431 			vpid_free(vpid[i]);
432 
433 		for (i = 0; i < num; i++)
434 			vpid[i] = i + 1;
435 	}
436 }
437 
438 static void
439 vpid_init(void)
440 {
441 	/*
442 	 * VPID 0 is required when the "enable VPID" execution control is
443 	 * disabled.
444 	 *
445 	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
446 	 * unit number allocator does not have sufficient unique VPIDs to
447 	 * satisfy the allocation.
448 	 *
449 	 * The remaining VPIDs are managed by the unit number allocator.
450 	 */
451 	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
452 }
453 
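/*
 * Build the list of guest MSRs that are saved and restored automatically
 * via the guest MSR save area (currently just MSR_KGSBASE).
 */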
454 static void
455 msr_save_area_init(struct msr_entry *g_area, int *g_count)
456 {
457 	int cnt;
458 
459 	static struct msr_entry guest_msrs[] = {
460 		{ MSR_KGSBASE, 0, 0 },
461 	};
462 
463 	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
464 	if (cnt > GUEST_MSR_MAX_ENTRIES)
465 		panic("guest msr save area overrun");
466 	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
467 	*g_count = cnt;
468 }
469 
470 static void
471 vmx_disable(void *arg __unused)
472 {
473 	struct invvpid_desc invvpid_desc = { 0 };
474 	struct invept_desc invept_desc = { 0 };
475 
476 	if (vmxon_enabled[curcpu]) {
477 		/*
478 		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
479 		 *
480 		 * VMXON and VMXOFF are not required to invalidate any TLB
481 		 * caching structures, so invalidate all contexts explicitly to
482 		 * prevent retaining cached information across VMX episodes.
483 		 */
484 		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
485 		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
486 		vmxoff();
487 	}
488 	load_cr4(rcr4() & ~CR4_VMXE);
489 }
490 
491 static int
492 vmx_cleanup(void)
493 {
494 
495 	if (pirvec != 0)
496 		vmm_ipi_free(pirvec);
497 
498 	if (vpid_unr != NULL) {
499 		delete_unrhdr(vpid_unr);
500 		vpid_unr = NULL;
501 	}
502 
503 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
504 
505 	return (0);
506 }
507 
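/*
 * Enter VMX operation on the current CPU: set CR4.VMXE, stamp the per-CPU
 * VMXON region with the VMCS revision identifier and execute VMXON.
 */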
508 static void
509 vmx_enable(void *arg __unused)
510 {
511 	int error;
512 
513 	load_cr4(rcr4() | CR4_VMXE);
514 
515 	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
516 	error = vmxon(vmxon_region[curcpu]);
517 	if (error == 0)
518 		vmxon_enabled[curcpu] = 1;
519 }
520 
521 static void
522 vmx_restore(void)
523 {
524 
525 	if (vmxon_enabled[curcpu])
526 		vmxon(vmxon_region[curcpu]);
527 }
528 
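/*
 * One-time module initialization: verify that the processor supports VMX
 * and that the BIOS has enabled it, compute the pin-based, processor-based,
 * exit and entry control settings, probe optional capabilities, initialize
 * EPT and the VPID allocator, and finally enable VMX operation on all host
 * CPUs via smp_rendezvous().
 */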
529 static int
530 vmx_init(int ipinum)
531 {
532 	int error, use_tpr_shadow;
533 	uint64_t fixed0, fixed1, feature_control;
534 	uint32_t tmp, procbased2_vid_bits;
535 
536 	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
537 	if (!(cpu_feature2 & CPUID2_VMX)) {
538 		printf("vmx_init: processor does not support VMX operation\n");
539 		return (ENXIO);
540 	}
541 
542 	/*
543 	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
544 	 * are set (bits 0 and 2 respectively).
545 	 */
546 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
547 	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
548 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
549 		printf("vmx_init: VMX operation disabled by BIOS\n");
550 		return (ENXIO);
551 	}
552 
553 	/* Check support for primary processor-based VM-execution controls */
554 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
555 			       MSR_VMX_TRUE_PROCBASED_CTLS,
556 			       PROCBASED_CTLS_ONE_SETTING,
557 			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
558 	if (error) {
559 		printf("vmx_init: processor does not support desired primary "
560 		       "processor-based controls\n");
561 		return (error);
562 	}
563 
564 	/* Clear the processor-based ctl bits that are set on demand */
565 	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
566 
567 	/* Check support for secondary processor-based VM-execution controls */
568 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
569 			       MSR_VMX_PROCBASED_CTLS2,
570 			       PROCBASED_CTLS2_ONE_SETTING,
571 			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
572 	if (error) {
573 		printf("vmx_init: processor does not support desired secondary "
574 		       "processor-based controls\n");
575 		return (error);
576 	}
577 
578 	/* Check support for VPID */
579 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
580 			       PROCBASED2_ENABLE_VPID, 0, &tmp);
581 	if (error == 0)
582 		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
583 
584 	/* Check support for pin-based VM-execution controls */
585 	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
586 			       MSR_VMX_TRUE_PINBASED_CTLS,
587 			       PINBASED_CTLS_ONE_SETTING,
588 			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
589 	if (error) {
590 		printf("vmx_init: processor does not support desired "
591 		       "pin-based controls\n");
592 		return (error);
593 	}
594 
595 	/* Check support for VM-exit controls */
596 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
597 			       VM_EXIT_CTLS_ONE_SETTING,
598 			       VM_EXIT_CTLS_ZERO_SETTING,
599 			       &exit_ctls);
600 	if (error) {
601 		/* Try again without the PAT MSR bits */
602 		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
603 				       MSR_VMX_TRUE_EXIT_CTLS,
604 				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
605 				       VM_EXIT_CTLS_ZERO_SETTING,
606 				       &exit_ctls);
607 		if (error) {
608 			printf("vmx_init: processor does not support desired "
609 			       "exit controls\n");
610 			return (error);
611 		} else {
612 			if (bootverbose)
613 				printf("vmm: PAT MSR access not supported\n");
614 			guest_msr_valid(MSR_PAT);
615 			vmx_no_patmsr = 1;
616 		}
617 	}
618 
619 	/* Check support for VM-entry controls */
620 	if (!vmx_no_patmsr) {
621 		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
622 				       MSR_VMX_TRUE_ENTRY_CTLS,
623 				       VM_ENTRY_CTLS_ONE_SETTING,
624 				       VM_ENTRY_CTLS_ZERO_SETTING,
625 				       &entry_ctls);
626 	} else {
627 		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
628 				       MSR_VMX_TRUE_ENTRY_CTLS,
629 				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
630 				       VM_ENTRY_CTLS_ZERO_SETTING,
631 				       &entry_ctls);
632 	}
633 
634 	if (error) {
635 		printf("vmx_init: processor does not support desired "
636 		       "entry controls\n");
637 		return (error);
638 	}
639 
640 	/*
641 	 * Check support for optional features by testing them
642 	 * as individual bits
643 	 */
644 	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
645 					MSR_VMX_TRUE_PROCBASED_CTLS,
646 					PROCBASED_HLT_EXITING, 0,
647 					&tmp) == 0);
648 
649 	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
650 					MSR_VMX_PROCBASED_CTLS,
651 					PROCBASED_MTF, 0,
652 					&tmp) == 0);
653 
654 	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
655 					 MSR_VMX_TRUE_PROCBASED_CTLS,
656 					 PROCBASED_PAUSE_EXITING, 0,
657 					 &tmp) == 0);
658 
659 	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
660 					MSR_VMX_PROCBASED_CTLS2,
661 					PROCBASED2_UNRESTRICTED_GUEST, 0,
662 				        &tmp) == 0);
663 
664 	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
665 	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
666 	    &tmp) == 0);
667 
668 	/*
669 	 * Check support for virtual interrupt delivery.
670 	 */
671 	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
672 	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
673 	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
674 	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
675 
676 	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
677 	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
678 	    &tmp) == 0);
679 
680 	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
681 	    procbased2_vid_bits, 0, &tmp);
682 	if (error == 0 && use_tpr_shadow) {
683 		virtual_interrupt_delivery = 1;
684 		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
685 		    &virtual_interrupt_delivery);
686 	}
687 
688 	if (virtual_interrupt_delivery) {
689 		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
690 		procbased_ctls2 |= procbased2_vid_bits;
691 		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
692 
693 		/*
694 		 * Check for Posted Interrupts only if Virtual Interrupt
695 		 * Delivery is enabled.
696 		 */
697 		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
698 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
699 		    &tmp);
700 		if (error == 0) {
701 			pirvec = vmm_ipi_alloc();
702 			if (pirvec == 0) {
703 				if (bootverbose) {
704 					printf("vmx_init: unable to allocate "
705 					    "posted interrupt vector\n");
706 				}
707 			} else {
708 				posted_interrupts = 1;
709 				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
710 				    &posted_interrupts);
711 			}
712 		}
713 	}
714 
715 	if (posted_interrupts)
716 		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
717 
718 	/* Initialize EPT */
719 	error = ept_init(ipinum);
720 	if (error) {
721 		printf("vmx_init: ept initialization failed (%d)\n", error);
722 		return (error);
723 	}
724 
725 	/*
726 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
727 	 */
728 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
729 	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
730 	cr0_ones_mask = fixed0 & fixed1;
731 	cr0_zeros_mask = ~fixed0 & ~fixed1;
732 
733 	/*
734 	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
735 	 * if unrestricted guest execution is allowed.
736 	 */
737 	if (cap_unrestricted_guest)
738 		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
739 
740 	/*
741 	 * Do not allow the guest to set CR0_NW or CR0_CD.
742 	 */
743 	cr0_zeros_mask |= (CR0_NW | CR0_CD);
744 
745 	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
746 	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
747 	cr4_ones_mask = fixed0 & fixed1;
748 	cr4_zeros_mask = ~fixed0 & ~fixed1;
749 
750 	vpid_init();
751 
752 	/* enable VMX operation */
753 	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
754 
755 	vmx_initialized = 1;
756 
757 	return (0);
758 }
759 
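/*
 * Dispatch a host interrupt whose delivery was consumed by a VM exit.
 * Because the exit controls include "acknowledge interrupt on exit" the
 * vector is not delivered through the host IDT automatically, so look up
 * the gate descriptor for 'vector' and invoke its handler directly.
 */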
760 static void
761 vmx_trigger_hostintr(int vector)
762 {
763 	uintptr_t func;
764 	struct gate_descriptor *gd;
765 
766 	gd = &idt[vector];
767 
768 	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
769 	    "invalid vector %d", vector));
770 	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
771 	    vector));
772 	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
773 	    "has invalid type %d", vector, gd->gd_type));
774 	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
775 	    "has invalid dpl %d", vector, gd->gd_dpl));
776 	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
777 	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
778 	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
779 	    "IST %d", vector, gd->gd_ist));
780 
781 	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
782 	vmx_call_isr(func);
783 }
784 
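/*
 * Program the CR0/CR4 guest/host mask and read shadow. Guest reads of bits
 * that are set in the mask return the corresponding shadow bits, and guest
 * attempts to change such bits cause a CR-access VM exit, so the bits in
 * the ones/zeros masks remain under hypervisor control.
 */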
785 static int
786 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
787 {
788 	int error, mask_ident, shadow_ident;
789 	uint64_t mask_value;
790 
791 	if (which != 0 && which != 4)
792 		panic("vmx_setup_cr_shadow: unknown cr%d", which);
793 
794 	if (which == 0) {
795 		mask_ident = VMCS_CR0_MASK;
796 		mask_value = cr0_ones_mask | cr0_zeros_mask;
797 		shadow_ident = VMCS_CR0_SHADOW;
798 	} else {
799 		mask_ident = VMCS_CR4_MASK;
800 		mask_value = cr4_ones_mask | cr4_zeros_mask;
801 		shadow_ident = VMCS_CR4_SHADOW;
802 	}
803 
804 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
805 	if (error)
806 		return (error);
807 
808 	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
809 	if (error)
810 		return (error);
811 
812 	return (0);
813 }
814 #define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
815 #define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
816 
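/*
 * Per-VM initialization: allocate the page-aligned 'struct vmx' holding the
 * per-vcpu VMCSs, the MSR bitmap and the guest MSR save areas, set up the
 * EPT pointer and MSR access permissions, and initialize the VMCS for each
 * vcpu.
 */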
817 static void *
818 vmx_vminit(struct vm *vm, pmap_t pmap)
819 {
820 	uint16_t vpid[VM_MAXCPU];
821 	int i, error, guest_msr_count;
822 	struct vmx *vmx;
823 	struct vmcs *vmcs;
824 
825 	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
826 	if ((uintptr_t)vmx & PAGE_MASK) {
827 		panic("malloc of struct vmx not aligned on %d byte boundary",
828 		      PAGE_SIZE);
829 	}
830 	vmx->vm = vm;
831 
832 	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
833 
834 	/*
835 	 * Clean up EPTP-tagged guest physical and combined mappings
836 	 *
837 	 * VMX transitions are not required to invalidate any guest physical
838 	 * mappings. So, it may be possible for stale guest physical mappings
839 	 * to be present in the processor TLBs.
840 	 *
841 	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
842 	 */
843 	ept_invalidate_mappings(vmx->eptp);
844 
845 	msr_bitmap_initialize(vmx->msr_bitmap);
846 
847 	/*
848 	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
849 	 * The guest FSBASE and GSBASE are saved and restored during
850 	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
851 	 * always restored from the vmcs host state area on vm-exit.
852 	 *
853 	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
854 	 * how they are saved/restored so they can be directly accessed by the
855 	 * guest.
856 	 *
857 	 * Guest KGSBASE is saved and restored in the guest MSR save area.
858 	 * Host KGSBASE is restored before returning to userland from the pcb.
859 	 * There will be a window of time when we are executing in the host
860 	 * kernel context with a value of KGSBASE from the guest. This is ok
861 	 * because the value of KGSBASE is inconsequential in kernel context.
862 	 *
863 	 * MSR_EFER is saved and restored in the guest VMCS area on a
864 	 * VM exit and entry respectively. It is also restored from the
865 	 * host VMCS area on a VM exit.
866 	 *
867 	 * The TSC MSR is exposed read-only. Writes are disallowed as that
868 	 * will impact the host TSC.
869 	 * XXX Writes would be implemented with a wrmsr trap, and
870 	 * then modifying the TSC offset in the VMCS.
871 	 */
872 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
873 	    guest_msr_rw(vmx, MSR_FSBASE) ||
874 	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
875 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
876 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
877 	    guest_msr_rw(vmx, MSR_KGSBASE) ||
878 	    guest_msr_rw(vmx, MSR_EFER) ||
879 	    guest_msr_ro(vmx, MSR_TSC))
880 		panic("vmx_vminit: error setting guest msr access");
881 
882 	/*
883 	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
884 	 * and entry respectively. It is also restored from the host VMCS
885 	 * area on a VM exit. However, if running on a system with no
886 	 * MSR_PAT save/restore support, leave access disabled so accesses
887 	 * will be trapped.
888 	 */
889 	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
890 		panic("vmx_vminit: error setting guest pat msr access");
891 
892 	vpid_alloc(vpid, VM_MAXCPU);
893 
894 	if (virtual_interrupt_delivery) {
895 		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
896 		    APIC_ACCESS_ADDRESS);
897 		/* XXX this should really return an error to the caller */
898 		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
899 	}
900 
901 	for (i = 0; i < VM_MAXCPU; i++) {
902 		vmcs = &vmx->vmcs[i];
903 		vmcs->identifier = vmx_revision();
904 		error = vmclear(vmcs);
905 		if (error != 0) {
906 			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
907 			      error, i);
908 		}
909 
910 		error = vmcs_init(vmcs);
911 		KASSERT(error == 0, ("vmcs_init error %d", error));
912 
913 		VMPTRLD(vmcs);
914 		error = 0;
915 		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
916 		error += vmwrite(VMCS_EPTP, vmx->eptp);
917 		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
918 		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
919 		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
920 		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
921 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
922 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
923 		error += vmwrite(VMCS_VPID, vpid[i]);
924 		if (virtual_interrupt_delivery) {
925 			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
926 			error += vmwrite(VMCS_VIRTUAL_APIC,
927 			    vtophys(&vmx->apic_page[i]));
928 			error += vmwrite(VMCS_EOI_EXIT0, 0);
929 			error += vmwrite(VMCS_EOI_EXIT1, 0);
930 			error += vmwrite(VMCS_EOI_EXIT2, 0);
931 			error += vmwrite(VMCS_EOI_EXIT3, 0);
932 		}
933 		if (posted_interrupts) {
934 			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
935 			error += vmwrite(VMCS_PIR_DESC,
936 			    vtophys(&vmx->pir_desc[i]));
937 		}
938 		VMCLEAR(vmcs);
939 		KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
940 
941 		vmx->cap[i].set = 0;
942 		vmx->cap[i].proc_ctls = procbased_ctls;
943 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
944 
945 		vmx->state[i].lastcpu = -1;
946 		vmx->state[i].vpid = vpid[i];
947 
948 		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
949 
950 		error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
951 		    guest_msr_count);
952 		if (error != 0)
953 			panic("vmcs_set_msr_save error %d", error);
954 
955 		/*
956 		 * Set up the CR0/4 shadows, and init the read shadow
957 		 * to the power-on register value from the Intel Sys Arch.
958 		 *  CR0 - 0x60000010
959 		 *  CR4 - 0
960 		 */
961 		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
962 		if (error != 0)
963 			panic("vmx_setup_cr0_shadow %d", error);
964 
965 		error = vmx_setup_cr4_shadow(vmcs, 0);
966 		if (error != 0)
967 			panic("vmx_setup_cr4_shadow %d", error);
968 
969 		vmx->ctx[i].pmap = pmap;
970 	}
971 
972 	return (vmx);
973 }
974 
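/*
 * Emulate CPUID using the leaf/subleaf values in the guest's %rax/%rcx and
 * write the results back into the guest register copies in 'vmxctx'.
 */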
975 static int
976 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
977 {
978 	int handled, func;
979 
980 	func = vmxctx->guest_rax;
981 
982 	handled = x86_emulate_cpuid(vm, vcpu,
983 				    (uint32_t*)(&vmxctx->guest_rax),
984 				    (uint32_t*)(&vmxctx->guest_rbx),
985 				    (uint32_t*)(&vmxctx->guest_rcx),
986 				    (uint32_t*)(&vmxctx->guest_rdx));
987 	return (handled);
988 }
989 
990 static __inline void
991 vmx_run_trace(struct vmx *vmx, int vcpu)
992 {
993 #ifdef KTR
994 	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
995 #endif
996 }
997 
998 static __inline void
999 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
1000 	       int handled)
1001 {
1002 #ifdef KTR
1003 	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
1004 		 handled ? "handled" : "unhandled",
1005 		 exit_reason_to_str(exit_reason), rip);
1006 #endif
1007 }
1008 
1009 static __inline void
1010 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
1011 {
1012 #ifdef KTR
1013 	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1014 #endif
1015 }
1016 
1017 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1018 
1019 static void
1020 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1021 {
1022 	struct vmxstate *vmxstate;
1023 	struct invvpid_desc invvpid_desc;
1024 
1025 	vmxstate = &vmx->state[vcpu];
1026 	if (vmxstate->lastcpu == curcpu)
1027 		return;
1028 
1029 	vmxstate->lastcpu = curcpu;
1030 
1031 	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1032 
1033 	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1034 	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1035 	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1036 
1037 	/*
1038 	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
1039 	 *
1040 	 * We do this because this vcpu was executing on a different host
1041 	 * cpu when it last ran. We do not track whether it invalidated
1042 	 * mappings associated with its 'vpid' during that run. So we must
1043 	 * assume that the mappings associated with 'vpid' on 'curcpu' are
1044 	 * stale and invalidate them.
1045 	 *
1046 	 * Note that we incur this penalty only when the scheduler chooses to
1047 	 * move the thread associated with this vcpu between host cpus.
1048 	 *
1049 	 * Note also that this will invalidate mappings tagged with 'vpid'
1050 	 * for "all" EP4TAs.
1051 	 */
1052 	if (vmxstate->vpid != 0) {
1053 		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
1054 			invvpid_desc._res1 = 0;
1055 			invvpid_desc._res2 = 0;
1056 			invvpid_desc.vpid = vmxstate->vpid;
1057 			invvpid_desc.linear_addr = 0;
1058 			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1059 		} else {
1060 			/*
1061 			 * The invvpid can be skipped if an invept is going to
1062 			 * be performed before entering the guest. The invept
1063 			 * will invalidate combined mappings tagged with
1064 			 * 'vmx->eptp' for all vpids.
1065 			 */
1066 			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1067 		}
1068 	}
1069 }
1070 
1071 /*
1072  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1073  */
1074 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1075 
1076 static void __inline
1077 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1078 {
1079 
1080 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1081 		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1082 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1083 		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1084 	}
1085 }
1086 
1087 static void __inline
1088 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1089 {
1090 
1091 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1092 	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1093 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1094 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1095 	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1096 }
1097 
1098 static void __inline
1099 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1100 {
1101 
1102 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1103 		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1104 		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1105 		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1106 	}
1107 }
1108 
1109 static void __inline
1110 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1111 {
1112 
1113 	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1114 	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1115 	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1116 	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1117 	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1118 }
1119 
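/*
 * Guest interruptibility-state bits that block injection of an NMI and of
 * a maskable hardware interrupt, respectively.
 */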
1120 #define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
1121 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1122 #define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
1123 			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1124 
1125 static void
1126 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1127 {
1128 	uint32_t gi, info;
1129 
1130 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1131 	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1132 	    "interruptibility-state %#x", gi));
1133 
1134 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1135 	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1136 	    "VM-entry interruption information %#x", info));
1137 
1138 	/*
1139 	 * Inject the virtual NMI. The vector must be the NMI IDT entry
1140 	 * or the VMCS entry check will fail.
1141 	 */
1142 	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1143 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1144 
1145 	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1146 
1147 	/* Clear the request */
1148 	vm_nmi_clear(vmx->vm, vcpu);
1149 }
1150 
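/*
 * Injection priority for the next VM entry: a pending exception, then a
 * pending NMI, then a pending ExtINT or local APIC interrupt. If the
 * guest's state blocks injection, the corresponding "window exiting"
 * control is enabled so that injection can be retried as soon as possible.
 */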
1151 static void
1152 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1153 {
1154 	struct vm_exception exc;
1155 	int vector, need_nmi_exiting, extint_pending;
1156 	uint64_t rflags;
1157 	uint32_t gi, info;
1158 
1159 	if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
1160 		KASSERT(exc.vector >= 0 && exc.vector < 32,
1161 		    ("%s: invalid exception vector %d", __func__, exc.vector));
1162 
1163 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1164 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1165 		     "pending exception %d: %#x", __func__, exc.vector, info));
1166 
1167 		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
1168 		if (exc.error_code_valid) {
1169 			info |= VMCS_INTR_DEL_ERRCODE;
1170 			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
1171 		}
1172 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1173 	}
1174 
1175 	if (vm_nmi_pending(vmx->vm, vcpu)) {
1176 		/*
1177 		 * If there are no conditions blocking NMI injection then
1178 		 * inject it directly here otherwise enable "NMI window
1179 		 * exiting" to inject it as soon as we can.
1180 		 *
1181 		 * We also check for STI_BLOCKING because some implementations
1182 		 * don't allow NMI injection in this case. If we are running
1183 		 * on a processor that doesn't have this restriction it will
1184 		 * immediately exit and the NMI will be injected in the
1185 		 * "NMI window exiting" handler.
1186 		 */
1187 		need_nmi_exiting = 1;
1188 		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1189 		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1190 			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1191 			if ((info & VMCS_INTR_VALID) == 0) {
1192 				vmx_inject_nmi(vmx, vcpu);
1193 				need_nmi_exiting = 0;
1194 			} else {
1195 				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1196 				    "due to VM-entry intr info %#x", info);
1197 			}
1198 		} else {
1199 			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1200 			    "Guest Interruptibility-state %#x", gi);
1201 		}
1202 
1203 		if (need_nmi_exiting)
1204 			vmx_set_nmi_window_exiting(vmx, vcpu);
1205 	}
1206 
1207 	extint_pending = vm_extint_pending(vmx->vm, vcpu);
1208 
1209 	if (!extint_pending && virtual_interrupt_delivery) {
1210 		vmx_inject_pir(vlapic);
1211 		return;
1212 	}
1213 
1214 	/*
1215 	 * If interrupt-window exiting is already in effect then don't bother
1216 	 * checking for pending interrupts. This is just an optimization and
1217 	 * not needed for correctness.
1218 	 */
1219 	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1220 		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1221 		    "pending int_window_exiting");
1222 		return;
1223 	}
1224 
1225 	if (!extint_pending) {
1226 		/* Ask the local apic for a vector to inject */
1227 		if (!vlapic_pending_intr(vlapic, &vector))
1228 			return;
1229 	} else {
1230 		/* Ask the legacy pic for a vector to inject */
1231 		vatpic_pending_intr(vmx->vm, &vector);
1232 	}
1233 
1234 	KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1235 
1236 	/* Check RFLAGS.IF and the interruptibility state of the guest */
1237 	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1238 	if ((rflags & PSL_I) == 0) {
1239 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1240 		    "rflags %#lx", vector, rflags);
1241 		goto cantinject;
1242 	}
1243 
1244 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1245 	if (gi & HWINTR_BLOCKING) {
1246 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1247 		    "Guest Interruptibility-state %#x", vector, gi);
1248 		goto cantinject;
1249 	}
1250 
1251 	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1252 	if (info & VMCS_INTR_VALID) {
1253 		/*
1254 		 * This is expected and could happen for multiple reasons:
1255 		 * - A vectoring VM-entry was aborted due to astpending
1256 		 * - A VM-exit happened during event injection.
1257 		 * - An exception was injected above.
1258 		 * - An NMI was injected above or after "NMI window exiting"
1259 		 */
1260 		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1261 		    "VM-entry intr info %#x", vector, info);
1262 		goto cantinject;
1263 	}
1264 
1265 	/* Inject the interrupt */
1266 	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1267 	info |= vector;
1268 	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1269 
1270 	if (!extint_pending) {
1271 		/* Update the Local APIC ISR */
1272 		vlapic_intr_accepted(vlapic, vector);
1273 	} else {
1274 		vm_extint_clear(vmx->vm, vcpu);
1275 		vatpic_intr_accepted(vmx->vm, vector);
1276 
1277 		/*
1278 		 * After we accepted the current ExtINT the PIC may
1279 		 * have posted another one.  If that is the case, set
1280 		 * the Interrupt Window Exiting execution control so
1281 		 * we can inject that one too.
1282 		 */
1283 		if (vm_extint_pending(vmx->vm, vcpu))
1284 			vmx_set_int_window_exiting(vmx, vcpu);
1285 	}
1286 
1287 	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1288 
1289 	return;
1290 
1291 cantinject:
1292 	/*
1293 	 * Set the Interrupt Window Exiting execution control so we can inject
1294 	 * the interrupt as soon as the blocking condition goes away.
1295 	 */
1296 	vmx_set_int_window_exiting(vmx, vcpu);
1297 }
1298 
1299 /*
1300  * If the Virtual NMIs execution control is '1' then the logical processor
1301  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1302  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1303  * virtual-NMI blocking.
1304  *
1305  * This unblocking occurs even if the IRET causes a fault. In this case the
1306  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1307  */
1308 static void
1309 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1310 {
1311 	uint32_t gi;
1312 
1313 	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1314 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1315 	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1316 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1317 }
1318 
1319 static void
1320 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1321 {
1322 	uint32_t gi;
1323 
1324 	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1325 	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1326 	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1327 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1328 }
1329 
1330 static int
1331 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1332 {
1333 	struct vmxctx *vmxctx;
1334 	uint64_t xcrval;
1335 	const struct xsave_limits *limits;
1336 
1337 	vmxctx = &vmx->ctx[vcpu];
1338 	limits = vmm_get_xsave_limits();
1339 
1340 	/*
1341 	 * Note that the processor raises a GP# fault on its own if
1342 	 * xsetbv is executed for CPL != 0, so we do not have to
1343 	 * emulate that fault here.
1344 	 */
1345 
1346 	/* Only xcr0 is supported. */
1347 	if (vmxctx->guest_rcx != 0) {
1348 		vm_inject_gp(vmx->vm, vcpu);
1349 		return (HANDLED);
1350 	}
1351 
1352 	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1353 	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1354 		vm_inject_ud(vmx->vm, vcpu);
1355 		return (HANDLED);
1356 	}
1357 
1358 	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1359 	if ((xcrval & ~limits->xcr0_allowed) != 0) {
1360 		vm_inject_gp(vmx->vm, vcpu);
1361 		return (HANDLED);
1362 	}
1363 
1364 	if (!(xcrval & XFEATURE_ENABLED_X87)) {
1365 		vm_inject_gp(vmx->vm, vcpu);
1366 		return (HANDLED);
1367 	}
1368 
1369 	if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
1370 	    XFEATURE_ENABLED_AVX) {
1371 		vm_inject_gp(vmx->vm, vcpu);
1372 		return (HANDLED);
1373 	}
1374 
1375 	/*
1376 	 * This runs "inside" vmrun() with the guest's FPU state, so
1377 	 * modifying xcr0 directly modifies the guest's xcr0, not the
1378 	 * host's.
1379 	 */
1380 	load_xcr(0, xcrval);
1381 	return (HANDLED);
1382 }
1383 
1384 static int
1385 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1386 {
1387 	int cr, vmcs_guest_cr, vmcs_shadow_cr;
1388 	uint64_t crval, regval, ones_mask, zeros_mask;
1389 	const struct vmxctx *vmxctx;
1390 
1391 	/* We only handle mov to %cr0 or %cr4 at this time */
1392 	if ((exitqual & 0xf0) != 0x00)
1393 		return (UNHANDLED);
1394 
1395 	cr = exitqual & 0xf;
1396 	if (cr != 0 && cr != 4)
1397 		return (UNHANDLED);
1398 
1399 	regval = 0; /* silence gcc */
1400 	vmxctx = &vmx->ctx[vcpu];
1401 
1402 	/*
1403 	 * We must use vmcs_write() directly here because vmcs_setreg() will
1404 	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
1405 	 */
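	/* Bits 11:8 of the exit qualification identify the source GPR. */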
1406 	switch ((exitqual >> 8) & 0xf) {
1407 	case 0:
1408 		regval = vmxctx->guest_rax;
1409 		break;
1410 	case 1:
1411 		regval = vmxctx->guest_rcx;
1412 		break;
1413 	case 2:
1414 		regval = vmxctx->guest_rdx;
1415 		break;
1416 	case 3:
1417 		regval = vmxctx->guest_rbx;
1418 		break;
1419 	case 4:
1420 		regval = vmcs_read(VMCS_GUEST_RSP);
1421 		break;
1422 	case 5:
1423 		regval = vmxctx->guest_rbp;
1424 		break;
1425 	case 6:
1426 		regval = vmxctx->guest_rsi;
1427 		break;
1428 	case 7:
1429 		regval = vmxctx->guest_rdi;
1430 		break;
1431 	case 8:
1432 		regval = vmxctx->guest_r8;
1433 		break;
1434 	case 9:
1435 		regval = vmxctx->guest_r9;
1436 		break;
1437 	case 10:
1438 		regval = vmxctx->guest_r10;
1439 		break;
1440 	case 11:
1441 		regval = vmxctx->guest_r11;
1442 		break;
1443 	case 12:
1444 		regval = vmxctx->guest_r12;
1445 		break;
1446 	case 13:
1447 		regval = vmxctx->guest_r13;
1448 		break;
1449 	case 14:
1450 		regval = vmxctx->guest_r14;
1451 		break;
1452 	case 15:
1453 		regval = vmxctx->guest_r15;
1454 		break;
1455 	}
1456 
1457 	if (cr == 0) {
1458 		ones_mask = cr0_ones_mask;
1459 		zeros_mask = cr0_zeros_mask;
1460 		vmcs_guest_cr = VMCS_GUEST_CR0;
1461 		vmcs_shadow_cr = VMCS_CR0_SHADOW;
1462 	} else {
1463 		ones_mask = cr4_ones_mask;
1464 		zeros_mask = cr4_zeros_mask;
1465 		vmcs_guest_cr = VMCS_GUEST_CR4;
1466 		vmcs_shadow_cr = VMCS_CR4_SHADOW;
1467 	}
1468 	vmcs_write(vmcs_shadow_cr, regval);
1469 
1470 	crval = regval | ones_mask;
1471 	crval &= ~zeros_mask;
1472 	vmcs_write(vmcs_guest_cr, crval);
1473 
1474 	if (cr == 0 && regval & CR0_PG) {
1475 		uint64_t efer, entry_ctls;
1476 
1477 		/*
1478 		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1479 		 * the "IA-32e mode guest" bit in VM-entry control must be
1480 		 * equal.
1481 		 */
1482 		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1483 		if (efer & EFER_LME) {
1484 			efer |= EFER_LMA;
1485 			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1486 			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1487 			entry_ctls |= VM_ENTRY_GUEST_LMA;
1488 			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1489 		}
1490 	}
1491 
1492 	return (HANDLED);
1493 }
1494 
1495 /*
1496  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1497  */
1498 static int
1499 vmx_cpl(void)
1500 {
1501 	uint32_t ssar;
1502 
1503 	ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
1504 	return ((ssar >> 5) & 0x3);
1505 }
1506 
1507 static enum vie_cpu_mode
1508 vmx_cpu_mode(void)
1509 {
1510 
1511 	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1512 		return (CPU_MODE_64BIT);
1513 	else
1514 		return (CPU_MODE_COMPATIBILITY);
1515 }
1516 
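/*
 * Derive the guest's paging mode from CR0.PG, CR4.PAE and EFER.LME.
 */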
1517 static enum vie_paging_mode
1518 vmx_paging_mode(void)
1519 {
1520 
1521 	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1522 		return (PAGING_MODE_FLAT);
1523 	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1524 		return (PAGING_MODE_32);
1525 	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1526 		return (PAGING_MODE_64);
1527 	else
1528 		return (PAGING_MODE_PAE);
1529 }
1530 
1531 static void
1532 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
1533 {
1534 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1535 	vmexit->u.inst_emul.gpa = gpa;
1536 	vmexit->u.inst_emul.gla = gla;
1537 	vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1538 	vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1539 	vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1540 	vmexit->u.inst_emul.cpl = vmx_cpl();
1541 }
1542 
1543 static int
1544 ept_fault_type(uint64_t ept_qual)
1545 {
1546 	int fault_type;
1547 
1548 	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1549 		fault_type = VM_PROT_WRITE;
1550 	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1551 		fault_type = VM_PROT_EXECUTE;
1552 	else
1553 		fault_type = VM_PROT_READ;
1554 
1555 	return (fault_type);
1556 }
1557 
1558 static boolean_t
1559 ept_emulation_fault(uint64_t ept_qual)
1560 {
1561 	int read, write;
1562 
1563 	/* EPT fault on an instruction fetch doesn't make sense here */
1564 	if (ept_qual & EPT_VIOLATION_INST_FETCH)
1565 		return (FALSE);
1566 
1567 	/* EPT fault must be a read fault or a write fault */
1568 	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1569 	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1570 	if ((read | write) == 0)
1571 		return (FALSE);
1572 
1573 	/*
1574 	 * The EPT violation must have been caused by accessing a
1575 	 * guest-physical address that is a translation of a guest-linear
1576 	 * address.
1577 	 */
1578 	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1579 	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1580 		return (FALSE);
1581 	}
1582 
1583 	return (TRUE);
1584 }
1585 
1586 static __inline int
1587 apic_access_virtualization(struct vmx *vmx, int vcpuid)
1588 {
1589 	uint32_t proc_ctls2;
1590 
1591 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1592 	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
1593 }
1594 
1595 static __inline int
1596 x2apic_virtualization(struct vmx *vmx, int vcpuid)
1597 {
1598 	uint32_t proc_ctls2;
1599 
1600 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1601 	return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
1602 }
1603 
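/*
 * Handle a trap-like APIC-write VM exit caused by APIC-register
 * virtualization. The write to the virtual-APIC page has already completed
 * by the time of the exit; only its side effects need to be emulated here.
 */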
1604 static int
1605 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
1606     uint64_t qual)
1607 {
1608 	int error, handled, offset;
1609 	uint32_t *apic_regs, vector;
1610 	bool retu;
1611 
1612 	handled = HANDLED;
1613 	offset = APIC_WRITE_OFFSET(qual);
1614 
1615 	if (!apic_access_virtualization(vmx, vcpuid)) {
1616 		/*
1617 		 * In general there should not be any APIC write VM-exits
1618 		 * unless APIC-access virtualization is enabled.
1619 		 *
1620 		 * However self-IPI virtualization can legitimately trigger
1621 		 * an APIC-write VM-exit so treat it specially.
1622 		 */
1623 		if (x2apic_virtualization(vmx, vcpuid) &&
1624 		    offset == APIC_OFFSET_SELF_IPI) {
1625 			apic_regs = (uint32_t *)(vlapic->apic_page);
1626 			vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
1627 			vlapic_self_ipi_handler(vlapic, vector);
1628 			return (HANDLED);
1629 		} else
1630 			return (UNHANDLED);
1631 	}
1632 
1633 	switch (offset) {
1634 	case APIC_OFFSET_ID:
1635 		vlapic_id_write_handler(vlapic);
1636 		break;
1637 	case APIC_OFFSET_LDR:
1638 		vlapic_ldr_write_handler(vlapic);
1639 		break;
1640 	case APIC_OFFSET_DFR:
1641 		vlapic_dfr_write_handler(vlapic);
1642 		break;
1643 	case APIC_OFFSET_SVR:
1644 		vlapic_svr_write_handler(vlapic);
1645 		break;
1646 	case APIC_OFFSET_ESR:
1647 		vlapic_esr_write_handler(vlapic);
1648 		break;
1649 	case APIC_OFFSET_ICR_LOW:
1650 		retu = false;
1651 		error = vlapic_icrlo_write_handler(vlapic, &retu);
1652 		if (error != 0 || retu)
1653 			handled = UNHANDLED;
1654 		break;
1655 	case APIC_OFFSET_CMCI_LVT:
1656 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1657 		vlapic_lvt_write_handler(vlapic, offset);
1658 		break;
1659 	case APIC_OFFSET_TIMER_ICR:
1660 		vlapic_icrtmr_write_handler(vlapic);
1661 		break;
1662 	case APIC_OFFSET_TIMER_DCR:
1663 		vlapic_dcr_write_handler(vlapic);
1664 		break;
1665 	default:
1666 		handled = UNHANDLED;
1667 		break;
1668 	}
1669 	return (handled);
1670 }
1671 
1672 static bool
1673 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
1674 {
1675 
1676 	if (apic_access_virtualization(vmx, vcpuid) &&
1677 	    (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1678 		return (true);
1679 	else
1680 		return (false);
1681 }
1682 
1683 static int
1684 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1685 {
1686 	uint64_t qual;
1687 	int access_type, offset, allowed;
1688 
1689 	if (!apic_access_virtualization(vmx, vcpuid))
1690 		return (UNHANDLED);
1691 
1692 	qual = vmexit->u.vmx.exit_qualification;
1693 	access_type = APIC_ACCESS_TYPE(qual);
1694 	offset = APIC_ACCESS_OFFSET(qual);
1695 
1696 	allowed = 0;
1697 	if (access_type == 0) {
1698 		/*
1699 		 * Read data access to the following registers is expected.
1700 		 */
1701 		switch (offset) {
1702 		case APIC_OFFSET_APR:
1703 		case APIC_OFFSET_PPR:
1704 		case APIC_OFFSET_RRR:
1705 		case APIC_OFFSET_CMCI_LVT:
1706 		case APIC_OFFSET_TIMER_CCR:
1707 			allowed = 1;
1708 			break;
1709 		default:
1710 			break;
1711 		}
1712 	} else if (access_type == 1) {
1713 		/*
1714 		 * Write data access to the following registers is expected.
1715 		 */
1716 		switch (offset) {
1717 		case APIC_OFFSET_VER:
1718 		case APIC_OFFSET_APR:
1719 		case APIC_OFFSET_PPR:
1720 		case APIC_OFFSET_RRR:
1721 		case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1722 		case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1723 		case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1724 		case APIC_OFFSET_CMCI_LVT:
1725 		case APIC_OFFSET_TIMER_CCR:
1726 			allowed = 1;
1727 			break;
1728 		default:
1729 			break;
1730 		}
1731 	}
1732 
1733 	if (allowed) {
1734 		vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
1735 		    VIE_INVALID_GLA);
1736 	}
1737 
1738 	/*
1739 	 * Regardless of whether the APIC-access is allowed this handler
1740 	 * always returns UNHANDLED:
1741 	 * - if the access is allowed then it is handled by emulating the
1742 	 *   instruction that caused the VM-exit (outside the critical section)
1743 	 * - if the access is not allowed then it will be converted to an
1744 	 *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1745 	 */
1746 	return (UNHANDLED);
1747 }
1748 
1749 static int
1750 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1751 {
1752 	int error, handled;
1753 	struct vmxctx *vmxctx;
1754 	struct vlapic *vlapic;
1755 	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1756 	uint64_t qual, gpa;
1757 	bool retu;
1758 
1759 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1760 	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1761 
1762 	handled = UNHANDLED;
1763 	vmxctx = &vmx->ctx[vcpu];
1764 
1765 	qual = vmexit->u.vmx.exit_qualification;
1766 	reason = vmexit->u.vmx.exit_reason;
1767 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1768 
1769 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1770 
1771 	/*
1772 	 * VM exits that could be triggered during event injection on the
1773 	 * previous VM entry need to be handled specially by re-injecting
1774 	 * the event.
1775 	 *
1776 	 * See "Information for VM Exits During Event Delivery" in Intel SDM
1777 	 * for details.
1778 	 */
1779 	switch (reason) {
1780 	case EXIT_REASON_EPT_FAULT:
1781 	case EXIT_REASON_EPT_MISCONFIG:
1782 	case EXIT_REASON_APIC_ACCESS:
1783 	case EXIT_REASON_TASK_SWITCH:
1784 	case EXIT_REASON_EXCEPTION:
1785 		idtvec_info = vmcs_idt_vectoring_info();
1786 		if (idtvec_info & VMCS_IDT_VEC_VALID) {
1787 			idtvec_info &= ~(1 << 12); /* clear undefined bit */
1788 			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1789 			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1790 				idtvec_err = vmcs_idt_vectoring_err();
1791 				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1792 				    idtvec_err);
1793 			}
1794 			/*
1795 			 * If 'virtual NMIs' are being used and the VM-exit
1796 			 * happened while injecting an NMI during the previous
1797 			 * VM-entry, then clear "blocking by NMI" in the Guest
1798 			 * Interruptibility-state.
1799 			 */
1800 			if ((idtvec_info & VMCS_INTR_T_MASK) ==
1801 			    VMCS_INTR_T_NMI) {
				vmx_clear_nmi_blocking(vmx, vcpu);
1803 			}
1804 			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1805 		}
1806 	default:
1807 		idtvec_info = 0;
1808 		break;
1809 	}
1810 
1811 	switch (reason) {
1812 	case EXIT_REASON_CR_ACCESS:
1813 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1814 		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1815 		break;
1816 	case EXIT_REASON_RDMSR:
1817 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1818 		retu = false;
1819 		ecx = vmxctx->guest_rcx;
1820 		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
1821 		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1822 		if (error) {
1823 			vmexit->exitcode = VM_EXITCODE_RDMSR;
1824 			vmexit->u.msr.code = ecx;
1825 		} else if (!retu) {
1826 			handled = HANDLED;
1827 		} else {
1828 			/* Return to userspace with a valid exitcode */
1829 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1830 			    ("emulate_wrmsr retu with bogus exitcode"));
1831 		}
1832 		break;
1833 	case EXIT_REASON_WRMSR:
1834 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1835 		retu = false;
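		/*
		 * The MSR index is in ECX and the 64-bit value in EDX:EAX.
		 */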
1836 		eax = vmxctx->guest_rax;
1837 		ecx = vmxctx->guest_rcx;
1838 		edx = vmxctx->guest_rdx;
1839 		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
1840 		    ecx, (uint64_t)edx << 32 | eax);
1841 		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1842 		    (uint64_t)edx << 32 | eax, &retu);
1843 		if (error) {
1844 			vmexit->exitcode = VM_EXITCODE_WRMSR;
1845 			vmexit->u.msr.code = ecx;
1846 			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1847 		} else if (!retu) {
1848 			handled = HANDLED;
1849 		} else {
1850 			/* Return to userspace with a valid exitcode */
1851 			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1852 			    ("emulate_wrmsr retu with bogus exitcode"));
1853 		}
1854 		break;
1855 	case EXIT_REASON_HLT:
1856 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1857 		vmexit->exitcode = VM_EXITCODE_HLT;
1858 		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1859 		break;
1860 	case EXIT_REASON_MTF:
1861 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1862 		vmexit->exitcode = VM_EXITCODE_MTRAP;
1863 		break;
1864 	case EXIT_REASON_PAUSE:
1865 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1866 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1867 		break;
1868 	case EXIT_REASON_INTR_WINDOW:
1869 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1870 		vmx_clear_int_window_exiting(vmx, vcpu);
1871 		return (1);
1872 	case EXIT_REASON_EXT_INTR:
1873 		/*
1874 		 * External interrupts serve only to cause VM exits and allow
1875 		 * the host interrupt handler to run.
1876 		 *
1877 		 * If this external interrupt triggers a virtual interrupt
1878 		 * to a VM, then that state will be recorded by the
1879 		 * host interrupt handler in the VM's softc. We will inject
1880 		 * this virtual interrupt during the subsequent VM enter.
1881 		 */
1882 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1883 
1884 		/*
1885 		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
1886 		 * This appears to be a bug in VMware Fusion?
1887 		 */
1888 		if (!(intr_info & VMCS_INTR_VALID))
1889 			return (1);
1890 		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1891 		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1892 		    ("VM exit interruption info invalid: %#x", intr_info));
1893 		vmx_trigger_hostintr(intr_info & 0xff);
1894 
1895 		/*
		 * This is special. We want to treat this as a 'handled'
1897 		 * VM-exit but not increment the instruction pointer.
1898 		 */
1899 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1900 		return (1);
1901 	case EXIT_REASON_NMI_WINDOW:
1902 		/* Exit to allow the pending virtual NMI to be injected */
1903 		if (vm_nmi_pending(vmx->vm, vcpu))
1904 			vmx_inject_nmi(vmx, vcpu);
1905 		vmx_clear_nmi_window_exiting(vmx, vcpu);
1906 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1907 		return (1);
1908 	case EXIT_REASON_INOUT:
1909 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1910 		vmexit->exitcode = VM_EXITCODE_INOUT;
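		/*
		 * Decode the exit qualification for I/O instructions: bits
		 * 2:0 hold the access size minus one, bit 3 distinguishes
		 * IN from OUT, bit 4 flags string instructions, bit 5 flags
		 * a REP prefix and bits 31:16 hold the port number.
		 */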
1911 		vmexit->u.inout.bytes = (qual & 0x7) + 1;
1912 		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1913 		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1914 		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1915 		vmexit->u.inout.port = (uint16_t)(qual >> 16);
1916 		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1917 		error = emulate_ioport(vmx->vm, vcpu, vmexit);
		if (error == 0) {
			handled = HANDLED;
1920 			vmxctx->guest_rax = vmexit->u.inout.eax;
1921 		}
1922 		break;
1923 	case EXIT_REASON_CPUID:
1924 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1925 		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1926 		break;
1927 	case EXIT_REASON_EXCEPTION:
1928 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1929 		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1930 		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1931 		    ("VM exit interruption info invalid: %#x", intr_info));
1932 
1933 		/*
1934 		 * If Virtual NMIs control is 1 and the VM-exit is due to a
1935 		 * fault encountered during the execution of IRET then we must
1936 		 * restore the state of "virtual-NMI blocking" before resuming
1937 		 * the guest.
1938 		 *
1939 		 * See "Resuming Guest Software after Handling an Exception".
1940 		 */
1941 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1942 		    (intr_info & 0xff) != IDT_DF &&
1943 		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1944 			vmx_restore_nmi_blocking(vmx, vcpu);
1945 
1946 		/*
1947 		 * The NMI has already been handled in vmx_exit_handle_nmi().
1948 		 */
1949 		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1950 			return (1);
1951 		break;
1952 	case EXIT_REASON_EPT_FAULT:
1953 		/*
1954 		 * If 'gpa' lies within the address space allocated to
		 * memory then this must be a nested page fault; otherwise
		 * it must be an instruction that accesses MMIO space.
1957 		 */
1958 		gpa = vmcs_gpa();
1959 		if (vm_mem_allocated(vmx->vm, gpa) ||
1960 		    apic_access_fault(vmx, vcpu, gpa)) {
1961 			vmexit->exitcode = VM_EXITCODE_PAGING;
1962 			vmexit->u.paging.gpa = gpa;
1963 			vmexit->u.paging.fault_type = ept_fault_type(qual);
1964 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1965 		} else if (ept_emulation_fault(qual)) {
1966 			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
1967 			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
1968 		}
1969 		/*
1970 		 * If Virtual NMIs control is 1 and the VM-exit is due to an
1971 		 * EPT fault during the execution of IRET then we must restore
1972 		 * the state of "virtual-NMI blocking" before resuming.
1973 		 *
1974 		 * See description of "NMI unblocking due to IRET" in
1975 		 * "Exit Qualification for EPT Violations".
1976 		 */
1977 		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1978 		    (qual & EXIT_QUAL_NMIUDTI) != 0)
1979 			vmx_restore_nmi_blocking(vmx, vcpu);
1980 		break;
1981 	case EXIT_REASON_VIRTUALIZED_EOI:
1982 		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
1983 		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
1984 		vmexit->inst_length = 0;	/* trap-like */
1985 		break;
1986 	case EXIT_REASON_APIC_ACCESS:
1987 		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1988 		break;
1989 	case EXIT_REASON_APIC_WRITE:
1990 		/*
1991 		 * APIC-write VM exit is trap-like so the %rip is already
1992 		 * pointing to the next instruction.
1993 		 */
1994 		vmexit->inst_length = 0;
1995 		vlapic = vm_lapic(vmx->vm, vcpu);
1996 		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
1997 		break;
1998 	case EXIT_REASON_XSETBV:
1999 		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2000 		break;
2001 	default:
2002 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2003 		break;
2004 	}
2005 
2006 	if (handled) {
2007 		/*
2008 		 * It is possible that control is returned to userland
2009 		 * even though we were able to handle the VM exit in the
2010 		 * kernel.
2011 		 *
2012 		 * In such a case we want to make sure that the userland
2013 		 * restarts guest execution at the instruction *after*
2014 		 * the one we just processed. Therefore we update the
2015 		 * guest rip in the VMCS and in 'vmexit'.
2016 		 */
2017 		vmexit->rip += vmexit->inst_length;
2018 		vmexit->inst_length = 0;
2019 		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2020 	} else {
2021 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2022 			/*
2023 			 * If this VM exit was not claimed by anybody then
2024 			 * treat it as a generic VMX exit.
2025 			 */
2026 			vmexit->exitcode = VM_EXITCODE_VMX;
2027 			vmexit->u.vmx.status = VM_SUCCESS;
2028 			vmexit->u.vmx.inst_type = 0;
2029 			vmexit->u.vmx.inst_error = 0;
2030 		} else {
2031 			/*
2032 			 * The exitcode and collateral have been populated.
2033 			 * The VM exit will be processed further in userland.
2034 			 */
2035 		}
2036 	}
2037 	return (handled);
2038 }
2039 
2040 static __inline int
2041 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2042 {
2043 
2044 	vmexit->rip = vmcs_guest_rip();
2045 	vmexit->inst_length = 0;
2046 	vmexit->exitcode = VM_EXITCODE_BOGUS;
2047 	vmx_astpending_trace(vmx, vcpu, vmexit->rip);
2048 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
2049 
2050 	return (HANDLED);
2051 }
2052 
2053 static __inline int
2054 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2055 {
2056 
2057 	vmexit->rip = vmcs_guest_rip();
2058 	vmexit->inst_length = 0;
2059 	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
2060 	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
2061 
2062 	return (UNHANDLED);
2063 }
2064 
2065 static __inline int
2066 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2067 {
2068 
2069 	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2070 	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
2071 	    vmxctx->inst_fail_status));
2072 
2073 	vmexit->inst_length = 0;
2074 	vmexit->exitcode = VM_EXITCODE_VMX;
2075 	vmexit->u.vmx.status = vmxctx->inst_fail_status;
2076 	vmexit->u.vmx.inst_error = vmcs_instruction_error();
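	/* The VM entry itself failed, so the usual exit fields are invalid. */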
2077 	vmexit->u.vmx.exit_reason = ~0;
2078 	vmexit->u.vmx.exit_qualification = ~0;
2079 
2080 	switch (rc) {
2081 	case VMX_VMRESUME_ERROR:
2082 	case VMX_VMLAUNCH_ERROR:
2083 	case VMX_INVEPT_ERROR:
2084 		vmexit->u.vmx.inst_type = rc;
2085 		break;
2086 	default:
2087 		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2088 	}
2089 
2090 	return (UNHANDLED);
2091 }
2092 
2093 /*
2094  * If the NMI-exiting VM execution control is set to '1' then an NMI in
2095  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2096  * sufficient to simply vector to the NMI handler via a software interrupt.
 * However, this must be done before maskable interrupts are enabled,
2098  * otherwise the "iret" issued by an interrupt handler will incorrectly
2099  * clear NMI blocking.
2100  */
2101 static __inline void
2102 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2103 {
2104 	uint32_t intr_info;
2105 
2106 	KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2107 
2108 	if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2109 		return;
2110 
2111 	intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2112 	KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2113 	    ("VM exit interruption info invalid: %#x", intr_info));
2114 
2115 	if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2116 		KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2117 		    "to NMI has invalid vector: %#x", intr_info));
2118 		VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
2119 		__asm __volatile("int $2");
2120 	}
2121 }
2122 
2123 static int
2124 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
2125     void *rendezvous_cookie, void *suspend_cookie)
2126 {
2127 	int rc, handled, launched;
2128 	struct vmx *vmx;
2129 	struct vm *vm;
2130 	struct vmxctx *vmxctx;
2131 	struct vmcs *vmcs;
2132 	struct vm_exit *vmexit;
2133 	struct vlapic *vlapic;
2134 	uint64_t rip;
2135 	uint32_t exit_reason;
2136 
2137 	vmx = arg;
2138 	vm = vmx->vm;
2139 	vmcs = &vmx->vmcs[vcpu];
2140 	vmxctx = &vmx->ctx[vcpu];
2141 	vlapic = vm_lapic(vm, vcpu);
2142 	vmexit = vm_exitinfo(vm, vcpu);
2143 	launched = 0;
2144 
2145 	KASSERT(vmxctx->pmap == pmap,
2146 	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2147 
2148 	VMPTRLD(vmcs);
2149 
2150 	/*
2151 	 * XXX
	 * We do this every time because we may set up the virtual machine
2153 	 * from a different process than the one that actually runs it.
2154 	 *
2155 	 * If the life of a virtual machine was spent entirely in the context
2156 	 * of a single process we could do this once in vmx_vminit().
2157 	 */
2158 	vmcs_write(VMCS_HOST_CR3, rcr3());
2159 
2160 	vmcs_write(VMCS_GUEST_RIP, startrip);
2161 	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2162 	do {
2163 		/*
2164 		 * Interrupts are disabled from this point on until the
2165 		 * guest starts executing. This is done for the following
2166 		 * reasons:
2167 		 *
2168 		 * If an AST is asserted on this thread after the check below,
2169 		 * then the IPI_AST notification will not be lost, because it
2170 		 * will cause a VM exit due to external interrupt as soon as
2171 		 * the guest state is loaded.
2172 		 *
2173 		 * A posted interrupt after 'vmx_inject_interrupts()' will
2174 		 * not be "lost" because it will be held pending in the host
2175 		 * APIC because interrupts are disabled. The pending interrupt
2176 		 * will be recognized as soon as the guest state is loaded.
2177 		 *
2178 		 * The same reasoning applies to the IPI generated by
2179 		 * pmap_invalidate_ept().
2180 		 */
2181 		disable_intr();
2182 		if (vcpu_suspended(suspend_cookie)) {
2183 			enable_intr();
2184 			vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
2185 			handled = UNHANDLED;
2186 			break;
2187 		}
2188 
2189 		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
2190 			enable_intr();
2191 			handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
2192 			break;
2193 		}
2194 
2195 		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
2196 			enable_intr();
2197 			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
2198 			break;
2199 		}
2200 
2201 		vmx_inject_interrupts(vmx, vcpu, vlapic);
2202 		vmx_run_trace(vmx, vcpu);
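		/*
		 * Enter the guest. The 'launched' flag tells the assembly
		 * stub whether to VMLAUNCH the vmcs on the first entry or
		 * VMRESUME it on subsequent entries.
		 */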
2203 		rc = vmx_enter_guest(vmxctx, vmx, launched);
2204 
2205 		/* Collect some information for VM exit processing */
2206 		vmexit->rip = rip = vmcs_guest_rip();
2207 		vmexit->inst_length = vmexit_instruction_length();
2208 		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2209 		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2210 
2211 		if (rc == VMX_GUEST_VMEXIT) {
2212 			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2213 			enable_intr();
2214 			handled = vmx_exit_process(vmx, vcpu, vmexit);
2215 		} else {
2216 			enable_intr();
2217 			handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
2218 		}
2219 		launched = 1;
2220 		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2221 	} while (handled);
2222 
2223 	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
2226 	 */
2227 	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2228 	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2229 		panic("Mismatch between handled (%d) and exitcode (%d)",
2230 		      handled, vmexit->exitcode);
2231 	}
2232 
2233 	if (!handled)
2234 		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2235 
2236 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2237 	    vmexit->exitcode);
2238 
2239 	VMCLEAR(vmcs);
2240 	return (0);
2241 }
2242 
2243 static void
2244 vmx_vmcleanup(void *arg)
2245 {
2246 	int i;
2247 	struct vmx *vmx = arg;
2248 
2249 	if (apic_access_virtualization(vmx, 0))
2250 		vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2251 
2252 	for (i = 0; i < VM_MAXCPU; i++)
2253 		vpid_free(vmx->state[i].vpid);
2254 
2255 	free(vmx, M_VMX);
2256 
2257 	return;
2258 }
2259 
2260 static register_t *
2261 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2262 {
2263 
2264 	switch (reg) {
2265 	case VM_REG_GUEST_RAX:
2266 		return (&vmxctx->guest_rax);
2267 	case VM_REG_GUEST_RBX:
2268 		return (&vmxctx->guest_rbx);
2269 	case VM_REG_GUEST_RCX:
2270 		return (&vmxctx->guest_rcx);
2271 	case VM_REG_GUEST_RDX:
2272 		return (&vmxctx->guest_rdx);
2273 	case VM_REG_GUEST_RSI:
2274 		return (&vmxctx->guest_rsi);
2275 	case VM_REG_GUEST_RDI:
2276 		return (&vmxctx->guest_rdi);
2277 	case VM_REG_GUEST_RBP:
2278 		return (&vmxctx->guest_rbp);
2279 	case VM_REG_GUEST_R8:
2280 		return (&vmxctx->guest_r8);
2281 	case VM_REG_GUEST_R9:
2282 		return (&vmxctx->guest_r9);
2283 	case VM_REG_GUEST_R10:
2284 		return (&vmxctx->guest_r10);
2285 	case VM_REG_GUEST_R11:
2286 		return (&vmxctx->guest_r11);
2287 	case VM_REG_GUEST_R12:
2288 		return (&vmxctx->guest_r12);
2289 	case VM_REG_GUEST_R13:
2290 		return (&vmxctx->guest_r13);
2291 	case VM_REG_GUEST_R14:
2292 		return (&vmxctx->guest_r14);
2293 	case VM_REG_GUEST_R15:
2294 		return (&vmxctx->guest_r15);
2295 	default:
2296 		break;
2297 	}
2298 	return (NULL);
2299 }
2300 
2301 static int
2302 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2303 {
2304 	register_t *regp;
2305 
2306 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2307 		*retval = *regp;
2308 		return (0);
2309 	} else
2310 		return (EINVAL);
2311 }
2312 
2313 static int
2314 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2315 {
2316 	register_t *regp;
2317 
2318 	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2319 		*regp = val;
2320 		return (0);
2321 	} else
2322 		return (EINVAL);
2323 }
2324 
2325 static int
2326 vmx_shadow_reg(int reg)
2327 {
2328 	int shreg;
2329 
2330 	shreg = -1;
2331 
2332 	switch (reg) {
2333 	case VM_REG_GUEST_CR0:
2334 		shreg = VMCS_CR0_SHADOW;
		break;
	case VM_REG_GUEST_CR4:
2337 		shreg = VMCS_CR4_SHADOW;
2338 		break;
2339 	default:
2340 		break;
2341 	}
2342 
2343 	return (shreg);
2344 }
2345 
2346 static int
2347 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2348 {
2349 	int running, hostcpu;
2350 	struct vmx *vmx = arg;
2351 
2352 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2353 	if (running && hostcpu != curcpu)
2354 		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2355 
2356 	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2357 		return (0);
2358 
2359 	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2360 }
2361 
2362 static int
2363 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2364 {
2365 	int error, hostcpu, running, shadow;
2366 	uint64_t ctls;
2367 	struct vmx *vmx = arg;
2368 
2369 	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2370 	if (running && hostcpu != curcpu)
2371 		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2372 
2373 	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2374 		return (0);
2375 
2376 	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2377 
2378 	if (error == 0) {
2379 		/*
2380 		 * If the "load EFER" VM-entry control is 1 then the
2381 		 * value of EFER.LMA must be identical to "IA-32e mode guest"
2382 		 * bit in the VM-entry control.
2383 		 */
2384 		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2385 		    (reg == VM_REG_GUEST_EFER)) {
2386 			vmcs_getreg(&vmx->vmcs[vcpu], running,
2387 				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2388 			if (val & EFER_LMA)
2389 				ctls |= VM_ENTRY_GUEST_LMA;
2390 			else
2391 				ctls &= ~VM_ENTRY_GUEST_LMA;
2392 			vmcs_setreg(&vmx->vmcs[vcpu], running,
2393 				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2394 		}
2395 
2396 		shadow = vmx_shadow_reg(reg);
2397 		if (shadow > 0) {
2398 			/*
2399 			 * Store the unmodified value in the shadow
2400 			 */
2401 			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2402 				    VMCS_IDENT(shadow), val);
2403 		}
2404 	}
2405 
2406 	return (error);
2407 }
2408 
2409 static int
2410 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2411 {
2412 	struct vmx *vmx = arg;
2413 
2414 	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2415 }
2416 
2417 static int
2418 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2419 {
2420 	struct vmx *vmx = arg;
2421 
2422 	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2423 }
2424 
2425 static int
2426 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2427 {
2428 	struct vmx *vmx = arg;
2429 	int vcap;
2430 	int ret;
2431 
2432 	ret = ENOENT;
2433 
2434 	vcap = vmx->cap[vcpu].set;
2435 
2436 	switch (type) {
2437 	case VM_CAP_HALT_EXIT:
2438 		if (cap_halt_exit)
2439 			ret = 0;
2440 		break;
2441 	case VM_CAP_PAUSE_EXIT:
2442 		if (cap_pause_exit)
2443 			ret = 0;
2444 		break;
2445 	case VM_CAP_MTRAP_EXIT:
2446 		if (cap_monitor_trap)
2447 			ret = 0;
2448 		break;
2449 	case VM_CAP_UNRESTRICTED_GUEST:
2450 		if (cap_unrestricted_guest)
2451 			ret = 0;
2452 		break;
2453 	case VM_CAP_ENABLE_INVPCID:
2454 		if (cap_invpcid)
2455 			ret = 0;
2456 		break;
2457 	default:
2458 		break;
2459 	}
2460 
2461 	if (ret == 0)
2462 		*retval = (vcap & (1 << type)) ? 1 : 0;
2463 
2464 	return (ret);
2465 }
2466 
2467 static int
2468 vmx_setcap(void *arg, int vcpu, int type, int val)
2469 {
2470 	struct vmx *vmx = arg;
2471 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
2472 	uint32_t baseval;
2473 	uint32_t *pptr;
2474 	int error;
2475 	int flag;
2476 	int reg;
2477 	int retval;
2478 
2479 	retval = ENOENT;
2480 	pptr = NULL;
2481 
2482 	switch (type) {
2483 	case VM_CAP_HALT_EXIT:
2484 		if (cap_halt_exit) {
2485 			retval = 0;
2486 			pptr = &vmx->cap[vcpu].proc_ctls;
2487 			baseval = *pptr;
2488 			flag = PROCBASED_HLT_EXITING;
2489 			reg = VMCS_PRI_PROC_BASED_CTLS;
2490 		}
2491 		break;
2492 	case VM_CAP_MTRAP_EXIT:
2493 		if (cap_monitor_trap) {
2494 			retval = 0;
2495 			pptr = &vmx->cap[vcpu].proc_ctls;
2496 			baseval = *pptr;
2497 			flag = PROCBASED_MTF;
2498 			reg = VMCS_PRI_PROC_BASED_CTLS;
2499 		}
2500 		break;
2501 	case VM_CAP_PAUSE_EXIT:
2502 		if (cap_pause_exit) {
2503 			retval = 0;
2504 			pptr = &vmx->cap[vcpu].proc_ctls;
2505 			baseval = *pptr;
2506 			flag = PROCBASED_PAUSE_EXITING;
2507 			reg = VMCS_PRI_PROC_BASED_CTLS;
2508 		}
2509 		break;
2510 	case VM_CAP_UNRESTRICTED_GUEST:
2511 		if (cap_unrestricted_guest) {
2512 			retval = 0;
2513 			pptr = &vmx->cap[vcpu].proc_ctls2;
2514 			baseval = *pptr;
2515 			flag = PROCBASED2_UNRESTRICTED_GUEST;
2516 			reg = VMCS_SEC_PROC_BASED_CTLS;
2517 		}
2518 		break;
2519 	case VM_CAP_ENABLE_INVPCID:
2520 		if (cap_invpcid) {
2521 			retval = 0;
2522 			pptr = &vmx->cap[vcpu].proc_ctls2;
2523 			baseval = *pptr;
2524 			flag = PROCBASED2_ENABLE_INVPCID;
2525 			reg = VMCS_SEC_PROC_BASED_CTLS;
2526 		}
2527 		break;
2528 	default:
2529 		break;
2530 	}
2531 
2532 	if (retval == 0) {
2533 		if (val) {
2534 			baseval |= flag;
2535 		} else {
2536 			baseval &= ~flag;
2537 		}
2538 		VMPTRLD(vmcs);
2539 		error = vmwrite(reg, baseval);
2540 		VMCLEAR(vmcs);
2541 
2542 		if (error) {
2543 			retval = error;
2544 		} else {
2545 			/*
2546 			 * Update optional stored flags, and record
2547 			 * setting
2548 			 */
2549 			if (pptr != NULL) {
2550 				*pptr = baseval;
2551 			}
2552 
2553 			if (val) {
2554 				vmx->cap[vcpu].set |= (1 << type);
2555 			} else {
2556 				vmx->cap[vcpu].set &= ~(1 << type);
2557 			}
2558 		}
2559 	}
2560 
	return (retval);
2562 }
2563 
2564 struct vlapic_vtx {
2565 	struct vlapic	vlapic;
2566 	struct pir_desc	*pir_desc;
2567 	struct vmx	*vmx;
2568 };
2569 
2570 #define	VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)	\
2571 do {									\
2572 	VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",	\
2573 	    level ? "level" : "edge", vector);				\
2574 	VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);	\
2575 	VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);	\
2576 	VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);	\
2577 	VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);	\
2578 	VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2579 } while (0)
2580 
2581 /*
2582  * vlapic->ops handlers that utilize the APICv hardware assist described in
2583  * Chapter 29 of the Intel SDM.
2584  */
2585 static int
2586 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2587 {
2588 	struct vlapic_vtx *vlapic_vtx;
2589 	struct pir_desc *pir_desc;
2590 	uint64_t mask;
2591 	int idx, notify;
2592 
2593 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2594 	pir_desc = vlapic_vtx->pir_desc;
2595 
2596 	/*
2597 	 * Keep track of interrupt requests in the PIR descriptor. This is
2598 	 * because the virtual APIC page pointed to by the VMCS cannot be
2599 	 * modified if the vcpu is running.
2600 	 */
2601 	idx = vector / 64;
2602 	mask = 1UL << (vector % 64);
2603 	atomic_set_long(&pir_desc->pir[idx], mask);
2604 	notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2605 
2606 	VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2607 	    level, "vmx_set_intr_ready");
2608 	return (notify);
2609 }
2610 
2611 static int
2612 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2613 {
2614 	struct vlapic_vtx *vlapic_vtx;
2615 	struct pir_desc *pir_desc;
2616 	struct LAPIC *lapic;
2617 	uint64_t pending, pirval;
2618 	uint32_t ppr, vpr;
2619 	int i;
2620 
2621 	/*
2622 	 * This function is only expected to be called from the 'HLT' exit
2623 	 * handler which does not care about the vector that is pending.
2624 	 */
2625 	KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2626 
2627 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2628 	pir_desc = vlapic_vtx->pir_desc;
2629 
2630 	pending = atomic_load_acq_long(&pir_desc->pending);
2631 	if (!pending)
2632 		return (0);	/* common case */
2633 
2634 	/*
2635 	 * If there is an interrupt pending then it will be recognized only
2636 	 * if its priority is greater than the processor priority.
2637 	 *
2638 	 * Special case: if the processor priority is zero then any pending
2639 	 * interrupt will be recognized.
2640 	 */
2641 	lapic = vlapic->apic_page;
2642 	ppr = lapic->ppr & 0xf0;
2643 	if (ppr == 0)
2644 		return (1);
2645 
2646 	VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2647 	    lapic->ppr);
2648 
2649 	for (i = 3; i >= 0; i--) {
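		/*
		 * Scan from the highest PIR word down; the first non-zero
		 * word holds the highest pending vector, whose priority
		 * class is compared against the processor priority.
		 */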
2650 		pirval = pir_desc->pir[i];
2651 		if (pirval != 0) {
2652 			vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2653 			return (vpr > ppr);
2654 		}
2655 	}
2656 	return (0);
2657 }
2658 
2659 static void
2660 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2661 {
2662 
2663 	panic("vmx_intr_accepted: not expected to be called");
2664 }
2665 
2666 static void
2667 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2668 {
2669 	struct vlapic_vtx *vlapic_vtx;
2670 	struct vmx *vmx;
2671 	struct vmcs *vmcs;
2672 	uint64_t mask, val;
2673 
2674 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2675 	KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2676 	    ("vmx_set_tmr: vcpu cannot be running"));
2677 
2678 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2679 	vmx = vlapic_vtx->vmx;
2680 	vmcs = &vmx->vmcs[vlapic->vcpuid];
2681 	mask = 1UL << (vector % 64);
2682 
2683 	VMPTRLD(vmcs);
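	/*
	 * The EOI-exit bitmap is 256 bits split across four 64-bit VMCS
	 * fields; VMCS_EOI_EXIT(vector) selects the field that contains
	 * this vector's bit.
	 */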
2684 	val = vmcs_read(VMCS_EOI_EXIT(vector));
2685 	if (level)
2686 		val |= mask;
2687 	else
2688 		val &= ~mask;
2689 	vmcs_write(VMCS_EOI_EXIT(vector), val);
2690 	VMCLEAR(vmcs);
2691 }
2692 
2693 static void
2694 vmx_enable_x2apic_mode(struct vlapic *vlapic)
2695 {
2696 	struct vmx *vmx;
2697 	struct vmcs *vmcs;
2698 	uint32_t proc_ctls2;
2699 	int vcpuid, error;
2700 
2701 	vcpuid = vlapic->vcpuid;
2702 	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
2703 	vmcs = &vmx->vmcs[vcpuid];
2704 
2705 	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
2706 	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
2707 	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
2708 
2709 	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
2710 	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
2711 	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
2712 
2713 	VMPTRLD(vmcs);
2714 	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
2715 	VMCLEAR(vmcs);
2716 
2717 	if (vlapic->vcpuid == 0) {
2718 		/*
2719 		 * The nested page table mappings are shared by all vcpus
2720 		 * so unmap the APIC access page just once.
2721 		 */
2722 		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2723 		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
2724 		    __func__, error));
2725 
2726 		/*
2727 		 * The MSR bitmap is shared by all vcpus so modify it only
2728 		 * once in the context of vcpu 0.
2729 		 */
2730 		error = vmx_allow_x2apic_msrs(vmx);
2731 		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
2732 		    __func__, error));
2733 	}
2734 }
2735 
2736 static void
2737 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2738 {
2739 
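	/*
	 * Send the posted-interrupt notification IPI to the host cpu that
	 * the vcpu is running on; the processor then delivers the pending
	 * PIR bits to the guest without a VM exit.
	 */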
2740 	ipi_cpu(hostcpu, pirvec);
2741 }
2742 
2743 /*
2744  * Transfer the pending interrupts in the PIR descriptor to the IRR
2745  * in the virtual APIC page.
2746  */
2747 static void
2748 vmx_inject_pir(struct vlapic *vlapic)
2749 {
2750 	struct vlapic_vtx *vlapic_vtx;
2751 	struct pir_desc *pir_desc;
2752 	struct LAPIC *lapic;
2753 	uint64_t val, pirval;
2754 	int rvi, pirbase = -1;
2755 	uint16_t intr_status_old, intr_status_new;
2756 
2757 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2758 	pir_desc = vlapic_vtx->pir_desc;
2759 	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2760 		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2761 		    "no posted interrupt pending");
2762 		return;
2763 	}
2764 
2765 	pirval = 0;
2766 	pirbase = -1;
2767 	lapic = vlapic->apic_page;
2768 
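	/*
	 * Atomically harvest each 64-bit PIR word and merge it into the
	 * corresponding pair of 32-bit IRR registers in the virtual APIC
	 * page; the highest non-zero word is remembered for the RVI update
	 * below.
	 */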
2769 	val = atomic_readandclear_long(&pir_desc->pir[0]);
2770 	if (val != 0) {
2771 		lapic->irr0 |= val;
2772 		lapic->irr1 |= val >> 32;
2773 		pirbase = 0;
2774 		pirval = val;
2775 	}
2776 
2777 	val = atomic_readandclear_long(&pir_desc->pir[1]);
2778 	if (val != 0) {
2779 		lapic->irr2 |= val;
2780 		lapic->irr3 |= val >> 32;
2781 		pirbase = 64;
2782 		pirval = val;
2783 	}
2784 
2785 	val = atomic_readandclear_long(&pir_desc->pir[2]);
2786 	if (val != 0) {
2787 		lapic->irr4 |= val;
2788 		lapic->irr5 |= val >> 32;
2789 		pirbase = 128;
2790 		pirval = val;
2791 	}
2792 
2793 	val = atomic_readandclear_long(&pir_desc->pir[3]);
2794 	if (val != 0) {
2795 		lapic->irr6 |= val;
2796 		lapic->irr7 |= val >> 32;
2797 		pirbase = 192;
2798 		pirval = val;
2799 	}
2800 
2801 	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2802 
2803 	/*
2804 	 * Update RVI so the processor can evaluate pending virtual
2805 	 * interrupts on VM-entry.
2806 	 *
2807 	 * It is possible for pirval to be 0 here, even though the
2808 	 * pending bit has been set. The scenario is:
2809 	 * CPU-Y is sending a posted interrupt to CPU-X, which
2810 	 * is running a guest and processing posted interrupts in h/w.
2811 	 * CPU-X will eventually exit and the state seen in s/w is
2812 	 * the pending bit set, but no PIR bits set.
2813 	 *
2814 	 *      CPU-X                      CPU-Y
2815 	 *   (vm running)                (host running)
2816 	 *   rx posted interrupt
2817 	 *   CLEAR pending bit
2818 	 *				 SET PIR bit
2819 	 *   READ/CLEAR PIR bits
2820 	 *				 SET pending bit
2821 	 *   (vm exit)
2822 	 *   pending bit set, PIR 0
2823 	 */
2824 	if (pirval != 0) {
2825 		rvi = pirbase + flsl(pirval) - 1;
2826 		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2827 		intr_status_new = (intr_status_old & 0xFF00) | rvi;
2828 		if (intr_status_new > intr_status_old) {
2829 			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2830 			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2831 			    "guest_intr_status changed from 0x%04x to 0x%04x",
2832 			    intr_status_old, intr_status_new);
2833 		}
2834 	}
2835 }
2836 
2837 static struct vlapic *
2838 vmx_vlapic_init(void *arg, int vcpuid)
2839 {
2840 	struct vmx *vmx;
2841 	struct vlapic *vlapic;
2842 	struct vlapic_vtx *vlapic_vtx;
2843 
2844 	vmx = arg;
2845 
2846 	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2847 	vlapic->vm = vmx->vm;
2848 	vlapic->vcpuid = vcpuid;
2849 	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2850 
2851 	vlapic_vtx = (struct vlapic_vtx *)vlapic;
2852 	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2853 	vlapic_vtx->vmx = vmx;
2854 
2855 	if (virtual_interrupt_delivery) {
2856 		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2857 		vlapic->ops.pending_intr = vmx_pending_intr;
2858 		vlapic->ops.intr_accepted = vmx_intr_accepted;
2859 		vlapic->ops.set_tmr = vmx_set_tmr;
2860 		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
2861 	}
2862 
2863 	if (posted_interrupts)
2864 		vlapic->ops.post_intr = vmx_post_intr;
2865 
2866 	vlapic_init(vlapic);
2867 
2868 	return (vlapic);
2869 }
2870 
2871 static void
2872 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2873 {
2874 
2875 	vlapic_cleanup(vlapic);
2876 	free(vlapic, M_VLAPIC);
2877 }
2878 
2879 struct vmm_ops vmm_ops_intel = {
2880 	vmx_init,
2881 	vmx_cleanup,
2882 	vmx_restore,
2883 	vmx_vminit,
2884 	vmx_run,
2885 	vmx_vmcleanup,
2886 	vmx_getreg,
2887 	vmx_setreg,
2888 	vmx_getdesc,
2889 	vmx_setdesc,
2890 	vmx_getcap,
2891 	vmx_setcap,
2892 	ept_vmspace_alloc,
2893 	ept_vmspace_free,
2894 	vmx_vlapic_init,
2895 	vmx_vlapic_cleanup,
2896 };
2897