1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2011 NetApp, Inc.
5 * All rights reserved.
6 * Copyright (c) 2018 Joyent, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29 /*
30 * This file and its contents are supplied under the terms of the
31 * Common Development and Distribution License ("CDDL"), version 1.0.
32 * You may only use this file in accordance with the terms of version
33 * 1.0 of the CDDL.
34 *
35 * A full copy of the text of the CDDL should have accompanied this
36 * source. A copy of the CDDL is also available via the Internet at
37 * http://www.illumos.org/license/CDDL.
38 *
39 * Copyright 2015 Pluribus Networks Inc.
40 * Copyright 2018 Joyent, Inc.
41 * Copyright 2022 Oxide Computer Company
42 * Copyright 2022 MNX Cloud, Inc.
43 */
44
45 #include <sys/cdefs.h>
46
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kernel.h>
50 #include <sys/kmem.h>
51 #include <sys/pcpu.h>
52 #include <sys/proc.h>
53 #include <sys/sysctl.h>
54
55 #include <sys/x86_archext.h>
56 #include <sys/smp_impldefs.h>
57 #include <sys/smt.h>
58 #include <sys/hma.h>
59 #include <sys/trap.h>
60 #include <sys/archsystm.h>
61
62 #include <machine/psl.h>
63 #include <machine/cpufunc.h>
64 #include <machine/md_var.h>
65 #include <machine/reg.h>
66 #include <machine/segments.h>
67 #include <machine/specialreg.h>
68 #include <machine/vmparam.h>
69 #include <sys/vmm_vm.h>
70 #include <sys/vmm_kernel.h>
71
72 #include <machine/vmm.h>
73 #include <machine/vmm_dev.h>
74 #include <sys/vmm_instruction_emul.h>
75 #include "vmm_lapic.h"
76 #include "vmm_host.h"
77 #include "vmm_ioport.h"
78 #include "vmm_stat.h"
79 #include "vatpic.h"
80 #include "vlapic.h"
81 #include "vlapic_priv.h"
82
83 #include "vmcs.h"
84 #include "vmx.h"
85 #include "vmx_msr.h"
86 #include "vmx_controls.h"
87
88 #define PINBASED_CTLS_ONE_SETTING \
89 (PINBASED_EXTINT_EXITING | \
90 PINBASED_NMI_EXITING | \
91 PINBASED_VIRTUAL_NMI)
92 #define PINBASED_CTLS_ZERO_SETTING 0
93
94 #define PROCBASED_CTLS_WINDOW_SETTING \
95 (PROCBASED_INT_WINDOW_EXITING | \
96 PROCBASED_NMI_WINDOW_EXITING)
97
98 /*
99 * Distinct from FreeBSD bhyve, we consider several additional proc-based
100 * controls necessary:
101 * - TSC offsetting
102 * - HLT exiting
103 */
104 #define PROCBASED_CTLS_ONE_SETTING \
105 (PROCBASED_SECONDARY_CONTROLS | \
106 PROCBASED_TSC_OFFSET | \
107 PROCBASED_HLT_EXITING | \
108 PROCBASED_MWAIT_EXITING | \
109 PROCBASED_MONITOR_EXITING | \
110 PROCBASED_IO_EXITING | \
111 PROCBASED_MSR_BITMAPS | \
112 PROCBASED_CTLS_WINDOW_SETTING | \
113 PROCBASED_CR8_LOAD_EXITING | \
114 PROCBASED_CR8_STORE_EXITING)
115
116 #define PROCBASED_CTLS_ZERO_SETTING \
117 (PROCBASED_CR3_LOAD_EXITING | \
118 PROCBASED_CR3_STORE_EXITING | \
119 PROCBASED_IO_BITMAPS)
120
121 /*
122 * EPT and Unrestricted Guest are considered necessities. The latter is not a
123 * requirement on FreeBSD, where grub2-bhyve is used to load guests directly
124 * without a bootrom starting in real mode.
125 */
126 #define PROCBASED_CTLS2_ONE_SETTING \
127 (PROCBASED2_ENABLE_EPT | \
128 PROCBASED2_UNRESTRICTED_GUEST)
129 #define PROCBASED_CTLS2_ZERO_SETTING 0
130
131 #define VM_EXIT_CTLS_ONE_SETTING \
132 (VM_EXIT_SAVE_DEBUG_CONTROLS | \
133 VM_EXIT_HOST_LMA | \
134 VM_EXIT_LOAD_PAT | \
135 VM_EXIT_SAVE_EFER | \
136 VM_EXIT_LOAD_EFER | \
137 VM_EXIT_ACKNOWLEDGE_INTERRUPT)
138
139 #define VM_EXIT_CTLS_ZERO_SETTING 0
140
141 #define VM_ENTRY_CTLS_ONE_SETTING \
142 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
143 VM_ENTRY_LOAD_EFER)
144
145 #define VM_ENTRY_CTLS_ZERO_SETTING \
146 (VM_ENTRY_INTO_SMM | \
147 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
148
149 /*
150 * Cover the EPT capabilities used by bhyve at present:
151 * - 4-level page walks
152 * - write-back memory type
153 * - INVEPT operations (all types)
154 * - INVVPID operations (single-context only)
155 */
156 #define EPT_CAPS_REQUIRED \
157 (IA32_VMX_EPT_VPID_PWL4 | \
158 IA32_VMX_EPT_VPID_TYPE_WB | \
159 IA32_VMX_EPT_VPID_INVEPT | \
160 IA32_VMX_EPT_VPID_INVEPT_SINGLE | \
161 IA32_VMX_EPT_VPID_INVEPT_ALL | \
162 IA32_VMX_EPT_VPID_INVVPID | \
163 IA32_VMX_EPT_VPID_INVVPID_SINGLE)
164
165 #define HANDLED 1
166 #define UNHANDLED 0
167
168 SYSCTL_DECL(_hw_vmm);
169 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
170 NULL);
171
172 /*
173 * TSC scaling related constants
174 */
175 #define INTEL_TSCM_INT_SIZE 16
176 #define INTEL_TSCM_FRAC_SIZE 48
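/*
 * These describe the fixed-point format of the hardware TSC multiplier used
 * for TSC scaling: an unsigned 64-bit value with 16 integer bits and 48
 * fractional bits.
 */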
177
178 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
179 static uint32_t exit_ctls, entry_ctls;
180
181 static uint64_t cr0_ones_mask, cr0_zeros_mask;
182
183 static uint64_t cr4_ones_mask, cr4_zeros_mask;
184
185 static int vmx_initialized;
186
187 /*
188 * Optional capabilities
189 */
190
191 /* PAUSE triggers a VM-exit */
192 static int cap_pause_exit;
193
194 /* WBINVD triggers a VM-exit */
195 static int cap_wbinvd_exit;
196
197 /* Monitor trap flag */
198 static int cap_monitor_trap;
199
200 /* Guests are allowed to use INVPCID */
201 static int cap_invpcid;
202
203 /* Extra capabilities (VMX_CAP_*) beyond the minimum */
204 static enum vmx_caps vmx_capabilities;
205
206 /* APICv posted interrupt vector */
207 static int pirvec = -1;
208
209 static uint_t vpid_alloc_failed;
210
211 int guest_l1d_flush;
212 int guest_l1d_flush_sw;
213
214 /* MSR save region is composed of an array of 'struct msr_entry' */
215 struct msr_entry {
216 uint32_t index;
217 uint32_t reserved;
218 uint64_t val;
219 };
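/*
 * This layout matches the 16-byte entries of the VM-entry MSR-load and
 * VM-exit MSR-store areas: a 32-bit MSR index, 32 reserved bits, and the
 * 64-bit MSR value.
 */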
220
221 static struct msr_entry msr_load_list[1] __aligned(16);
222
223 /*
224 * The definitions of SDT probes for VMX.
225 */
226
227 /* BEGIN CSTYLED */
228 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
229 "struct vmx *", "int", "struct vm_exit *");
230
231 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
232 "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
233
234 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
235 "struct vmx *", "int", "struct vm_exit *", "uint64_t");
236
237 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
238 "struct vmx *", "int", "struct vm_exit *", "uint32_t");
239
240 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
241 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
242
243 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
244 "struct vmx *", "int", "struct vm_exit *");
245
246 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
247 "struct vmx *", "int", "struct vm_exit *");
248
249 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
250 "struct vmx *", "int", "struct vm_exit *");
251
252 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
253 "struct vmx *", "int", "struct vm_exit *");
254
255 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
256 "struct vmx *", "int", "struct vm_exit *", "uint32_t");
257
258 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
259 "struct vmx *", "int", "struct vm_exit *");
260
261 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
262 "struct vmx *", "int", "struct vm_exit *");
263
264 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
265 "struct vmx *", "int", "struct vm_exit *");
266
267 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
268 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
269
270 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
271 "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
272
273 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
274 "struct vmx *", "int", "struct vm_exit *", "uint64_t");
275
276 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
277 "struct vmx *", "int", "struct vm_exit *");
278
279 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
280 "struct vmx *", "int", "struct vm_exit *");
281
282 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
283 "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
284
285 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
286 "struct vmx *", "int", "struct vm_exit *");
287
288 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
289 "struct vmx *", "int", "struct vm_exit *");
290
291 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
292 "struct vmx *", "int", "struct vm_exit *");
293
294 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
295 "struct vmx *", "int", "struct vm_exit *");
296
297 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
298 "struct vmx *", "int", "struct vm_exit *", "uint32_t");
299
300 SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
301 "struct vmx *", "int", "struct vm_exit *", "int");
302 /* END CSTYLED */
303
304 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
305 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
306 static void vmx_apply_tsc_adjust(struct vmx *, int);
307 static void vmx_apicv_sync_tmr(struct vlapic *vlapic);
308 static void vmx_tpr_shadow_enter(struct vlapic *vlapic);
309 static void vmx_tpr_shadow_exit(struct vlapic *vlapic);
310
311 static void
vmx_allow_x2apic_msrs(struct vmx *vmx, int vcpuid)
313 {
314 /*
315 * Allow readonly access to the following x2APIC MSRs from the guest.
316 */
317 guest_msr_ro(vmx, vcpuid, MSR_APIC_ID);
318 guest_msr_ro(vmx, vcpuid, MSR_APIC_VERSION);
319 guest_msr_ro(vmx, vcpuid, MSR_APIC_LDR);
320 guest_msr_ro(vmx, vcpuid, MSR_APIC_SVR);
321
322 for (uint_t i = 0; i < 8; i++) {
323 guest_msr_ro(vmx, vcpuid, MSR_APIC_ISR0 + i);
324 guest_msr_ro(vmx, vcpuid, MSR_APIC_TMR0 + i);
325 guest_msr_ro(vmx, vcpuid, MSR_APIC_IRR0 + i);
326 }
327
328 guest_msr_ro(vmx, vcpuid, MSR_APIC_ESR);
329 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_TIMER);
330 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_THERMAL);
331 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_PCINT);
332 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_LINT0);
333 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_LINT1);
334 guest_msr_ro(vmx, vcpuid, MSR_APIC_LVT_ERROR);
335 guest_msr_ro(vmx, vcpuid, MSR_APIC_ICR_TIMER);
336 guest_msr_ro(vmx, vcpuid, MSR_APIC_DCR_TIMER);
337 guest_msr_ro(vmx, vcpuid, MSR_APIC_ICR);
338
339 /*
340 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
341 *
342 * These registers get special treatment described in the section
343 * "Virtualizing MSR-Based APIC Accesses".
344 */
345 guest_msr_rw(vmx, vcpuid, MSR_APIC_TPR);
346 guest_msr_rw(vmx, vcpuid, MSR_APIC_EOI);
347 guest_msr_rw(vmx, vcpuid, MSR_APIC_SELF_IPI);
348 }
349
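/*
 * Force a proposed %cr0 value to honor the fixed-bit requirements advertised
 * by the IA32_VMX_CR0_FIXED0/FIXED1 MSRs, as captured in cr0_ones_mask and
 * cr0_zeros_mask during vmx_init().  vmx_fix_cr4() below does the same for
 * %cr4.
 */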
350 static ulong_t
vmx_fix_cr0(ulong_t cr0)
352 {
353 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
354 }
355
356 /*
357 * Given a live (VMCS-active) cr0 value, and its shadow counterpart, calculate
358 * the value observable from the guest.
359 */
360 static ulong_t
vmx_unshadow_cr0(uint64_t cr0, uint64_t shadow)
362 {
363 return ((cr0 & ~cr0_ones_mask) |
364 (shadow & (cr0_zeros_mask | cr0_ones_mask)));
365 }
366
367 static ulong_t
vmx_fix_cr4(ulong_t cr4)
369 {
370 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
371 }
372
373 /*
374 * Given a live (VMCS-active) cr4 value, and its shadow counterpart, calculate
375 * the value observable from the guest.
376 */
377 static ulong_t
vmx_unshadow_cr4(uint64_t cr4, uint64_t shadow)
379 {
380 return ((cr4 & ~cr4_ones_mask) |
381 (shadow & (cr4_zeros_mask | cr4_ones_mask)));
382 }
383
384 static void
vpid_free(int vpid)
386 {
387 if (vpid < 0 || vpid > 0xffff)
388 panic("vpid_free: invalid vpid %d", vpid);
389
390 /*
391 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
392 * the unit number allocator.
393 */
394
395 if (vpid > VM_MAXCPU)
396 hma_vmx_vpid_free((uint16_t)vpid);
397 }
398
399 static void
vpid_alloc(uint16_t *vpid, int num)
401 {
402 int i, x;
403
404 if (num <= 0 || num > VM_MAXCPU)
405 panic("invalid number of vpids requested: %d", num);
406
407 /*
408 * If the "enable vpid" execution control is not enabled then the
409 * VPID is required to be 0 for all vcpus.
410 */
411 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
412 for (i = 0; i < num; i++)
413 vpid[i] = 0;
414 return;
415 }
416
417 /*
418 * Allocate a unique VPID for each vcpu from the unit number allocator.
419 */
420 for (i = 0; i < num; i++) {
421 uint16_t tmp;
422
423 tmp = hma_vmx_vpid_alloc();
424 x = (tmp == 0) ? -1 : tmp;
425
426 if (x == -1)
427 break;
428 else
429 vpid[i] = x;
430 }
431
432 if (i < num) {
433 atomic_add_int(&vpid_alloc_failed, 1);
434
435 /*
436 * If the unit number allocator does not have enough unique
437 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
438 *
		 * These VPIDs are not unique across VMs, but this does not
440 * affect correctness because the combined mappings are also
441 * tagged with the EP4TA which is unique for each VM.
442 *
443 * It is still sub-optimal because the invvpid will invalidate
444 * combined mappings for a particular VPID across all EP4TAs.
445 */
446 while (i-- > 0)
447 vpid_free(vpid[i]);
448
449 for (i = 0; i < num; i++)
450 vpid[i] = i + 1;
451 }
452 }
453
454 static int
vmx_cleanup(void)
456 {
457 /* This is taken care of by the hma registration */
458 return (0);
459 }
460
461 static void
vmx_restore(void)
463 {
464 /* No-op on illumos */
465 }
466
467 static int
vmx_init(void)
469 {
470 int error;
471 uint64_t fixed0, fixed1;
472 uint32_t tmp;
473 enum vmx_caps avail_caps = VMX_CAP_NONE;
474
475 /* Check support for primary processor-based VM-execution controls */
476 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
477 MSR_VMX_TRUE_PROCBASED_CTLS,
478 PROCBASED_CTLS_ONE_SETTING,
479 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
480 if (error) {
481 printf("vmx_init: processor does not support desired primary "
482 "processor-based controls\n");
483 return (error);
484 }
485
486 /*
487 * Clear interrupt-window/NMI-window exiting from the default proc-based
488 * controls. They are set and cleared based on runtime vCPU events.
489 */
490 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
491
492 /* Check support for secondary processor-based VM-execution controls */
493 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
494 MSR_VMX_PROCBASED_CTLS2,
495 PROCBASED_CTLS2_ONE_SETTING,
496 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
497 if (error) {
498 printf("vmx_init: processor does not support desired secondary "
499 "processor-based controls\n");
500 return (error);
501 }
502
503 /* Check support for VPID */
504 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
505 MSR_VMX_PROCBASED_CTLS2,
506 PROCBASED2_ENABLE_VPID,
507 0, &tmp);
508 if (error == 0)
509 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
510
511 /* Check support for pin-based VM-execution controls */
512 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
513 MSR_VMX_TRUE_PINBASED_CTLS,
514 PINBASED_CTLS_ONE_SETTING,
515 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
516 if (error) {
517 printf("vmx_init: processor does not support desired "
518 "pin-based controls\n");
519 return (error);
520 }
521
522 /* Check support for VM-exit controls */
523 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
524 VM_EXIT_CTLS_ONE_SETTING,
525 VM_EXIT_CTLS_ZERO_SETTING,
526 &exit_ctls);
527 if (error) {
528 printf("vmx_init: processor does not support desired "
529 "exit controls\n");
530 return (error);
531 }
532
533 /* Check support for VM-entry controls */
534 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
535 VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
536 &entry_ctls);
537 if (error) {
538 printf("vmx_init: processor does not support desired "
539 "entry controls\n");
540 return (error);
541 }
542
543 /*
544 * Check support for optional features by testing them
545 * as individual bits
546 */
547 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
548 MSR_VMX_PROCBASED_CTLS,
549 PROCBASED_MTF, 0,
550 &tmp) == 0);
551
552 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
553 MSR_VMX_TRUE_PROCBASED_CTLS,
554 PROCBASED_PAUSE_EXITING, 0,
555 &tmp) == 0);
556
557 cap_wbinvd_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
558 MSR_VMX_PROCBASED_CTLS2,
559 PROCBASED2_WBINVD_EXITING, 0,
560 &tmp) == 0);
561
562 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
563 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
564 &tmp) == 0);
565
566 /*
567 * Check for APIC virtualization capabilities:
568 * - TPR shadowing
569 * - Full APICv (with or without x2APIC support)
570 * - Posted interrupt handling
571 */
572 if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS,
573 PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0) {
574 avail_caps |= VMX_CAP_TPR_SHADOW;
575
576 const uint32_t apicv_bits =
577 PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
578 PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
579 PROCBASED2_VIRTUALIZE_X2APIC_MODE |
580 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY;
581 if (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
582 MSR_VMX_PROCBASED_CTLS2, apicv_bits, 0, &tmp) == 0) {
583 avail_caps |= VMX_CAP_APICV;
584
585 /*
586 * It may make sense in the future to differentiate
587 * hardware (or software) configurations with APICv but
588 * no support for accelerating x2APIC mode.
589 */
590 avail_caps |= VMX_CAP_APICV_X2APIC;
591
592 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
593 MSR_VMX_TRUE_PINBASED_CTLS,
594 PINBASED_POSTED_INTERRUPT, 0, &tmp);
595 if (error == 0) {
596 /*
597 * If the PSM-provided interfaces for requesting
598 * and using a PIR IPI vector are present, use
599 * them for posted interrupts.
600 */
601 if (psm_get_pir_ipivect != NULL &&
602 psm_send_pir_ipi != NULL) {
603 pirvec = psm_get_pir_ipivect();
604 avail_caps |= VMX_CAP_APICV_PIR;
605 }
606 }
607 }
608 }
609
610 /*
611 * Check for necessary EPT capabilities
612 *
613 * TODO: Properly handle when IA32_VMX_EPT_VPID_HW_AD is missing and the
614 * hypervisor intends to utilize dirty page tracking.
615 */
616 uint64_t ept_caps = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP);
617 if ((ept_caps & EPT_CAPS_REQUIRED) != EPT_CAPS_REQUIRED) {
618 cmn_err(CE_WARN, "!Inadequate EPT capabilities: %lx", ept_caps);
619 return (EINVAL);
620 }
621
622 #ifdef __FreeBSD__
623 guest_l1d_flush = (cpu_ia32_arch_caps &
624 IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
625 TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
626
627 /*
628 * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when
629 * available. Otherwise fall back to the software flush
630 * method which loads enough data from the kernel text to
631 * flush existing L1D content, both on VMX entry and on NMI
632 * return.
633 */
634 if (guest_l1d_flush) {
635 if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
636 guest_l1d_flush_sw = 1;
637 TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
638 &guest_l1d_flush_sw);
639 }
640 if (guest_l1d_flush_sw) {
641 if (nmi_flush_l1d_sw <= 1)
642 nmi_flush_l1d_sw = 1;
643 } else {
644 msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
645 msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
646 }
647 }
648 #else
649 /* L1D flushing is taken care of by smt_acquire() and friends */
650 guest_l1d_flush = 0;
651 #endif /* __FreeBSD__ */
652
653 /*
654 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
655 */
656 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
657 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
658 cr0_ones_mask = fixed0 & fixed1;
659 cr0_zeros_mask = ~fixed0 & ~fixed1;
660
661 /*
662 * Since Unrestricted Guest was already verified present, CR0_PE and
663 * CR0_PG are allowed to be set to zero in VMX non-root operation
664 */
665 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
666
667 /*
668 * Do not allow the guest to set CR0_NW or CR0_CD.
669 */
670 cr0_zeros_mask |= (CR0_NW | CR0_CD);
671
672 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
673 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
674 cr4_ones_mask = fixed0 & fixed1;
675 cr4_zeros_mask = ~fixed0 & ~fixed1;
676
677 vmx_msr_init();
678
679 vmx_capabilities = avail_caps;
680 vmx_initialized = 1;
681
682 return (0);
683 }
684
685 static void
vmx_trigger_hostintr(int vector)
687 {
688 VERIFY(vector >= 32 && vector <= 255);
689 vmx_call_isr(vector - 32);
690 }
691
692 static void *
vmx_vminit(struct vm *vm)
694 {
695 uint16_t vpid[VM_MAXCPU];
696 int i, error, datasel;
697 struct vmx *vmx;
698 uint32_t exc_bitmap;
699 uint16_t maxcpus;
700 uint32_t proc_ctls, proc2_ctls, pin_ctls;
701 uint64_t apic_access_pa = UINT64_MAX;
702
703 vmx = kmem_zalloc(sizeof (struct vmx), KM_SLEEP);
704 VERIFY3U((uintptr_t)vmx & PAGE_MASK, ==, 0);
705
706 vmx->vm = vm;
707 vmx->eptp = vmspace_table_root(vm_get_vmspace(vm));
708
709 /*
710 * Clean up EP4TA-tagged guest-physical and combined mappings
711 *
712 * VMX transitions are not required to invalidate any guest physical
713 * mappings. So, it may be possible for stale guest physical mappings
714 * to be present in the processor TLBs.
715 *
716 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
717 */
718 hma_vmx_invept_allcpus((uintptr_t)vmx->eptp);
719
720 vmx_msr_bitmap_initialize(vmx);
721
722 vpid_alloc(vpid, VM_MAXCPU);
723
724 /* Grab the established defaults */
725 proc_ctls = procbased_ctls;
726 proc2_ctls = procbased_ctls2;
727 pin_ctls = pinbased_ctls;
728 /* For now, default to the available capabilities */
729 vmx->vmx_caps = vmx_capabilities;
730
731 if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
732 proc_ctls |= PROCBASED_USE_TPR_SHADOW;
733 proc_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
734 proc_ctls &= ~PROCBASED_CR8_STORE_EXITING;
735 }
736 if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
737 ASSERT(vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW));
738
739 proc2_ctls |= (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
740 PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
741 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
742
743 /*
744 * Allocate a page of memory to back the APIC access address for
745 * when APICv features are in use. Guest MMIO accesses should
746 * never actually reach this page, but rather be intercepted.
747 */
748 vmx->apic_access_page = kmem_zalloc(PAGESIZE, KM_SLEEP);
749 VERIFY3U((uintptr_t)vmx->apic_access_page & PAGEOFFSET, ==, 0);
750 apic_access_pa = vtophys(vmx->apic_access_page);
751
752 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
753 apic_access_pa);
754 /* XXX this should really return an error to the caller */
755 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
756 }
757 if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
758 ASSERT(vmx_cap_en(vmx, VMX_CAP_APICV));
759
760 pin_ctls |= PINBASED_POSTED_INTERRUPT;
761 }
762
763 /* Reflect any enabled defaults in the cap set */
764 int cap_defaults = 0;
765 if ((proc_ctls & PROCBASED_HLT_EXITING) != 0) {
766 cap_defaults |= (1 << VM_CAP_HALT_EXIT);
767 }
768 if ((proc_ctls & PROCBASED_PAUSE_EXITING) != 0) {
769 cap_defaults |= (1 << VM_CAP_PAUSE_EXIT);
770 }
771 if ((proc_ctls & PROCBASED_MTF) != 0) {
772 cap_defaults |= (1 << VM_CAP_MTRAP_EXIT);
773 }
774 if ((proc2_ctls & PROCBASED2_ENABLE_INVPCID) != 0) {
775 cap_defaults |= (1 << VM_CAP_ENABLE_INVPCID);
776 }
777
778 maxcpus = vm_get_maxcpus(vm);
779 datasel = vmm_get_host_datasel();
780 for (i = 0; i < maxcpus; i++) {
781 /*
782 * Cache physical address lookups for various components which
783 * may be required inside the critical_enter() section implied
784 * by VMPTRLD() below.
785 */
786 vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap[i]);
787 vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]);
788 vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]);
789
790 vmx->vmcs_pa[i] = (uintptr_t)vtophys(&vmx->vmcs[i]);
791 vmcs_initialize(&vmx->vmcs[i], vmx->vmcs_pa[i]);
792
793 vmx_msr_guest_init(vmx, i);
794
795 vmcs_load(vmx->vmcs_pa[i]);
796
797 vmcs_write(VMCS_HOST_IA32_PAT, vmm_get_host_pat());
798 vmcs_write(VMCS_HOST_IA32_EFER, vmm_get_host_efer());
799
800 /* Load the control registers */
801 vmcs_write(VMCS_HOST_CR0, vmm_get_host_cr0());
802 vmcs_write(VMCS_HOST_CR4, vmm_get_host_cr4() | CR4_VMXE);
803
804 /* Load the segment selectors */
805 vmcs_write(VMCS_HOST_CS_SELECTOR, vmm_get_host_codesel());
806
807 vmcs_write(VMCS_HOST_ES_SELECTOR, datasel);
808 vmcs_write(VMCS_HOST_SS_SELECTOR, datasel);
809 vmcs_write(VMCS_HOST_DS_SELECTOR, datasel);
810
811 vmcs_write(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel());
812 vmcs_write(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel());
813 vmcs_write(VMCS_HOST_TR_SELECTOR, vmm_get_host_tsssel());
814
815 /*
816 * Configure host sysenter MSRs to be restored on VM exit.
817 * The thread-specific MSR_INTC_SEP_ESP value is loaded in
818 * vmx_run.
819 */
820 vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL);
821 vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP,
822 rdmsr(MSR_SYSENTER_EIP_MSR));
823
824 /* instruction pointer */
825 vmcs_write(VMCS_HOST_RIP, (uint64_t)vmx_exit_guest);
826
		/* link pointer (all-1s indicates no shadow VMCS is in use) */
828 vmcs_write(VMCS_LINK_POINTER, ~0);
829
830 vmcs_write(VMCS_EPTP, vmx->eptp);
831 vmcs_write(VMCS_PIN_BASED_CTLS, pin_ctls);
832 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
833
834 uint32_t use_proc2_ctls = proc2_ctls;
835 if (cap_wbinvd_exit && vcpu_trap_wbinvd(vm, i) != 0)
836 use_proc2_ctls |= PROCBASED2_WBINVD_EXITING;
837 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, use_proc2_ctls);
838
839 vmcs_write(VMCS_EXIT_CTLS, exit_ctls);
840 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
841 vmcs_write(VMCS_MSR_BITMAP, msr_bitmap_pa);
842 vmcs_write(VMCS_VPID, vpid[i]);
843
844 if (guest_l1d_flush && !guest_l1d_flush_sw) {
845 vmcs_write(VMCS_ENTRY_MSR_LOAD,
846 vtophys(&msr_load_list[0]));
847 vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
848 nitems(msr_load_list));
849 vmcs_write(VMCS_EXIT_MSR_STORE, 0);
850 vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
851 }
852
853 /* exception bitmap */
854 if (vcpu_trace_exceptions(vm, i))
855 exc_bitmap = 0xffffffff;
856 else
857 exc_bitmap = 1 << IDT_MC;
858 vmcs_write(VMCS_EXCEPTION_BITMAP, exc_bitmap);
859
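		/* Start %dr6 and %dr7 at their architectural reset values */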
860 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
861 vmcs_write(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
862
863 if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
864 vmcs_write(VMCS_VIRTUAL_APIC, apic_page_pa);
865 }
866
867 if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
868 vmcs_write(VMCS_APIC_ACCESS, apic_access_pa);
869 vmcs_write(VMCS_EOI_EXIT0, 0);
870 vmcs_write(VMCS_EOI_EXIT1, 0);
871 vmcs_write(VMCS_EOI_EXIT2, 0);
872 vmcs_write(VMCS_EOI_EXIT3, 0);
873 }
874 if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
875 vmcs_write(VMCS_PIR_VECTOR, pirvec);
876 vmcs_write(VMCS_PIR_DESC, pir_desc_pa);
877 }
878
879 /*
880 * Set up the CR0/4 masks and configure the read shadow state
881 * to the power-on register value from the Intel Sys Arch.
882 * CR0 - 0x60000010
883 * CR4 - 0
884 */
885 vmcs_write(VMCS_CR0_MASK, cr0_ones_mask | cr0_zeros_mask);
886 vmcs_write(VMCS_CR0_SHADOW, 0x60000010);
887 vmcs_write(VMCS_CR4_MASK, cr4_ones_mask | cr4_zeros_mask);
888 vmcs_write(VMCS_CR4_SHADOW, 0);
889
890 vmcs_clear(vmx->vmcs_pa[i]);
891
892 vmx->cap[i].set = cap_defaults;
893 vmx->cap[i].proc_ctls = proc_ctls;
894 vmx->cap[i].proc_ctls2 = proc2_ctls;
895 vmx->cap[i].exc_bitmap = exc_bitmap;
896
897 vmx->state[i].nextrip = ~0;
898 vmx->state[i].lastcpu = NOCPU;
899 vmx->state[i].vpid = vpid[i];
900 }
901
902 return (vmx);
903 }
904
905 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
906 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
907
908 #define INVVPID_TYPE_ADDRESS 0UL
909 #define INVVPID_TYPE_SINGLE_CONTEXT 1UL
910 #define INVVPID_TYPE_ALL_CONTEXTS 2UL
911
912 struct invvpid_desc {
913 uint16_t vpid;
914 uint16_t _res1;
915 uint32_t _res2;
916 uint64_t linear_addr;
917 };
918 CTASSERT(sizeof (struct invvpid_desc) == 16);
919
920 static __inline void
invvpid(uint64_t type, struct invvpid_desc desc)
922 {
923 int error;
924
925 DTRACE_PROBE3(vmx__invvpid, uint64_t, type, uint16_t, desc.vpid,
926 uint64_t, desc.linear_addr);
927
928 __asm __volatile("invvpid %[desc], %[type];"
929 VMX_SET_ERROR_CODE_ASM
930 : [error] "=r" (error)
931 : [desc] "m" (desc), [type] "r" (type)
932 : "memory");
933
934 if (error) {
935 panic("invvpid error %d", error);
936 }
937 }
938
939 /*
940 * Invalidate guest mappings identified by its VPID from the TLB.
941 *
942 * This is effectively a flush of the guest TLB, removing only "combined
943 * mappings" (to use the VMX parlance). Actions which modify the EPT structures
944 * for the instance (such as unmapping GPAs) would require an 'invept' flush.
945 */
946 static void
vmx_invvpid(struct vmx *vmx, int vcpu, int running)
948 {
949 struct vmxstate *vmxstate;
950 struct vmspace *vms;
951
952 vmxstate = &vmx->state[vcpu];
953 if (vmxstate->vpid == 0) {
954 return;
955 }
956
957 if (!running) {
958 /*
959 * Set the 'lastcpu' to an invalid host cpu.
960 *
961 * This will invalidate TLB entries tagged with the vcpu's
962 * vpid the next time it runs via vmx_set_pcpu_defaults().
963 */
964 vmxstate->lastcpu = NOCPU;
965 return;
966 }
967
968 /*
969 * Invalidate all mappings tagged with 'vpid'
970 *
971 * This is done when a vCPU moves between host CPUs, where there may be
972 * stale TLB entries for this VPID on the target, or if emulated actions
973 * in the guest CPU have incurred an explicit TLB flush.
974 */
975 vms = vm_get_vmspace(vmx->vm);
976 if (vmspace_table_gen(vms) == vmx->eptgen[curcpu]) {
977 struct invvpid_desc invvpid_desc = {
978 .vpid = vmxstate->vpid,
979 .linear_addr = 0,
980 ._res1 = 0,
981 ._res2 = 0,
982 };
983
984 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
985 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
986 } else {
987 /*
988 * The INVVPID can be skipped if an INVEPT is going to be
989 * performed before entering the guest. The INVEPT will
990 * invalidate combined mappings for the EP4TA associated with
991 * this guest, in all VPIDs.
992 */
993 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
994 }
995 }
996
997 static __inline void
invept(uint64_t type, uint64_t eptp)
999 {
1000 int error;
1001 struct invept_desc {
1002 uint64_t eptp;
1003 uint64_t _resv;
1004 } desc = { eptp, 0 };
1005
1006 DTRACE_PROBE2(vmx__invept, uint64_t, type, uint64_t, eptp);
1007
1008 __asm __volatile("invept %[desc], %[type];"
1009 VMX_SET_ERROR_CODE_ASM
1010 : [error] "=r" (error)
1011 : [desc] "m" (desc), [type] "r" (type)
1012 : "memory");
1013
1014 if (error != 0) {
		panic("invept error %d", error);
1016 }
1017 }
1018
1019 static void
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
1021 {
1022 struct vmxstate *vmxstate;
1023
1024 /*
1025 * Regardless of whether the VM appears to have migrated between CPUs,
1026 * save the host sysenter stack pointer. As it points to the kernel
1027 * stack of each thread, the correct value must be maintained for every
1028 * trip into the critical section.
1029 */
1030 vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR));
1031
1032 /*
1033 * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or
1034 * migration between host CPUs with differing TSC values.
1035 */
1036 vmx_apply_tsc_adjust(vmx, vcpu);
1037
1038 vmxstate = &vmx->state[vcpu];
1039 if (vmxstate->lastcpu == curcpu)
1040 return;
1041
1042 vmxstate->lastcpu = curcpu;
1043
1044 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1045
1046 /* Load the per-CPU IDT address */
1047 vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase());
1048 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1049 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1050 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1051 vmx_invvpid(vmx, vcpu, 1);
1052 }
1053
1054 static __inline bool
vmx_int_window_exiting(struct vmx *vmx, int vcpu)
1056 {
1057 return ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0);
1058 }
1059
1060 static __inline void
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1062 {
1063 if (!vmx_int_window_exiting(vmx, vcpu)) {
1064 /* Enable interrupt window exiting */
1065 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1066 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1067 }
1068 }
1069
1070 static __inline void
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1072 {
1073 /* Disable interrupt window exiting */
1074 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1075 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1076 }
1077
1078 static __inline bool
vmx_nmi_window_exiting(struct vmx *vmx, int vcpu)
1080 {
1081 return ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0);
1082 }
1083
1084 static __inline void
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1086 {
1087 if (!vmx_nmi_window_exiting(vmx, vcpu)) {
1088 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1089 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1090 }
1091 }
1092
1093 static __inline void
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1095 {
1096 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1097 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1098 }
1099
1100 /*
1101 * Set the TSC adjustment, taking into account the offsets measured between
1102 * host physical CPUs. This is required even if the guest has not set a TSC
 * offset, since a vCPU inherits the TSC offset of whatever physical CPU it has
1104 * migrated onto. Without this mitigation, un-synched host TSCs will convey
1105 * the appearance of TSC time-travel to the guest as its vCPUs migrate.
1106 */
1107 static void
vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu)
1109 {
1110 const uint64_t offset = vcpu_tsc_offset(vmx->vm, vcpu, true);
1111
1112 ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET);
1113
1114 if (vmx->tsc_offset_active[vcpu] != offset) {
1115 vmcs_write(VMCS_TSC_OFFSET, offset);
1116 vmx->tsc_offset_active[vcpu] = offset;
1117 }
1118 }
1119
1120 CTASSERT(VMCS_INTR_T_HWINTR == VM_INTINFO_HWINTR);
1121 CTASSERT(VMCS_INTR_T_NMI == VM_INTINFO_NMI);
1122 CTASSERT(VMCS_INTR_T_HWEXCEPTION == VM_INTINFO_HWEXCP);
1123 CTASSERT(VMCS_INTR_T_SWINTR == VM_INTINFO_SWINTR);
1124 CTASSERT(VMCS_INTR_T_PRIV_SWEXCEPTION == VM_INTINFO_RESV5);
1125 CTASSERT(VMCS_INTR_T_SWEXCEPTION == VM_INTINFO_RESV6);
1126 CTASSERT(VMCS_IDT_VEC_ERRCODE_VALID == VM_INTINFO_DEL_ERRCODE);
1127 CTASSERT(VMCS_INTR_T_MASK == VM_INTINFO_MASK_TYPE);
1128
1129 static uint64_t
vmx_idtvec_to_intinfo(uint32_t info, uint32_t errcode)
1131 {
1132 ASSERT(info & VMCS_IDT_VEC_VALID);
1133
1134 const uint32_t type = info & VMCS_INTR_T_MASK;
1135 const uint8_t vec = info & 0xff;
1136
1137 switch (type) {
1138 case VMCS_INTR_T_HWINTR:
1139 case VMCS_INTR_T_NMI:
1140 case VMCS_INTR_T_HWEXCEPTION:
1141 case VMCS_INTR_T_SWINTR:
1142 case VMCS_INTR_T_PRIV_SWEXCEPTION:
1143 case VMCS_INTR_T_SWEXCEPTION:
1144 break;
1145 default:
1146 panic("unexpected event type 0x%03x", type);
1147 }
1148
1149 uint64_t intinfo = VM_INTINFO_VALID | type | vec;
1150 if (info & VMCS_IDT_VEC_ERRCODE_VALID) {
1151 intinfo |= (uint64_t)errcode << 32;
1152 }
1153
1154 return (intinfo);
1155 }
1156
1157 CTASSERT(VMCS_INTR_DEL_ERRCODE == VMCS_IDT_VEC_ERRCODE_VALID);
1158 CTASSERT(VMCS_INTR_VALID == VMCS_IDT_VEC_VALID);
1159
1160 /*
1161 * Store VMX-specific event injection info for later handling. This depends on
1162 * the bhyve-internal event definitions matching those in the VMCS, as ensured
1163 * by the vmx_idtvec_to_intinfo() and the related CTASSERTs.
1164 */
1165 static void
vmx_stash_intinfo(struct vmx *vmx, int vcpu)
1167 {
1168 uint64_t info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1169 if ((info & VMCS_INTR_VALID) != 0) {
1170 uint32_t errcode = 0;
1171
1172 if ((info & VMCS_INTR_DEL_ERRCODE) != 0) {
1173 errcode = vmcs_read(VMCS_ENTRY_EXCEPTION_ERROR);
1174 }
1175
1176 VERIFY0(vm_exit_intinfo(vmx->vm, vcpu,
1177 vmx_idtvec_to_intinfo(info, errcode)));
1178
1179 vmcs_write(VMCS_ENTRY_INTR_INFO, 0);
1180 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, 0);
1181 }
1182 }
1183
1184 static void
vmx_inject_intinfo(uint64_t info)
1186 {
1187 ASSERT(VM_INTINFO_PENDING(info));
1188 ASSERT0(info & VM_INTINFO_MASK_RSVD);
1189
1190 /*
1191 * The bhyve format matches that of the VMCS, which is ensured by the
1192 * CTASSERTs above.
1193 */
1194 uint32_t inject = info;
1195 switch (VM_INTINFO_VECTOR(info)) {
1196 case IDT_BP:
1197 case IDT_OF:
1198 /*
1199 * VT-x requires #BP and #OF to be injected as software
1200 * exceptions.
1201 */
1202 inject &= ~VMCS_INTR_T_MASK;
1203 inject |= VMCS_INTR_T_SWEXCEPTION;
1204 break;
1205 default:
1206 break;
1207 }
1208
1209 if (VM_INTINFO_HAS_ERRCODE(info)) {
1210 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1211 VM_INTINFO_ERRCODE(info));
1212 }
1213 vmcs_write(VMCS_ENTRY_INTR_INFO, inject);
1214 }
1215
1216 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \
1217 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1218 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \
1219 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1220
1221 static void
vmx_inject_nmi(struct vmx *vmx, int vcpu)
1223 {
1224 ASSERT0(vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & NMI_BLOCKING);
1225 ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID);
1226
1227 /*
1228 * Inject the virtual NMI. The vector must be the NMI IDT entry
1229 * or the VMCS entry check will fail.
1230 */
1231 vmcs_write(VMCS_ENTRY_INTR_INFO,
1232 IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID);
1233
1234 /* Clear the request */
1235 vm_nmi_clear(vmx->vm, vcpu);
1236 }
1237
1238 /*
1239 * Inject exceptions, NMIs, and ExtINTs.
1240 *
1241 * The logic behind these are complicated and may involve mutex contention, so
1242 * the injection is performed without the protection of host CPU interrupts
1243 * being disabled. This means a racing notification could be "lost",
1244 * necessitating a later call to vmx_inject_recheck() to close that window
1245 * of opportunity.
1246 */
1247 static enum event_inject_state
vmx_inject_events(struct vmx *vmx, int vcpu, uint64_t rip)
1249 {
1250 uint64_t entryinfo;
1251 uint32_t gi, info;
1252 int vector;
1253 enum event_inject_state state;
1254
1255 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1256 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1257 state = EIS_CAN_INJECT;
1258
1259 /* Clear any interrupt blocking if the guest %rip has changed */
1260 if (vmx->state[vcpu].nextrip != rip && (gi & HWINTR_BLOCKING) != 0) {
1261 gi &= ~HWINTR_BLOCKING;
1262 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1263 }
1264
1265 /*
1266 * It could be that an interrupt is already pending for injection from
1267 * the VMCS. This would be the case if the vCPU exited for conditions
1268 * such as an AST before a vm-entry delivered the injection.
1269 */
1270 if ((info & VMCS_INTR_VALID) != 0) {
1271 return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1272 }
1273
1274 if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
1275 vmx_inject_intinfo(entryinfo);
1276 state = EIS_EV_INJECTED;
1277 }
1278
1279 if (vm_nmi_pending(vmx->vm, vcpu)) {
1280 /*
1281 * If there are no conditions blocking NMI injection then inject
1282 * it directly here otherwise enable "NMI window exiting" to
1283 * inject it as soon as we can.
1284 *
1285 * According to the Intel manual, some CPUs do not allow NMI
1286 * injection when STI_BLOCKING is active. That check is
1287 * enforced here, regardless of CPU capability. If running on a
1288 * CPU without such a restriction it will immediately exit and
1289 * the NMI will be injected in the "NMI window exiting" handler.
1290 */
1291 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1292 if (state == EIS_CAN_INJECT) {
1293 vmx_inject_nmi(vmx, vcpu);
1294 state = EIS_EV_INJECTED;
1295 } else {
1296 return (state | EIS_REQ_EXIT);
1297 }
1298 } else {
1299 vmx_set_nmi_window_exiting(vmx, vcpu);
1300 }
1301 }
1302
1303 if (vm_extint_pending(vmx->vm, vcpu)) {
1304 if (state != EIS_CAN_INJECT) {
1305 return (state | EIS_REQ_EXIT);
1306 }
1307 if ((gi & HWINTR_BLOCKING) != 0 ||
1308 (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) {
1309 return (EIS_GI_BLOCK);
1310 }
1311
1312 /* Ask the legacy pic for a vector to inject */
1313 vatpic_pending_intr(vmx->vm, &vector);
1314
1315 /*
1316 * From the Intel SDM, Volume 3, Section "Maskable
1317 * Hardware Interrupts":
1318 * - maskable interrupt vectors [0,255] can be delivered
1319 * through the INTR pin.
1320 */
1321 KASSERT(vector >= 0 && vector <= 255,
1322 ("invalid vector %d from INTR", vector));
1323
1324 /* Inject the interrupt */
1325 vmcs_write(VMCS_ENTRY_INTR_INFO,
1326 VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector);
1327
1328 vm_extint_clear(vmx->vm, vcpu);
1329 vatpic_intr_accepted(vmx->vm, vector);
1330 state = EIS_EV_INJECTED;
1331 }
1332
1333 return (state);
1334 }
1335
1336 /*
1337 * Inject any interrupts pending on the vLAPIC.
1338 *
1339 * This is done with host CPU interrupts disabled so notification IPIs, either
1340 * from the standard vCPU notification or APICv posted interrupts, will be
1341 * queued on the host APIC and recognized when entering VMX context.
1342 */
1343 static enum event_inject_state
vmx_inject_vlapic(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1345 {
1346 int vector;
1347
1348 if (!vlapic_pending_intr(vlapic, &vector)) {
1349 return (EIS_CAN_INJECT);
1350 }
1351
1352 /*
1353 * From the Intel SDM, Volume 3, Section "Maskable
1354 * Hardware Interrupts":
1355 * - maskable interrupt vectors [16,255] can be delivered
1356 * through the local APIC.
1357 */
1358 KASSERT(vector >= 16 && vector <= 255,
1359 ("invalid vector %d from local APIC", vector));
1360
1361 if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
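		/*
		 * The 16-bit guest interrupt status holds RVI (requesting
		 * virtual interrupt) in its low byte and SVI (servicing
		 * virtual interrupt) in its high byte; only RVI is updated
		 * here.
		 */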
1362 uint16_t status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
1363 uint16_t status_new = (status_old & 0xff00) | vector;
1364
1365 /*
1366 * The APICv state will have been synced into the vLAPIC
1367 * as part of vlapic_pending_intr(). Prepare the VMCS
1368 * for the to-be-injected pending interrupt.
1369 */
1370 if (status_new > status_old) {
1371 vmcs_write(VMCS_GUEST_INTR_STATUS, status_new);
1372 }
1373
1374 /*
1375 * Ensure VMCS state regarding EOI traps is kept in sync
1376 * with the TMRs in the vlapic.
1377 */
1378 vmx_apicv_sync_tmr(vlapic);
1379
1380 /*
1381 * The rest of the injection process for injecting the
1382 * interrupt(s) is handled by APICv. It does not preclude other
1383 * event injection from occurring.
1384 */
1385 return (EIS_CAN_INJECT);
1386 }
1387
1388 ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID);
1389
	/* Does guest interruptibility block injection? */
1391 if ((vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & HWINTR_BLOCKING) != 0 ||
1392 (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) {
1393 return (EIS_GI_BLOCK);
1394 }
1395
1396 /* Inject the interrupt */
1397 vmcs_write(VMCS_ENTRY_INTR_INFO,
1398 VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector);
1399
1400 /* Update the Local APIC ISR */
1401 vlapic_intr_accepted(vlapic, vector);
1402
1403 return (EIS_EV_INJECTED);
1404 }
1405
1406 /*
1407 * Re-check for events to be injected.
1408 *
1409 * Once host CPU interrupts are disabled, check for the presence of any events
1410 * which require injection processing. If an exit is required upon injection,
 * or once the guest becomes interruptible, that will be configured too.
1412 */
1413 static bool
vmx_inject_recheck(struct vmx *vmx, int vcpu, enum event_inject_state state)
1415 {
1416 if (state == EIS_CAN_INJECT) {
1417 if (vm_nmi_pending(vmx->vm, vcpu) &&
1418 !vmx_nmi_window_exiting(vmx, vcpu)) {
1419 /* queued NMI not blocked by NMI-window-exiting */
1420 return (true);
1421 }
1422 if (vm_extint_pending(vmx->vm, vcpu)) {
1423 /* queued ExtINT not blocked by existing injection */
1424 return (true);
1425 }
1426 } else {
1427 if ((state & EIS_REQ_EXIT) != 0) {
1428 /*
1429 * Use a self-IPI to force an immediate exit after
1430 * event injection has occurred.
1431 */
1432 poke_cpu(CPU->cpu_id);
1433 } else {
1434 /*
1435 * If any event is being injected, an exit immediately
			 * upon becoming interruptible again will allow pending
1437 * or newly queued events to be injected in a timely
1438 * manner.
1439 */
1440 vmx_set_int_window_exiting(vmx, vcpu);
1441 }
1442 }
1443 return (false);
1444 }
1445
1446 /*
1447 * If the Virtual NMIs execution control is '1' then the logical processor
1448 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1449 * the VMCS. An IRET instruction in VMX non-root operation will remove any
1450 * virtual-NMI blocking.
1451 *
1452 * This unblocking occurs even if the IRET causes a fault. In this case the
1453 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1454 */
1455 static void
vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1457 {
1458 uint32_t gi;
1459
1460 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1461 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1462 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1463 }
1464
1465 static void
vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1467 {
1468 uint32_t gi;
1469
1470 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1471 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1472 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1473 }
1474
1475 static void
vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
1477 {
1478 uint32_t gi;
1479
1480 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1481 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
1482 ("NMI blocking is not in effect %x", gi));
1483 }
1484
1485 static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1487 {
1488 struct vmxctx *vmxctx;
1489 uint64_t xcrval;
1490 const struct xsave_limits *limits;
1491
1492 vmxctx = &vmx->ctx[vcpu];
1493 limits = vmm_get_xsave_limits();
1494
1495 /*
1496 * Note that the processor raises a GP# fault on its own if
1497 * xsetbv is executed for CPL != 0, so we do not have to
1498 * emulate that fault here.
1499 */
1500
1501 /* Only xcr0 is supported. */
1502 if (vmxctx->guest_rcx != 0) {
1503 vm_inject_gp(vmx->vm, vcpu);
1504 return (HANDLED);
1505 }
1506
1507 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1508 if (!limits->xsave_enabled ||
1509 !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1510 vm_inject_ud(vmx->vm, vcpu);
1511 return (HANDLED);
1512 }
1513
1514 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1515 if ((xcrval & ~limits->xcr0_allowed) != 0) {
1516 vm_inject_gp(vmx->vm, vcpu);
1517 return (HANDLED);
1518 }
1519
1520 if (!(xcrval & XFEATURE_ENABLED_X87)) {
1521 vm_inject_gp(vmx->vm, vcpu);
1522 return (HANDLED);
1523 }
1524
1525 /* AVX (YMM_Hi128) requires SSE. */
1526 if (xcrval & XFEATURE_ENABLED_AVX &&
1527 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1528 vm_inject_gp(vmx->vm, vcpu);
1529 return (HANDLED);
1530 }
1531
1532 /*
1533 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1534 * ZMM_Hi256, and Hi16_ZMM.
1535 */
1536 if (xcrval & XFEATURE_AVX512 &&
1537 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1538 (XFEATURE_AVX512 | XFEATURE_AVX)) {
1539 vm_inject_gp(vmx->vm, vcpu);
1540 return (HANDLED);
1541 }
1542
1543 /*
1544 * Intel MPX requires both bound register state flags to be
1545 * set.
1546 */
1547 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1548 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1549 vm_inject_gp(vmx->vm, vcpu);
1550 return (HANDLED);
1551 }
1552
1553 /*
1554 * This runs "inside" vmrun() with the guest's FPU state, so
1555 * modifying xcr0 directly modifies the guest's xcr0, not the
1556 * host's.
1557 */
1558 load_xcr(0, xcrval);
1559 return (HANDLED);
1560 }
1561
1562 static uint64_t
vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
1564 {
1565 const struct vmxctx *vmxctx;
1566
1567 vmxctx = &vmx->ctx[vcpu];
1568
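	/*
	 * The 'ident' value is the general-purpose register number as encoded
	 * by the hardware (0 = %rax ... 15 = %r15).  Note that %rsp (4) is
	 * kept in the VMCS rather than in the vmxctx register cache.
	 */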
1569 switch (ident) {
1570 case 0:
1571 return (vmxctx->guest_rax);
1572 case 1:
1573 return (vmxctx->guest_rcx);
1574 case 2:
1575 return (vmxctx->guest_rdx);
1576 case 3:
1577 return (vmxctx->guest_rbx);
1578 case 4:
1579 return (vmcs_read(VMCS_GUEST_RSP));
1580 case 5:
1581 return (vmxctx->guest_rbp);
1582 case 6:
1583 return (vmxctx->guest_rsi);
1584 case 7:
1585 return (vmxctx->guest_rdi);
1586 case 8:
1587 return (vmxctx->guest_r8);
1588 case 9:
1589 return (vmxctx->guest_r9);
1590 case 10:
1591 return (vmxctx->guest_r10);
1592 case 11:
1593 return (vmxctx->guest_r11);
1594 case 12:
1595 return (vmxctx->guest_r12);
1596 case 13:
1597 return (vmxctx->guest_r13);
1598 case 14:
1599 return (vmxctx->guest_r14);
1600 case 15:
1601 return (vmxctx->guest_r15);
1602 default:
1603 panic("invalid vmx register %d", ident);
1604 }
1605 }
1606
1607 static void
vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
1609 {
1610 struct vmxctx *vmxctx;
1611
1612 vmxctx = &vmx->ctx[vcpu];
1613
1614 switch (ident) {
1615 case 0:
1616 vmxctx->guest_rax = regval;
1617 break;
1618 case 1:
1619 vmxctx->guest_rcx = regval;
1620 break;
1621 case 2:
1622 vmxctx->guest_rdx = regval;
1623 break;
1624 case 3:
1625 vmxctx->guest_rbx = regval;
1626 break;
1627 case 4:
1628 vmcs_write(VMCS_GUEST_RSP, regval);
1629 break;
1630 case 5:
1631 vmxctx->guest_rbp = regval;
1632 break;
1633 case 6:
1634 vmxctx->guest_rsi = regval;
1635 break;
1636 case 7:
1637 vmxctx->guest_rdi = regval;
1638 break;
1639 case 8:
1640 vmxctx->guest_r8 = regval;
1641 break;
1642 case 9:
1643 vmxctx->guest_r9 = regval;
1644 break;
1645 case 10:
1646 vmxctx->guest_r10 = regval;
1647 break;
1648 case 11:
1649 vmxctx->guest_r11 = regval;
1650 break;
1651 case 12:
1652 vmxctx->guest_r12 = regval;
1653 break;
1654 case 13:
1655 vmxctx->guest_r13 = regval;
1656 break;
1657 case 14:
1658 vmxctx->guest_r14 = regval;
1659 break;
1660 case 15:
1661 vmxctx->guest_r15 = regval;
1662 break;
1663 default:
1664 panic("invalid vmx register %d", ident);
1665 }
1666 }
1667
1668 static void
vmx_sync_efer_state(struct vmx *vmx, int vcpu, uint64_t efer)
1670 {
1671 uint64_t ctrl;
1672
1673 /*
1674 * If the "load EFER" VM-entry control is 1 (which we require) then the
1675 * value of EFER.LMA must be identical to "IA-32e mode guest" bit in the
1676 * VM-entry control.
1677 */
1678 ctrl = vmcs_read(VMCS_ENTRY_CTLS);
1679 if ((efer & EFER_LMA) != 0) {
1680 ctrl |= VM_ENTRY_GUEST_LMA;
1681 } else {
1682 ctrl &= ~VM_ENTRY_GUEST_LMA;
1683 }
1684 vmcs_write(VMCS_ENTRY_CTLS, ctrl);
1685 }
1686
1687 static int
vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1689 {
1690 uint64_t crval, regval;
1691
1692 /* We only handle mov to %cr0 at this time */
1693 if ((exitqual & 0xf0) != 0x00)
1694 return (UNHANDLED);
1695
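	/*
	 * Bits 5:4 of the exit qualification encode the access type (0 being
	 * a MOV to CR) and bits 11:8 identify the source register.
	 */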
1696 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1697
1698 vmcs_write(VMCS_CR0_SHADOW, regval);
1699
1700 crval = regval | cr0_ones_mask;
1701 crval &= ~cr0_zeros_mask;
1702
1703 const uint64_t old = vmcs_read(VMCS_GUEST_CR0);
1704 const uint64_t diff = crval ^ old;
1705 /* Flush the TLB if the paging or write-protect bits are changing */
1706 if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
1707 vmx_invvpid(vmx, vcpu, 1);
1708 }
1709
1710 vmcs_write(VMCS_GUEST_CR0, crval);
1711
1712 if (regval & CR0_PG) {
1713 uint64_t efer;
1714
1715 /* Keep EFER.LMA properly updated if paging is enabled */
1716 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1717 if (efer & EFER_LME) {
1718 efer |= EFER_LMA;
1719 vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1720 vmx_sync_efer_state(vmx, vcpu, efer);
1721 }
1722 }
1723
1724 return (HANDLED);
1725 }
1726
1727 static int
vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1729 {
1730 uint64_t crval, regval;
1731
1732 /* We only handle mov to %cr4 at this time */
1733 if ((exitqual & 0xf0) != 0x00)
1734 return (UNHANDLED);
1735
1736 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1737
1738 vmcs_write(VMCS_CR4_SHADOW, regval);
1739
1740 crval = regval | cr4_ones_mask;
1741 crval &= ~cr4_zeros_mask;
1742 vmcs_write(VMCS_GUEST_CR4, crval);
1743
1744 return (HANDLED);
1745 }
1746
1747 static int
vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1749 {
1750 struct vlapic *vlapic;
1751 uint64_t cr8;
1752 int regnum;
1753
1754 /* We only handle mov %cr8 to/from a register at this time. */
1755 if ((exitqual & 0xe0) != 0x00) {
1756 return (UNHANDLED);
1757 }
1758
1759 vlapic = vm_lapic(vmx->vm, vcpu);
1760 regnum = (exitqual >> 8) & 0xf;
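	/*
	 * Access type 1 (bit 4 set) is a MOV from %cr8 (a guest read), while
	 * access type 0 is a MOV to %cr8 (a guest write).
	 */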
1761 if (exitqual & 0x10) {
1762 cr8 = vlapic_get_cr8(vlapic);
1763 vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
1764 } else {
1765 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
1766 vlapic_set_cr8(vlapic, cr8);
1767 }
1768
1769 return (HANDLED);
1770 }
1771
1772 /*
1773 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1774 */
1775 static int
vmx_cpl(void)
1777 {
1778 uint32_t ssar;
1779
1780 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
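	/* The DPL occupies bits 6:5 of the segment access-rights field */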
1781 return ((ssar >> 5) & 0x3);
1782 }
1783
1784 static enum vm_cpu_mode
vmx_cpu_mode(void)
1786 {
1787 uint32_t csar;
1788
1789 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
1790 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
1791 if (csar & 0x2000)
1792 return (CPU_MODE_64BIT); /* CS.L = 1 */
1793 else
1794 return (CPU_MODE_COMPATIBILITY);
1795 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
1796 return (CPU_MODE_PROTECTED);
1797 } else {
1798 return (CPU_MODE_REAL);
1799 }
1800 }
1801
1802 static enum vm_paging_mode
vmx_paging_mode(void)
1804 {
1805
1806 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1807 return (PAGING_MODE_FLAT);
1808 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1809 return (PAGING_MODE_32);
1810 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1811 return (PAGING_MODE_64);
1812 else
1813 return (PAGING_MODE_PAE);
1814 }
1815
1816 static void
vmx_paging_info(struct vm_guest_paging *paging)
1818 {
1819 paging->cr3 = vmcs_read(VMCS_GUEST_CR3);
1820 paging->cpl = vmx_cpl();
1821 paging->cpu_mode = vmx_cpu_mode();
1822 paging->paging_mode = vmx_paging_mode();
1823 }
1824
1825 static void
vmexit_mmio_emul(struct vm_exit *vmexit, struct vie *vie, uint64_t gpa,
    uint64_t gla)
1828 {
1829 struct vm_guest_paging paging;
1830 uint32_t csar;
1831
1832 vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
1833 vmexit->inst_length = 0;
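	/*
	 * No instruction length is recorded here since %rip is expected to be
	 * advanced later as part of the MMIO instruction emulation.
	 */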
1834 vmexit->u.mmio_emul.gpa = gpa;
1835 vmexit->u.mmio_emul.gla = gla;
1836 vmx_paging_info(&paging);
1837
1838 switch (paging.cpu_mode) {
1839 case CPU_MODE_REAL:
1840 vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
1841 vmexit->u.mmio_emul.cs_d = 0;
1842 break;
1843 case CPU_MODE_PROTECTED:
1844 case CPU_MODE_COMPATIBILITY:
1845 vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
1846 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
1847 vmexit->u.mmio_emul.cs_d = SEG_DESC_DEF32(csar);
1848 break;
1849 default:
1850 vmexit->u.mmio_emul.cs_base = 0;
1851 vmexit->u.mmio_emul.cs_d = 0;
1852 break;
1853 }
1854
1855 vie_init_mmio(vie, NULL, 0, &paging, gpa);
1856 }
1857
1858 static void
1859 vmexit_inout(struct vm_exit *vmexit, struct vie *vie, uint64_t qual,
1860 uint32_t eax)
1861 {
1862 struct vm_guest_paging paging;
1863 struct vm_inout *inout;
1864
1865 inout = &vmexit->u.inout;
1866
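	/*
	 * Decode the exit qualification for I/O instructions: bits 2:0 hold
	 * the access size minus one, bit 3 the direction (1 = in), bit 4 the
	 * string-instruction flag, bit 5 the REP prefix, and bits 31:16 the
	 * port number.
	 */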
1867 inout->bytes = (qual & 0x7) + 1;
1868 inout->flags = 0;
1869 inout->flags |= (qual & 0x8) ? INOUT_IN : 0;
1870 inout->flags |= (qual & 0x10) ? INOUT_STR : 0;
1871 inout->flags |= (qual & 0x20) ? INOUT_REP : 0;
1872 inout->port = (uint16_t)(qual >> 16);
1873 inout->eax = eax;
1874 if (inout->flags & INOUT_STR) {
1875 uint64_t inst_info;
1876
1877 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
1878
1879 /*
1880 * According to the SDM, bits 9:7 encode the address size of the
1881 * ins/outs operation, but only values 0/1/2 are expected,
1882 * corresponding to 16/32/64 bit sizes.
1883 */
1884 inout->addrsize = 2 << BITX(inst_info, 9, 7);
1885 VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
1886 inout->addrsize == 8);
1887
1888 if (inout->flags & INOUT_IN) {
1889 /*
1890 * The bits describing the segment in INSTRUCTION_INFO
1891 * are not defined for ins, leaving it to system
1892 * software to assume %es (encoded as 0)
1893 */
1894 inout->segment = 0;
1895 } else {
1896 /*
1897 * Bits 15-17 encode the segment for OUTS.
1898 * This value follows the standard x86 segment order.
1899 */
1900 inout->segment = (inst_info >> 15) & 0x7;
1901 }
1902 }
1903
1904 vmexit->exitcode = VM_EXITCODE_INOUT;
1905 vmx_paging_info(&paging);
1906 vie_init_inout(vie, inout, vmexit->inst_length, &paging);
1907
1908 /* The in/out emulation will handle advancing %rip */
1909 vmexit->inst_length = 0;
1910 }
1911
1912 static int
1913 ept_fault_type(uint64_t ept_qual)
1914 {
1915 int fault_type;
1916
1917 if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1918 fault_type = PROT_WRITE;
1919 else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1920 fault_type = PROT_EXEC;
1921 else
1922 fault_type = PROT_READ;
1923
1924 return (fault_type);
1925 }
1926
1927 static bool
1928 ept_emulation_fault(uint64_t ept_qual)
1929 {
1930 int read, write;
1931
1932 /* EPT fault on an instruction fetch doesn't make sense here */
1933 if (ept_qual & EPT_VIOLATION_INST_FETCH)
1934 return (false);
1935
1936 /* EPT fault must be a read fault or a write fault */
1937 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1938 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1939 if ((read | write) == 0)
1940 return (false);
1941
1942 /*
1943 * The EPT violation must have been caused by accessing a
1944 * guest-physical address that is a translation of a guest-linear
1945 * address.
1946 */
1947 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1948 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1949 return (false);
1950 }
1951
1952 return (true);
1953 }
1954
1955 static __inline int
1956 apic_access_virtualization(struct vmx *vmx, int vcpuid)
1957 {
1958 uint32_t proc_ctls2;
1959
1960 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1961 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
1962 }
1963
1964 static __inline int
1965 x2apic_virtualization(struct vmx *vmx, int vcpuid)
1966 {
1967 uint32_t proc_ctls2;
1968
1969 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1970 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
1971 }
1972
1973 static int
1974 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
1975 uint64_t qual)
1976 {
1977 const uint_t offset = APIC_WRITE_OFFSET(qual);
1978
1979 if (!apic_access_virtualization(vmx, vcpuid)) {
1980 /*
1981 * In general there should not be any APIC write VM-exits
1982 * unless APIC-access virtualization is enabled.
1983 *
1984 * However self-IPI virtualization can legitimately trigger
1985 * an APIC-write VM-exit so treat it specially.
1986 */
1987 if (x2apic_virtualization(vmx, vcpuid) &&
1988 offset == APIC_OFFSET_SELF_IPI) {
1989 const uint32_t *apic_regs =
1990 (uint32_t *)(vlapic->apic_page);
1991 const uint32_t vector =
1992 apic_regs[APIC_OFFSET_SELF_IPI / 4];
1993
1994 vlapic_self_ipi_handler(vlapic, vector);
1995 return (HANDLED);
1996 } else
1997 return (UNHANDLED);
1998 }
1999
2000 switch (offset) {
2001 case APIC_OFFSET_ID:
2002 vlapic_id_write_handler(vlapic);
2003 break;
2004 case APIC_OFFSET_LDR:
2005 vlapic_ldr_write_handler(vlapic);
2006 break;
2007 case APIC_OFFSET_DFR:
2008 vlapic_dfr_write_handler(vlapic);
2009 break;
2010 case APIC_OFFSET_SVR:
2011 vlapic_svr_write_handler(vlapic);
2012 break;
2013 case APIC_OFFSET_ESR:
2014 vlapic_esr_write_handler(vlapic);
2015 break;
2016 case APIC_OFFSET_ICR_LOW:
2017 vlapic_icrlo_write_handler(vlapic);
2018 break;
2019 case APIC_OFFSET_CMCI_LVT:
2020 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
2021 vlapic_lvt_write_handler(vlapic, offset);
2022 break;
2023 case APIC_OFFSET_TIMER_ICR:
2024 vlapic_icrtmr_write_handler(vlapic);
2025 break;
2026 case APIC_OFFSET_TIMER_DCR:
2027 vlapic_dcr_write_handler(vlapic);
2028 break;
2029 default:
2030 return (UNHANDLED);
2031 }
2032 return (HANDLED);
2033 }
2034
2035 static bool
2036 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
2037 {
2038
2039 if (apic_access_virtualization(vmx, vcpuid) &&
2040 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
2041 return (true);
2042 else
2043 return (false);
2044 }
2045
2046 static int
2047 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2048 {
2049 uint64_t qual;
2050 int access_type, offset, allowed;
2051 struct vie *vie;
2052
2053 if (!apic_access_virtualization(vmx, vcpuid))
2054 return (UNHANDLED);
2055
2056 qual = vmexit->u.vmx.exit_qualification;
2057 access_type = APIC_ACCESS_TYPE(qual);
2058 offset = APIC_ACCESS_OFFSET(qual);
2059
2060 allowed = 0;
2061 if (access_type == 0) {
2062 /*
2063 * Read data access to the following registers is expected.
2064 */
2065 switch (offset) {
2066 case APIC_OFFSET_APR:
2067 case APIC_OFFSET_PPR:
2068 case APIC_OFFSET_RRR:
2069 case APIC_OFFSET_CMCI_LVT:
2070 case APIC_OFFSET_TIMER_CCR:
2071 allowed = 1;
2072 break;
2073 default:
2074 break;
2075 }
2076 } else if (access_type == 1) {
2077 /*
2078 * Write data access to the following registers is expected.
2079 */
2080 switch (offset) {
2081 case APIC_OFFSET_VER:
2082 case APIC_OFFSET_APR:
2083 case APIC_OFFSET_PPR:
2084 case APIC_OFFSET_RRR:
2085 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
2086 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
2087 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
2088 case APIC_OFFSET_CMCI_LVT:
2089 case APIC_OFFSET_TIMER_CCR:
2090 allowed = 1;
2091 break;
2092 default:
2093 break;
2094 }
2095 }
2096
2097 if (allowed) {
2098 vie = vm_vie_ctx(vmx->vm, vcpuid);
2099 vmexit_mmio_emul(vmexit, vie, DEFAULT_APIC_BASE + offset,
2100 VIE_INVALID_GLA);
2101 }
2102
2103 /*
2104 * Regardless of whether the APIC-access is allowed this handler
2105 * always returns UNHANDLED:
2106 * - if the access is allowed then it is handled by emulating the
2107 * instruction that caused the VM-exit (outside the critical section)
2108 * - if the access is not allowed then it will be converted to an
2109 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
2110 */
2111 return (UNHANDLED);
2112 }
2113
2114 static enum task_switch_reason
2115 vmx_task_switch_reason(uint64_t qual)
2116 {
2117 int reason;
2118
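	/* Bits 31:30 of the exit qualification encode the task-switch source */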
2119 reason = (qual >> 30) & 0x3;
2120 switch (reason) {
2121 case 0:
2122 return (TSR_CALL);
2123 case 1:
2124 return (TSR_IRET);
2125 case 2:
2126 return (TSR_JMP);
2127 case 3:
2128 return (TSR_IDT_GATE);
2129 default:
2130 panic("%s: invalid reason %d", __func__, reason);
2131 }
2132 }
2133
2134 static int
2135 vmx_handle_msr(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit,
2136 bool is_wrmsr)
2137 {
2138 struct vmxctx *vmxctx = &vmx->ctx[vcpuid];
2139 const uint32_t ecx = vmxctx->guest_rcx;
2140 vm_msr_result_t res;
2141 uint64_t val = 0;
2142
2143 if (is_wrmsr) {
2144 vmm_stat_incr(vmx->vm, vcpuid, VMEXIT_WRMSR, 1);
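		/* The value being written arrives split across %edx:%eax */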
2145 val = vmxctx->guest_rdx << 32 | (uint32_t)vmxctx->guest_rax;
2146
2147 if (vlapic_owned_msr(ecx)) {
2148 struct vlapic *vlapic = vm_lapic(vmx->vm, vcpuid);
2149
2150 res = vlapic_wrmsr(vlapic, ecx, val);
2151 } else {
2152 res = vmx_wrmsr(vmx, vcpuid, ecx, val);
2153 }
2154 } else {
2155 vmm_stat_incr(vmx->vm, vcpuid, VMEXIT_RDMSR, 1);
2156
2157 if (vlapic_owned_msr(ecx)) {
2158 struct vlapic *vlapic = vm_lapic(vmx->vm, vcpuid);
2159
2160 res = vlapic_rdmsr(vlapic, ecx, &val);
2161 } else {
2162 res = vmx_rdmsr(vmx, vcpuid, ecx, &val);
2163 }
2164 }
2165
2166 switch (res) {
2167 case VMR_OK:
2168 /* Store rdmsr result in the appropriate registers */
2169 if (!is_wrmsr) {
2170 vmxctx->guest_rax = (uint32_t)val;
2171 vmxctx->guest_rdx = val >> 32;
2172 }
2173 return (HANDLED);
2174 case VMR_GP:
2175 vm_inject_gp(vmx->vm, vcpuid);
2176 return (HANDLED);
2177 case VMR_UNHANLDED:
2178 vmexit->exitcode = is_wrmsr ?
2179 VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
2180 vmexit->u.msr.code = ecx;
2181 vmexit->u.msr.wval = val;
2182 return (UNHANDLED);
2183 default:
2184 panic("unexpected msr result %u\n", res);
2185 }
2186 }
2187
2188 static int
2189 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2190 {
2191 int error, errcode, errcode_valid, handled;
2192 struct vmxctx *vmxctx;
2193 struct vie *vie;
2194 struct vlapic *vlapic;
2195 struct vm_task_switch *ts;
2196 uint32_t idtvec_info, intr_info;
2197 uint32_t intr_type, intr_vec, reason;
2198 uint64_t qual, gpa;
2199
2200 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
2201 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
2202
2203 handled = UNHANDLED;
2204 vmxctx = &vmx->ctx[vcpu];
2205
2206 qual = vmexit->u.vmx.exit_qualification;
2207 reason = vmexit->u.vmx.exit_reason;
2208 vmexit->exitcode = VM_EXITCODE_BOGUS;
2209
2210 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
2211 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit);
2212
2213 /*
2214 * VM-entry failures during or after loading guest state.
2215 *
2216 * These VM-exits are uncommon but must be handled specially
2217 * as most VM-exit fields are not populated as usual.
2218 */
2219 if (reason == EXIT_REASON_MCE_DURING_ENTRY) {
2220 vmm_call_trap(T_MCE);
2221 return (1);
2222 }
2223
2224 /*
2225 * VM exits that can be triggered during event delivery need to
2226 * be handled specially by re-injecting the event if the IDT
2227 * vectoring information field's valid bit is set.
2228 *
2229 * See "Information for VM Exits During Event Delivery" in Intel SDM
2230 * for details.
2231 */
2232 idtvec_info = vmcs_read(VMCS_IDT_VECTORING_INFO);
2233 if (idtvec_info & VMCS_IDT_VEC_VALID) {
2234 uint32_t errcode = 0;
2235 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2236 errcode = vmcs_read(VMCS_IDT_VECTORING_ERROR);
2237 }
2238
2239 /* Record exit intinfo */
2240 VERIFY0(vm_exit_intinfo(vmx->vm, vcpu,
2241 vmx_idtvec_to_intinfo(idtvec_info, errcode)));
2242
2243 /*
2244 * If 'virtual NMIs' are being used and the VM-exit
2245 * happened while injecting an NMI during the previous
2246 * VM-entry, then clear "blocking by NMI" in the
2247 * Guest Interruptibility-State so the NMI can be
2248 * reinjected on the subsequent VM-entry.
2249 *
2250 * However, if the NMI was being delivered through a task
2251 * gate, then the new task must start execution with NMIs
2252 * blocked so don't clear NMI blocking in this case.
2253 */
2254 intr_type = idtvec_info & VMCS_INTR_T_MASK;
2255 if (intr_type == VMCS_INTR_T_NMI) {
2256 if (reason != EXIT_REASON_TASK_SWITCH)
2257 vmx_clear_nmi_blocking(vmx, vcpu);
2258 else
2259 vmx_assert_nmi_blocking(vmx, vcpu);
2260 }
2261
2262 /*
2263 * Update VM-entry instruction length if the event being
2264 * delivered was a software interrupt or software exception.
2265 */
2266 if (intr_type == VMCS_INTR_T_SWINTR ||
2267 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
2268 intr_type == VMCS_INTR_T_SWEXCEPTION) {
2269 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2270 }
2271 }
2272
2273 switch (reason) {
2274 case EXIT_REASON_TRIPLE_FAULT:
2275 (void) vm_suspend(vmx->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
2276 handled = HANDLED;
2277 break;
2278 case EXIT_REASON_TASK_SWITCH:
2279 ts = &vmexit->u.task_switch;
2280 ts->tsssel = qual & 0xffff;
2281 ts->reason = vmx_task_switch_reason(qual);
2282 ts->ext = 0;
2283 ts->errcode_valid = 0;
2284 vmx_paging_info(&ts->paging);
2285 /*
2286 * If the task switch was due to a CALL, JMP, IRET, software
2287 * interrupt (INT n) or software exception (INT3, INTO),
2288 * then the saved %rip references the instruction that caused
2289 * the task switch. The instruction length field in the VMCS
2290 * is valid in this case.
2291 *
2292 * In all other cases (e.g., NMI, hardware exception) the
2293 * saved %rip is one that would have been saved in the old TSS
2294 * had the task switch completed normally so the instruction
2295 * length field is not needed in this case and is explicitly
2296 * set to 0.
2297 */
2298 if (ts->reason == TSR_IDT_GATE) {
2299 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
2300 ("invalid idtvec_info %x for IDT task switch",
2301 idtvec_info));
2302 intr_type = idtvec_info & VMCS_INTR_T_MASK;
2303 if (intr_type != VMCS_INTR_T_SWINTR &&
2304 intr_type != VMCS_INTR_T_SWEXCEPTION &&
2305 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
2306 /* Task switch triggered by external event */
2307 ts->ext = 1;
2308 vmexit->inst_length = 0;
2309 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2310 ts->errcode_valid = 1;
2311 ts->errcode =
2312 vmcs_read(VMCS_IDT_VECTORING_ERROR);
2313 }
2314 }
2315 }
2316 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
2317 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts);
2318 break;
2319 case EXIT_REASON_CR_ACCESS:
2320 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
2321 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual);
2322 switch (qual & 0xf) {
2323 case 0:
2324 handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
2325 break;
2326 case 4:
2327 handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
2328 break;
2329 case 8:
2330 handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
2331 break;
2332 }
2333 break;
2334 case EXIT_REASON_RDMSR:
2335 case EXIT_REASON_WRMSR:
2336 handled = vmx_handle_msr(vmx, vcpu, vmexit,
2337 reason == EXIT_REASON_WRMSR);
2338 break;
2339 case EXIT_REASON_HLT:
2340 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
2341 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
2342 vmexit->exitcode = VM_EXITCODE_HLT;
2343 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2344 break;
2345 case EXIT_REASON_MTF:
2346 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
2347 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit);
2348 vmexit->exitcode = VM_EXITCODE_MTRAP;
2349 vmexit->inst_length = 0;
2350 break;
2351 case EXIT_REASON_PAUSE:
2352 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
2353 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit);
2354 vmexit->exitcode = VM_EXITCODE_PAUSE;
2355 break;
2356 case EXIT_REASON_INTR_WINDOW:
2357 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
2358 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit);
2359 ASSERT(vmx_int_window_exiting(vmx, vcpu));
2360 vmx_clear_int_window_exiting(vmx, vcpu);
2361 return (1);
2362 case EXIT_REASON_EXT_INTR:
2363 /*
2364 * External interrupts serve only to cause VM exits and allow
2365 * the host interrupt handler to run.
2366 *
2367 * If this external interrupt triggers a virtual interrupt
2368 * to a VM, then that state will be recorded by the
2369 * host interrupt handler in the VM's softc. We will inject
2370 * this virtual interrupt during the subsequent VM enter.
2371 */
2372 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2373 SDT_PROBE4(vmm, vmx, exit, interrupt,
2374 vmx, vcpu, vmexit, intr_info);
2375
2376 /*
2377 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
2378 * This appears to be a bug in VMware Fusion?
2379 */
2380 if (!(intr_info & VMCS_INTR_VALID))
2381 return (1);
2382 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
2383 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
2384 ("VM exit interruption info invalid: %x", intr_info));
2385 vmx_trigger_hostintr(intr_info & 0xff);
2386
2387 /*
2388 	 * This is special. We want to treat this as a 'handled'
2389 * VM-exit but not increment the instruction pointer.
2390 */
2391 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
2392 return (1);
2393 case EXIT_REASON_NMI_WINDOW:
2394 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit);
2395 /* Exit to allow the pending virtual NMI to be injected */
2396 if (vm_nmi_pending(vmx->vm, vcpu))
2397 vmx_inject_nmi(vmx, vcpu);
2398 ASSERT(vmx_nmi_window_exiting(vmx, vcpu));
2399 vmx_clear_nmi_window_exiting(vmx, vcpu);
2400 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
2401 return (1);
2402 case EXIT_REASON_INOUT:
2403 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
2404 vie = vm_vie_ctx(vmx->vm, vcpu);
2405 vmexit_inout(vmexit, vie, qual, (uint32_t)vmxctx->guest_rax);
2406 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit);
2407 break;
2408 case EXIT_REASON_CPUID:
2409 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
2410 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit);
2411 vcpu_emulate_cpuid(vmx->vm, vcpu,
2412 (uint64_t *)&vmxctx->guest_rax,
2413 (uint64_t *)&vmxctx->guest_rbx,
2414 (uint64_t *)&vmxctx->guest_rcx,
2415 (uint64_t *)&vmxctx->guest_rdx);
2416 handled = HANDLED;
2417 break;
2418 case EXIT_REASON_EXCEPTION:
2419 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
2420 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2421 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2422 ("VM exit interruption info invalid: %x", intr_info));
2423
2424 intr_vec = intr_info & 0xff;
2425 intr_type = intr_info & VMCS_INTR_T_MASK;
2426
2427 /*
2428 * If Virtual NMIs control is 1 and the VM-exit is due to a
2429 * fault encountered during the execution of IRET then we must
2430 * restore the state of "virtual-NMI blocking" before resuming
2431 * the guest.
2432 *
2433 * See "Resuming Guest Software after Handling an Exception".
2434 * See "Information for VM Exits Due to Vectored Events".
2435 */
2436 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2437 (intr_vec != IDT_DF) &&
2438 (intr_info & EXIT_QUAL_NMIUDTI) != 0)
2439 vmx_restore_nmi_blocking(vmx, vcpu);
2440
2441 /*
2442 * The NMI has already been handled in vmx_exit_handle_nmi().
2443 */
2444 if (intr_type == VMCS_INTR_T_NMI)
2445 return (1);
2446
2447 /*
2448 * Call the machine check handler by hand. Also don't reflect
2449 * the machine check back into the guest.
2450 */
2451 if (intr_vec == IDT_MC) {
2452 vmm_call_trap(T_MCE);
2453 return (1);
2454 }
2455
2456 /*
2457 * If the hypervisor has requested user exits for
2458 * debug exceptions, bounce them out to userland.
2459 */
2460 if (intr_type == VMCS_INTR_T_SWEXCEPTION &&
2461 intr_vec == IDT_BP &&
2462 (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) {
2463 vmexit->exitcode = VM_EXITCODE_BPT;
2464 vmexit->u.bpt.inst_length = vmexit->inst_length;
2465 vmexit->inst_length = 0;
2466 break;
2467 }
2468
2469 if (intr_vec == IDT_PF) {
2470 vmxctx->guest_cr2 = qual;
2471 }
2472
2473 /*
2474 * Software exceptions exhibit trap-like behavior. This in
2475 * turn requires populating the VM-entry instruction length
2476 * so that the %rip in the trap frame is past the INT3/INTO
2477 * instruction.
2478 */
2479 if (intr_type == VMCS_INTR_T_SWEXCEPTION)
2480 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2481
2482 /* Reflect all other exceptions back into the guest */
2483 errcode_valid = errcode = 0;
2484 if (intr_info & VMCS_INTR_DEL_ERRCODE) {
2485 errcode_valid = 1;
2486 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
2487 }
2488 SDT_PROBE5(vmm, vmx, exit, exception,
2489 vmx, vcpu, vmexit, intr_vec, errcode);
2490 error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
2491 errcode_valid, errcode, 0);
2492 KASSERT(error == 0, ("%s: vm_inject_exception error %d",
2493 __func__, error));
2494 return (1);
2495
2496 case EXIT_REASON_EPT_FAULT:
2497 /*
2498 * If 'gpa' lies within the address space allocated to
2499 * memory then this must be a nested page fault otherwise
2500 * this must be an instruction that accesses MMIO space.
2501 */
2502 gpa = vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS);
2503 if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
2504 apic_access_fault(vmx, vcpu, gpa)) {
2505 vmexit->exitcode = VM_EXITCODE_PAGING;
2506 vmexit->inst_length = 0;
2507 vmexit->u.paging.gpa = gpa;
2508 vmexit->u.paging.fault_type = ept_fault_type(qual);
2509 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
2510 SDT_PROBE5(vmm, vmx, exit, nestedfault,
2511 vmx, vcpu, vmexit, gpa, qual);
2512 } else if (ept_emulation_fault(qual)) {
2513 vie = vm_vie_ctx(vmx->vm, vcpu);
2514 vmexit_mmio_emul(vmexit, vie, gpa,
2515 vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
2516 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
2517 SDT_PROBE4(vmm, vmx, exit, mmiofault,
2518 vmx, vcpu, vmexit, gpa);
2519 }
2520 /*
2521 * If Virtual NMIs control is 1 and the VM-exit is due to an
2522 * EPT fault during the execution of IRET then we must restore
2523 * the state of "virtual-NMI blocking" before resuming.
2524 *
2525 * See description of "NMI unblocking due to IRET" in
2526 * "Exit Qualification for EPT Violations".
2527 */
2528 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2529 (qual & EXIT_QUAL_NMIUDTI) != 0)
2530 vmx_restore_nmi_blocking(vmx, vcpu);
2531 break;
2532 case EXIT_REASON_VIRTUALIZED_EOI:
2533 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2534 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2535 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit);
2536 vmexit->inst_length = 0; /* trap-like */
2537 break;
2538 case EXIT_REASON_APIC_ACCESS:
2539 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit);
2540 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
2541 break;
2542 case EXIT_REASON_APIC_WRITE:
2543 /*
2544 * APIC-write VM exit is trap-like so the %rip is already
2545 * pointing to the next instruction.
2546 */
2547 vmexit->inst_length = 0;
2548 vlapic = vm_lapic(vmx->vm, vcpu);
2549 SDT_PROBE4(vmm, vmx, exit, apicwrite,
2550 vmx, vcpu, vmexit, vlapic);
2551 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
2552 break;
2553 case EXIT_REASON_XSETBV:
2554 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit);
2555 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2556 break;
2557 case EXIT_REASON_MONITOR:
2558 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit);
2559 vmexit->exitcode = VM_EXITCODE_MONITOR;
2560 break;
2561 case EXIT_REASON_MWAIT:
2562 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit);
2563 vmexit->exitcode = VM_EXITCODE_MWAIT;
2564 break;
2565 case EXIT_REASON_TPR:
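		/*
		 * A TPR-below-threshold exit is trap-like: re-derive the PPR
		 * from the shadowed TPR and resume without advancing %rip.
		 */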
2566 vlapic = vm_lapic(vmx->vm, vcpu);
2567 vlapic_sync_tpr(vlapic);
2568 vmexit->inst_length = 0;
2569 handled = HANDLED;
2570 break;
2571 case EXIT_REASON_VMCALL:
2572 case EXIT_REASON_VMCLEAR:
2573 case EXIT_REASON_VMLAUNCH:
2574 case EXIT_REASON_VMPTRLD:
2575 case EXIT_REASON_VMPTRST:
2576 case EXIT_REASON_VMREAD:
2577 case EXIT_REASON_VMRESUME:
2578 case EXIT_REASON_VMWRITE:
2579 case EXIT_REASON_VMXOFF:
2580 case EXIT_REASON_VMXON:
2581 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit);
2582 vmexit->exitcode = VM_EXITCODE_VMINSN;
2583 break;
2584 case EXIT_REASON_INVD:
2585 case EXIT_REASON_WBINVD:
2586 /* ignore exit */
2587 handled = HANDLED;
2588 break;
2589 default:
2590 SDT_PROBE4(vmm, vmx, exit, unknown,
2591 vmx, vcpu, vmexit, reason);
2592 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2593 break;
2594 }
2595
2596 if (handled) {
2597 /*
2598 * It is possible that control is returned to userland
2599 * even though we were able to handle the VM exit in the
2600 * kernel.
2601 *
2602 * In such a case we want to make sure that the userland
2603 * restarts guest execution at the instruction *after*
2604 * the one we just processed. Therefore we update the
2605 * guest rip in the VMCS and in 'vmexit'.
2606 */
2607 vmexit->rip += vmexit->inst_length;
2608 vmexit->inst_length = 0;
2609 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2610 } else {
2611 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2612 /*
2613 * If this VM exit was not claimed by anybody then
2614 * treat it as a generic VMX exit.
2615 */
2616 vmexit->exitcode = VM_EXITCODE_VMX;
2617 vmexit->u.vmx.status = VM_SUCCESS;
2618 vmexit->u.vmx.inst_type = 0;
2619 vmexit->u.vmx.inst_error = 0;
2620 } else {
2621 /*
2622 * The exitcode and collateral have been populated.
2623 * The VM exit will be processed further in userland.
2624 */
2625 }
2626 }
2627
2628 SDT_PROBE4(vmm, vmx, exit, return,
2629 vmx, vcpu, vmexit, handled);
2630 return (handled);
2631 }
2632
2633 static void
2634 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2635 {
2636
2637 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2638 ("vmx_exit_inst_error: invalid inst_fail_status %d",
2639 vmxctx->inst_fail_status));
2640
2641 vmexit->inst_length = 0;
2642 vmexit->exitcode = VM_EXITCODE_VMX;
2643 vmexit->u.vmx.status = vmxctx->inst_fail_status;
2644 vmexit->u.vmx.inst_error = vmcs_read(VMCS_INSTRUCTION_ERROR);
2645 vmexit->u.vmx.exit_reason = ~0;
2646 vmexit->u.vmx.exit_qualification = ~0;
2647
2648 switch (rc) {
2649 case VMX_VMRESUME_ERROR:
2650 case VMX_VMLAUNCH_ERROR:
2651 case VMX_INVEPT_ERROR:
2652 case VMX_VMWRITE_ERROR:
2653 vmexit->u.vmx.inst_type = rc;
2654 break;
2655 default:
2656 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2657 }
2658 }
2659
2660 /*
2661 * If the NMI-exiting VM execution control is set to '1' then an NMI in
2662 * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2663 * sufficient to simply vector to the NMI handler via a software interrupt.
2664 	 * However, this must be done before maskable interrupts are enabled,
2665 * otherwise the "iret" issued by an interrupt handler will incorrectly
2666 * clear NMI blocking.
2667 */
2668 static __inline void
2669 vmx_exit_handle_possible_nmi(struct vm_exit *vmexit)
2670 {
2671 ASSERT(!interrupts_enabled());
2672
2673 if (vmexit->u.vmx.exit_reason == EXIT_REASON_EXCEPTION) {
2674 uint32_t intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2675 ASSERT(intr_info & VMCS_INTR_VALID);
2676
2677 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2678 ASSERT3U(intr_info & 0xff, ==, IDT_NMI);
2679 vmm_call_trap(T_NMIFLT);
2680 }
2681 }
2682 }
2683
2684 static __inline void
2685 vmx_dr_enter_guest(struct vmxctx *vmxctx)
2686 {
2687 uint64_t rflags;
2688
2689 /* Save host control debug registers. */
2690 vmxctx->host_dr7 = rdr7();
2691 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
2692
2693 /*
2694 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
2695 * exceptions in the host based on the guest DRx values. The
2696 * guest DR7 and DEBUGCTL are saved/restored in the VMCS.
2697 */
2698 load_dr7(0);
2699 wrmsr(MSR_DEBUGCTLMSR, 0);
2700
2701 /*
2702 * Disable single stepping the kernel to avoid corrupting the
2703 * guest DR6. A debugger might still be able to corrupt the
2704 * guest DR6 by setting a breakpoint after this point and then
2705 * single stepping.
2706 */
2707 rflags = read_rflags();
2708 vmxctx->host_tf = rflags & PSL_T;
2709 write_rflags(rflags & ~PSL_T);
2710
2711 /* Save host debug registers. */
2712 vmxctx->host_dr0 = rdr0();
2713 vmxctx->host_dr1 = rdr1();
2714 vmxctx->host_dr2 = rdr2();
2715 vmxctx->host_dr3 = rdr3();
2716 vmxctx->host_dr6 = rdr6();
2717
2718 /* Restore guest debug registers. */
2719 load_dr0(vmxctx->guest_dr0);
2720 load_dr1(vmxctx->guest_dr1);
2721 load_dr2(vmxctx->guest_dr2);
2722 load_dr3(vmxctx->guest_dr3);
2723 load_dr6(vmxctx->guest_dr6);
2724 }
2725
2726 static __inline void
2727 vmx_dr_leave_guest(struct vmxctx *vmxctx)
2728 {
2729
2730 /* Save guest debug registers. */
2731 vmxctx->guest_dr0 = rdr0();
2732 vmxctx->guest_dr1 = rdr1();
2733 vmxctx->guest_dr2 = rdr2();
2734 vmxctx->guest_dr3 = rdr3();
2735 vmxctx->guest_dr6 = rdr6();
2736
2737 /*
2738 * Restore host debug registers. Restore DR7, DEBUGCTL, and
2739 * PSL_T last.
2740 */
2741 load_dr0(vmxctx->host_dr0);
2742 load_dr1(vmxctx->host_dr1);
2743 load_dr2(vmxctx->host_dr2);
2744 load_dr3(vmxctx->host_dr3);
2745 load_dr6(vmxctx->host_dr6);
2746 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
2747 load_dr7(vmxctx->host_dr7);
2748 write_rflags(read_rflags() | vmxctx->host_tf);
2749 }
2750
2751 static int
2752 vmx_run(void *arg, int vcpu, uint64_t rip)
2753 {
2754 int rc, handled, launched;
2755 struct vmx *vmx;
2756 struct vm *vm;
2757 struct vmxctx *vmxctx;
2758 uintptr_t vmcs_pa;
2759 struct vm_exit *vmexit;
2760 struct vlapic *vlapic;
2761 uint32_t exit_reason;
2762 bool tpr_shadow_active;
2763 vm_client_t *vmc;
2764
2765 vmx = arg;
2766 vm = vmx->vm;
2767 vmcs_pa = vmx->vmcs_pa[vcpu];
2768 vmxctx = &vmx->ctx[vcpu];
2769 vlapic = vm_lapic(vm, vcpu);
2770 vmexit = vm_exitinfo(vm, vcpu);
2771 vmc = vm_get_vmclient(vm, vcpu);
2772 launched = 0;
2773 tpr_shadow_active = vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) &&
2774 !vmx_cap_en(vmx, VMX_CAP_APICV) &&
2775 (vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0;
2776
2777 vmx_msr_guest_enter(vmx, vcpu);
2778
2779 vmcs_load(vmcs_pa);
2780
2781 VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0);
2782 vmx->vmcs_state[vcpu] = VS_LOADED;
2783
2784 /*
2785 * XXX
2786 	 * We do this every time because we may set up the virtual machine
2787 * from a different process than the one that actually runs it.
2788 *
2789 * If the life of a virtual machine was spent entirely in the context
2790 * of a single process we could do this once in vmx_vminit().
2791 */
2792 vmcs_write(VMCS_HOST_CR3, rcr3());
2793
2794 vmcs_write(VMCS_GUEST_RIP, rip);
2795 vmx_set_pcpu_defaults(vmx, vcpu);
2796 do {
2797 enum event_inject_state inject_state;
2798 uint64_t eptgen;
2799
2800 ASSERT3U(vmcs_read(VMCS_GUEST_RIP), ==, rip);
2801
2802 handled = UNHANDLED;
2803
2804 /*
2805 * Perform initial event/exception/interrupt injection before
2806 * host CPU interrupts are disabled.
2807 */
2808 inject_state = vmx_inject_events(vmx, vcpu, rip);
2809
2810 /*
2811 * Interrupts are disabled from this point on until the
2812 * guest starts executing. This is done for the following
2813 * reasons:
2814 *
2815 * If an AST is asserted on this thread after the check below,
2816 * then the IPI_AST notification will not be lost, because it
2817 * will cause a VM exit due to external interrupt as soon as
2818 * the guest state is loaded.
2819 *
2820 * A posted interrupt after vmx_inject_vlapic() will not be
2821 * "lost" because it will be held pending in the host APIC
2822 * because interrupts are disabled. The pending interrupt will
2823 * be recognized as soon as the guest state is loaded.
2824 *
2825 * The same reasoning applies to the IPI generated by vmspace
2826 * invalidation.
2827 */
2828 disable_intr();
2829
2830 /*
2831 * If not precluded by existing events, inject any interrupt
2832 * pending on the vLAPIC. As a lock-less operation, it is safe
2833 * (and prudent) to perform with host CPU interrupts disabled.
2834 */
2835 if (inject_state == EIS_CAN_INJECT) {
2836 inject_state = vmx_inject_vlapic(vmx, vcpu, vlapic);
2837 }
2838
2839 /*
2840 * Check for vCPU bail-out conditions. This must be done after
2841 * vmx_inject_events() to detect a triple-fault condition.
2842 */
2843 if (vcpu_entry_bailout_checks(vmx->vm, vcpu, rip)) {
2844 enable_intr();
2845 break;
2846 }
2847
2848 if (vcpu_run_state_pending(vm, vcpu)) {
2849 enable_intr();
2850 vm_exit_run_state(vmx->vm, vcpu, rip);
2851 break;
2852 }
2853
2854 /*
2855 * If subsequent activity queued events which require injection
2856 * handling, take another lap to handle them.
2857 */
2858 if (vmx_inject_recheck(vmx, vcpu, inject_state)) {
2859 enable_intr();
2860 handled = HANDLED;
2861 continue;
2862 }
2863
2864 if ((rc = smt_acquire()) != 1) {
2865 enable_intr();
2866 vmexit->rip = rip;
2867 vmexit->inst_length = 0;
2868 if (rc == -1) {
2869 vmexit->exitcode = VM_EXITCODE_HT;
2870 } else {
2871 vmexit->exitcode = VM_EXITCODE_BOGUS;
2872 handled = HANDLED;
2873 }
2874 break;
2875 }
2876
2877 /*
2878 * If this thread has gone off-cpu due to mutex operations
2879 * during vmx_run, the VMCS will have been unloaded, forcing a
2880 * re-VMLAUNCH as opposed to VMRESUME.
2881 */
2882 launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0;
2883 /*
2884 * Restoration of the GDT limit is taken care of by
2885 * vmx_savectx(). Since the maximum practical index for the
2886 * IDT is 255, restoring its limits from the post-VMX-exit
2887 * default of 0xffff is not a concern.
2888 *
2889 * Only 64-bit hypervisor callers are allowed, which forgoes
2890 * the need to restore any LDT descriptor. Toss an error to
2891 * anyone attempting to break that rule.
2892 */
2893 if (curproc->p_model != DATAMODEL_LP64) {
2894 smt_release();
2895 enable_intr();
2896 bzero(vmexit, sizeof (*vmexit));
2897 vmexit->rip = rip;
2898 vmexit->exitcode = VM_EXITCODE_VMX;
2899 vmexit->u.vmx.status = VM_FAIL_INVALID;
2900 handled = UNHANDLED;
2901 break;
2902 }
2903
2904 if (tpr_shadow_active) {
2905 vmx_tpr_shadow_enter(vlapic);
2906 }
2907
2908 /*
2909 * Indicate activation of vmspace (EPT) table just prior to VMX
2910 * entry, checking for the necessity of an invept invalidation.
2911 */
2912 eptgen = vmc_table_enter(vmc);
2913 if (vmx->eptgen[curcpu] != eptgen) {
2914 /*
2915 * VMspace generation does not match what was previously
2916 * used on this host CPU, so all mappings associated
2917 * with this EP4TA must be invalidated.
2918 */
2919 invept(1, vmx->eptp);
2920 vmx->eptgen[curcpu] = eptgen;
2921 }
2922
2923 vcpu_ustate_change(vm, vcpu, VU_RUN);
2924 vmx_dr_enter_guest(vmxctx);
2925
2926 /* Perform VMX entry */
2927 rc = vmx_enter_guest(vmxctx, vmx, launched);
2928
2929 vmx_dr_leave_guest(vmxctx);
2930 vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2931
2932 vmx->vmcs_state[vcpu] |= VS_LAUNCHED;
2933 smt_release();
2934
2935 if (tpr_shadow_active) {
2936 vmx_tpr_shadow_exit(vlapic);
2937 }
2938
2939 /* Collect some information for VM exit processing */
2940 vmexit->rip = rip = vmcs_read(VMCS_GUEST_RIP);
2941 vmexit->inst_length = vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH);
2942 vmexit->u.vmx.exit_reason = exit_reason =
2943 (vmcs_read(VMCS_EXIT_REASON) & BASIC_EXIT_REASON_MASK);
2944 vmexit->u.vmx.exit_qualification =
2945 vmcs_read(VMCS_EXIT_QUALIFICATION);
2946 /* Update 'nextrip' */
2947 vmx->state[vcpu].nextrip = rip;
2948
2949 if (rc == VMX_GUEST_VMEXIT) {
2950 vmx_exit_handle_possible_nmi(vmexit);
2951 }
2952 enable_intr();
2953 vmc_table_exit(vmc);
2954
2955 if (rc == VMX_GUEST_VMEXIT) {
2956 handled = vmx_exit_process(vmx, vcpu, vmexit);
2957 } else {
2958 vmx_exit_inst_error(vmxctx, rc, vmexit);
2959 }
2960 DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip,
2961 uint32_t, exit_reason);
2962 rip = vmexit->rip;
2963 } while (handled);
2964
2965 /* If a VM exit has been handled then the exitcode must be BOGUS */
2966 if (handled && vmexit->exitcode != VM_EXITCODE_BOGUS) {
2967 panic("Non-BOGUS exitcode (%d) unexpected for handled VM exit",
2968 vmexit->exitcode);
2969 }
2970
2971 vmcs_clear(vmcs_pa);
2972 vmx_msr_guest_exit(vmx, vcpu);
2973
2974 VERIFY(vmx->vmcs_state[vcpu] != VS_NONE && curthread->t_preempt != 0);
2975 vmx->vmcs_state[vcpu] = VS_NONE;
2976
2977 return (0);
2978 }
2979
2980 static void
2981 vmx_vmcleanup(void *arg)
2982 {
2983 int i;
2984 struct vmx *vmx = arg;
2985 uint16_t maxcpus;
2986
2987 if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
2988 (void) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2989 kmem_free(vmx->apic_access_page, PAGESIZE);
2990 } else {
2991 VERIFY3P(vmx->apic_access_page, ==, NULL);
2992 }
2993
2994 vmx_msr_bitmap_destroy(vmx);
2995
2996 maxcpus = vm_get_maxcpus(vmx->vm);
2997 for (i = 0; i < maxcpus; i++)
2998 vpid_free(vmx->state[i].vpid);
2999
3000 kmem_free(vmx, sizeof (*vmx));
3001 }
3002
3003 /*
3004 * Ensure that the VMCS for this vcpu is loaded.
3005 * Returns true if a VMCS load was required.
3006 */
3007 static bool
3008 vmx_vmcs_access_ensure(struct vmx *vmx, int vcpu)
3009 {
3010 int hostcpu;
3011
3012 if (vcpu_is_running(vmx->vm, vcpu, &hostcpu)) {
3013 if (hostcpu != curcpu) {
3014 panic("unexpected vcpu migration %d != %d",
3015 hostcpu, curcpu);
3016 }
3017 /* Earlier logic already took care of the load */
3018 return (false);
3019 } else {
3020 vmcs_load(vmx->vmcs_pa[vcpu]);
3021 return (true);
3022 }
3023 }
3024
3025 static void
3026 vmx_vmcs_access_done(struct vmx *vmx, int vcpu)
3027 {
3028 int hostcpu;
3029
3030 if (vcpu_is_running(vmx->vm, vcpu, &hostcpu)) {
3031 if (hostcpu != curcpu) {
3032 panic("unexpected vcpu migration %d != %d",
3033 hostcpu, curcpu);
3034 }
3035 /* Later logic will take care of the unload */
3036 } else {
3037 vmcs_clear(vmx->vmcs_pa[vcpu]);
3038 }
3039 }
3040
3041 static uint64_t *
3042 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
3043 {
3044 switch (reg) {
3045 case VM_REG_GUEST_RAX:
3046 return (&vmxctx->guest_rax);
3047 case VM_REG_GUEST_RBX:
3048 return (&vmxctx->guest_rbx);
3049 case VM_REG_GUEST_RCX:
3050 return (&vmxctx->guest_rcx);
3051 case VM_REG_GUEST_RDX:
3052 return (&vmxctx->guest_rdx);
3053 case VM_REG_GUEST_RSI:
3054 return (&vmxctx->guest_rsi);
3055 case VM_REG_GUEST_RDI:
3056 return (&vmxctx->guest_rdi);
3057 case VM_REG_GUEST_RBP:
3058 return (&vmxctx->guest_rbp);
3059 case VM_REG_GUEST_R8:
3060 return (&vmxctx->guest_r8);
3061 case VM_REG_GUEST_R9:
3062 return (&vmxctx->guest_r9);
3063 case VM_REG_GUEST_R10:
3064 return (&vmxctx->guest_r10);
3065 case VM_REG_GUEST_R11:
3066 return (&vmxctx->guest_r11);
3067 case VM_REG_GUEST_R12:
3068 return (&vmxctx->guest_r12);
3069 case VM_REG_GUEST_R13:
3070 return (&vmxctx->guest_r13);
3071 case VM_REG_GUEST_R14:
3072 return (&vmxctx->guest_r14);
3073 case VM_REG_GUEST_R15:
3074 return (&vmxctx->guest_r15);
3075 case VM_REG_GUEST_CR2:
3076 return (&vmxctx->guest_cr2);
3077 case VM_REG_GUEST_DR0:
3078 return (&vmxctx->guest_dr0);
3079 case VM_REG_GUEST_DR1:
3080 return (&vmxctx->guest_dr1);
3081 case VM_REG_GUEST_DR2:
3082 return (&vmxctx->guest_dr2);
3083 case VM_REG_GUEST_DR3:
3084 return (&vmxctx->guest_dr3);
3085 case VM_REG_GUEST_DR6:
3086 return (&vmxctx->guest_dr6);
3087 default:
3088 break;
3089 }
3090 return (NULL);
3091 }
3092
3093 static int
3094 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
3095 {
3096 struct vmx *vmx = arg;
3097 uint64_t *regp;
3098
3099 /* VMCS access not required for ctx reads */
3100 if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) {
3101 *retval = *regp;
3102 return (0);
3103 }
3104
3105 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3106 int err = 0;
3107
3108 if (reg == VM_REG_GUEST_INTR_SHADOW) {
3109 uint64_t gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
3110 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
3111 } else {
3112 uint32_t encoding;
3113
3114 encoding = vmcs_field_encoding(reg);
3115 switch (encoding) {
3116 case VMCS_GUEST_CR0:
3117 /* Take the shadow bits into account */
3118 *retval = vmx_unshadow_cr0(vmcs_read(encoding),
3119 vmcs_read(VMCS_CR0_SHADOW));
3120 break;
3121 case VMCS_GUEST_CR4:
3122 /* Take the shadow bits into account */
3123 *retval = vmx_unshadow_cr4(vmcs_read(encoding),
3124 vmcs_read(VMCS_CR4_SHADOW));
3125 break;
3126 case VMCS_INVALID_ENCODING:
3127 err = EINVAL;
3128 break;
3129 default:
3130 *retval = vmcs_read(encoding);
3131 break;
3132 }
3133 }
3134
3135 if (vmcs_loaded) {
3136 vmx_vmcs_access_done(vmx, vcpu);
3137 }
3138 return (err);
3139 }
3140
3141 static int
3142 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
3143 {
3144 struct vmx *vmx = arg;
3145 uint64_t *regp;
3146
3147 /* VMCS access not required for ctx writes */
3148 if ((regp = vmxctx_regptr(&vmx->ctx[vcpu], reg)) != NULL) {
3149 *regp = val;
3150 return (0);
3151 }
3152
3153 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3154 int err = 0;
3155
3156 if (reg == VM_REG_GUEST_INTR_SHADOW) {
3157 if (val != 0) {
3158 /*
3159 * Forcing the vcpu into an interrupt shadow is not
3160 * presently supported.
3161 */
3162 err = EINVAL;
3163 } else {
3164 uint64_t gi;
3165
3166 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
3167 gi &= ~HWINTR_BLOCKING;
3168 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
3169 err = 0;
3170 }
3171 } else {
3172 uint32_t encoding;
3173
3174 err = 0;
3175 encoding = vmcs_field_encoding(reg);
3176 switch (encoding) {
3177 case VMCS_GUEST_IA32_EFER:
3178 vmcs_write(encoding, val);
3179 vmx_sync_efer_state(vmx, vcpu, val);
3180 break;
3181 case VMCS_GUEST_CR0:
3182 /*
3183 * The guest is not allowed to modify certain bits in
3184 * %cr0 and %cr4. To maintain the illusion of full
3185 * control, they have shadow versions which contain the
3186 * guest-perceived (via reads from the register) values
3187 * as opposed to the guest-effective values.
3188 *
3189 * This is detailed in the SDM: Vol. 3 Ch. 24.6.6.
3190 */
3191 vmcs_write(VMCS_CR0_SHADOW, val);
3192 vmcs_write(encoding, vmx_fix_cr0(val));
3193 break;
3194 case VMCS_GUEST_CR4:
3195 /* See above for detail on %cr4 shadowing */
3196 vmcs_write(VMCS_CR4_SHADOW, val);
3197 vmcs_write(encoding, vmx_fix_cr4(val));
3198 break;
3199 case VMCS_GUEST_CR3:
3200 vmcs_write(encoding, val);
3201 /*
3202 * Invalidate the guest vcpu's TLB mappings to emulate
3203 * the behavior of updating %cr3.
3204 *
3205 * XXX the processor retains global mappings when %cr3
3206 * is updated but vmx_invvpid() does not.
3207 */
3208 vmx_invvpid(vmx, vcpu,
3209 vcpu_is_running(vmx->vm, vcpu, NULL));
3210 break;
3211 case VMCS_INVALID_ENCODING:
3212 err = EINVAL;
3213 break;
3214 default:
3215 vmcs_write(encoding, val);
3216 break;
3217 }
3218 }
3219
3220 if (vmcs_loaded) {
3221 vmx_vmcs_access_done(vmx, vcpu);
3222 }
3223 return (err);
3224 }
3225
3226 static int
3227 vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc)
3228 {
3229 struct vmx *vmx = arg;
3230 uint32_t base, limit, access;
3231
3232 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3233
3234 vmcs_seg_desc_encoding(seg, &base, &limit, &access);
3235 desc->base = vmcs_read(base);
3236 desc->limit = vmcs_read(limit);
3237 if (access != VMCS_INVALID_ENCODING) {
3238 desc->access = vmcs_read(access);
3239 } else {
3240 desc->access = 0;
3241 }
3242
3243 if (vmcs_loaded) {
3244 vmx_vmcs_access_done(vmx, vcpu);
3245 }
3246 return (0);
3247 }
3248
3249 static int
3250 vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc)
3251 {
3252 struct vmx *vmx = arg;
3253 uint32_t base, limit, access;
3254
3255 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3256
3257 vmcs_seg_desc_encoding(seg, &base, &limit, &access);
3258 vmcs_write(base, desc->base);
3259 vmcs_write(limit, desc->limit);
3260 if (access != VMCS_INVALID_ENCODING) {
3261 vmcs_write(access, desc->access);
3262 }
3263
3264 if (vmcs_loaded) {
3265 vmx_vmcs_access_done(vmx, vcpu);
3266 }
3267 return (0);
3268 }
3269
3270 static uint64_t *
3271 vmx_msr_ptr(struct vmx *vmx, int vcpu, uint32_t msr)
3272 {
3273 uint64_t *guest_msrs = vmx->guest_msrs[vcpu];
3274
3275 switch (msr) {
3276 case MSR_LSTAR:
3277 return (&guest_msrs[IDX_MSR_LSTAR]);
3278 case MSR_CSTAR:
3279 return (&guest_msrs[IDX_MSR_CSTAR]);
3280 case MSR_STAR:
3281 return (&guest_msrs[IDX_MSR_STAR]);
3282 case MSR_SF_MASK:
3283 return (&guest_msrs[IDX_MSR_SF_MASK]);
3284 case MSR_KGSBASE:
3285 return (&guest_msrs[IDX_MSR_KGSBASE]);
3286 case MSR_PAT:
3287 return (&guest_msrs[IDX_MSR_PAT]);
3288 default:
3289 return (NULL);
3290 }
3291 }
3292
3293 static int
3294 vmx_msr_get(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
3295 {
3296 struct vmx *vmx = arg;
3297
3298 ASSERT(valp != NULL);
3299
3300 const uint64_t *msrp = vmx_msr_ptr(vmx, vcpu, msr);
3301 if (msrp != NULL) {
3302 *valp = *msrp;
3303 return (0);
3304 }
3305
3306 const uint32_t vmcs_enc = vmcs_msr_encoding(msr);
3307 if (vmcs_enc != VMCS_INVALID_ENCODING) {
3308 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3309
3310 *valp = vmcs_read(vmcs_enc);
3311
3312 if (vmcs_loaded) {
3313 vmx_vmcs_access_done(vmx, vcpu);
3314 }
3315 return (0);
3316 }
3317
3318 return (EINVAL);
3319 }
3320
3321 static int
3322 vmx_msr_set(void *arg, int vcpu, uint32_t msr, uint64_t val)
3323 {
3324 struct vmx *vmx = arg;
3325
3326 /* TODO: mask value */
3327
3328 uint64_t *msrp = vmx_msr_ptr(vmx, vcpu, msr);
3329 if (msrp != NULL) {
3330 *msrp = val;
3331 return (0);
3332 }
3333
3334 const uint32_t vmcs_enc = vmcs_msr_encoding(msr);
3335 if (vmcs_enc != VMCS_INVALID_ENCODING) {
3336 bool vmcs_loaded = vmx_vmcs_access_ensure(vmx, vcpu);
3337
3338 vmcs_write(vmcs_enc, val);
3339
3340 if (msr == MSR_EFER) {
3341 vmx_sync_efer_state(vmx, vcpu, val);
3342 }
3343
3344 if (vmcs_loaded) {
3345 vmx_vmcs_access_done(vmx, vcpu);
3346 }
3347 return (0);
3348 }
3349 return (EINVAL);
3350 }
3351
3352 static int
3353 vmx_getcap(void *arg, int vcpu, int type, int *retval)
3354 {
3355 struct vmx *vmx = arg;
3356 int vcap;
3357 int ret;
3358
3359 ret = ENOENT;
3360
3361 vcap = vmx->cap[vcpu].set;
3362
3363 switch (type) {
3364 case VM_CAP_HALT_EXIT:
3365 ret = 0;
3366 break;
3367 case VM_CAP_PAUSE_EXIT:
3368 if (cap_pause_exit)
3369 ret = 0;
3370 break;
3371 case VM_CAP_MTRAP_EXIT:
3372 if (cap_monitor_trap)
3373 ret = 0;
3374 break;
3375 case VM_CAP_ENABLE_INVPCID:
3376 if (cap_invpcid)
3377 ret = 0;
3378 break;
3379 case VM_CAP_BPT_EXIT:
3380 ret = 0;
3381 break;
3382 default:
3383 break;
3384 }
3385
3386 if (ret == 0)
3387 *retval = (vcap & (1 << type)) ? 1 : 0;
3388
3389 return (ret);
3390 }
3391
3392 static int
3393 vmx_setcap(void *arg, int vcpu, int type, int val)
3394 {
3395 struct vmx *vmx = arg;
3396 uint32_t baseval, reg, flag;
3397 uint32_t *pptr;
3398 int error;
3399
3400 error = ENOENT;
3401 pptr = NULL;
3402
3403 switch (type) {
3404 case VM_CAP_HALT_EXIT:
3405 error = 0;
3406 pptr = &vmx->cap[vcpu].proc_ctls;
3407 baseval = *pptr;
3408 flag = PROCBASED_HLT_EXITING;
3409 reg = VMCS_PRI_PROC_BASED_CTLS;
3410 break;
3411 case VM_CAP_MTRAP_EXIT:
3412 if (cap_monitor_trap) {
3413 error = 0;
3414 pptr = &vmx->cap[vcpu].proc_ctls;
3415 baseval = *pptr;
3416 flag = PROCBASED_MTF;
3417 reg = VMCS_PRI_PROC_BASED_CTLS;
3418 }
3419 break;
3420 case VM_CAP_PAUSE_EXIT:
3421 if (cap_pause_exit) {
3422 error = 0;
3423 pptr = &vmx->cap[vcpu].proc_ctls;
3424 baseval = *pptr;
3425 flag = PROCBASED_PAUSE_EXITING;
3426 reg = VMCS_PRI_PROC_BASED_CTLS;
3427 }
3428 break;
3429 case VM_CAP_ENABLE_INVPCID:
3430 if (cap_invpcid) {
3431 error = 0;
3432 pptr = &vmx->cap[vcpu].proc_ctls2;
3433 baseval = *pptr;
3434 flag = PROCBASED2_ENABLE_INVPCID;
3435 reg = VMCS_SEC_PROC_BASED_CTLS;
3436 }
3437 break;
3438 case VM_CAP_BPT_EXIT:
3439 error = 0;
3440
3441 /* Don't change the bitmap if we are tracing all exceptions. */
3442 if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) {
3443 pptr = &vmx->cap[vcpu].exc_bitmap;
3444 baseval = *pptr;
3445 flag = (1 << IDT_BP);
3446 reg = VMCS_EXCEPTION_BITMAP;
3447 }
3448 break;
3449 default:
3450 break;
3451 }
3452
3453 if (error != 0) {
3454 return (error);
3455 }
3456
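	/*
	 * Apply the control change with the vCPU's VMCS temporarily loaded,
	 * then record the updated value in the cached copy.
	 */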
3457 if (pptr != NULL) {
3458 if (val) {
3459 baseval |= flag;
3460 } else {
3461 baseval &= ~flag;
3462 }
3463 vmcs_load(vmx->vmcs_pa[vcpu]);
3464 vmcs_write(reg, baseval);
3465 vmcs_clear(vmx->vmcs_pa[vcpu]);
3466
3467 /*
3468 * Update optional stored flags, and record
3469 * setting
3470 */
3471 *pptr = baseval;
3472 }
3473
3474 if (val) {
3475 vmx->cap[vcpu].set |= (1 << type);
3476 } else {
3477 vmx->cap[vcpu].set &= ~(1 << type);
3478 }
3479
3480 return (0);
3481 }
3482
3483 struct vlapic_vtx {
3484 struct vlapic vlapic;
3485
3486 /* Align to the nearest cacheline */
3487 uint8_t _pad[64 - (sizeof (struct vlapic) % 64)];
3488
3489 /* TMR handling state for posted interrupts */
3490 uint32_t tmr_active[8];
3491 uint32_t pending_level[8];
3492 uint32_t pending_edge[8];
3493
3494 struct pir_desc *pir_desc;
3495 struct vmx *vmx;
3496 uint_t pending_prio;
3497 boolean_t tmr_sync;
3498 };
3499
3500 CTASSERT((offsetof(struct vlapic_vtx, tmr_active) & 63) == 0);
3501
3502 #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4))
3503
3504 static vcpu_notify_t
3505 vmx_apicv_set_ready(struct vlapic *vlapic, int vector, bool level)
3506 {
3507 struct vlapic_vtx *vlapic_vtx;
3508 struct pir_desc *pir_desc;
3509 uint32_t mask, tmrval;
3510 int idx;
3511 vcpu_notify_t notify = VCPU_NOTIFY_NONE;
3512
3513 vlapic_vtx = (struct vlapic_vtx *)vlapic;
3514 pir_desc = vlapic_vtx->pir_desc;
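	/* The 256 interrupt vectors are tracked as eight 32-bit words */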
3515 idx = vector / 32;
3516 mask = 1UL << (vector % 32);
3517
3518 /*
3519 * If the currently asserted TMRs do not match the state requested by
3520 * the incoming interrupt, an exit will be required to reconcile those
3521 * bits in the APIC page. This will keep the vLAPIC behavior in line
3522 * with the architecturally defined expectations.
3523 *
3524 * If actors of mixed types (edge and level) are racing against the same
3525 	 * vector (toggling its TMR bit back and forth), the results could be
3526 * inconsistent. Such circumstances are considered a rare edge case and
3527 * are never expected to be found in the wild.
3528 */
3529 tmrval = atomic_load_acq_int(&vlapic_vtx->tmr_active[idx]);
3530 if (!level) {
3531 if ((tmrval & mask) != 0) {
3532 /* Edge-triggered interrupt needs TMR de-asserted */
3533 atomic_set_int(&vlapic_vtx->pending_edge[idx], mask);
3534 atomic_store_rel_long(&pir_desc->pending, 1);
3535 return (VCPU_NOTIFY_EXIT);
3536 }
3537 } else {
3538 if ((tmrval & mask) == 0) {
3539 /* Level-triggered interrupt needs TMR asserted */
3540 atomic_set_int(&vlapic_vtx->pending_level[idx], mask);
3541 atomic_store_rel_long(&pir_desc->pending, 1);
3542 return (VCPU_NOTIFY_EXIT);
3543 }
3544 }
3545
3546 /*
3547 * If the interrupt request does not require manipulation of the TMRs
3548 * for delivery, set it in PIR descriptor. It cannot be inserted into
3549 * the APIC page while the vCPU might be running.
3550 */
3551 atomic_set_int(&pir_desc->pir[idx], mask);
3552
3553 /*
3554 * A notification is required whenever the 'pending' bit makes a
3555 * transition from 0->1.
3556 *
3557 * Even if the 'pending' bit is already asserted, notification about
3558 * the incoming interrupt may still be necessary. For example, if a
3559 * vCPU is HLTed with a high PPR, a low priority interrupt would cause
3560 * the 0->1 'pending' transition with a notification, but the vCPU
3561 * would ignore the interrupt for the time being. The same vCPU would
3562 * need to then be notified if a high-priority interrupt arrived which
3563 * satisfied the PPR.
3564 *
3565 * The priorities of interrupts injected while 'pending' is asserted
3566 * are tracked in a custom bitfield 'pending_prio'. Should the
3567 * to-be-injected interrupt exceed the priorities already present, the
3568 * notification is sent. The priorities recorded in 'pending_prio' are
3569 * cleared whenever the 'pending' bit makes another 0->1 transition.
3570 */
3571 if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
3572 notify = VCPU_NOTIFY_APIC;
3573 vlapic_vtx->pending_prio = 0;
3574 } else {
3575 const uint_t old_prio = vlapic_vtx->pending_prio;
3576 const uint_t prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);
3577
3578 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
3579 atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
3580 notify = VCPU_NOTIFY_APIC;
3581 }
3582 }
3583
3584 return (notify);
3585 }
3586
3587 static void
3588 vmx_apicv_accepted(struct vlapic *vlapic, int vector)
3589 {
3590 /*
3591 * When APICv is enabled for an instance, the traditional interrupt
3592 * injection method (populating ENTRY_INTR_INFO in the VMCS) is not
3593 * used and the CPU does the heavy lifting of virtual interrupt
3594 * delivery. For that reason vmx_intr_accepted() should never be called
3595 * when APICv is enabled.
3596 */
3597 panic("vmx_intr_accepted: not expected to be called");
3598 }
3599
3600 static void
3601 vmx_apicv_sync_tmr(struct vlapic *vlapic)
3602 {
3603 struct vlapic_vtx *vlapic_vtx;
3604 const uint32_t *tmrs;
3605
3606 vlapic_vtx = (struct vlapic_vtx *)vlapic;
3607 tmrs = &vlapic_vtx->tmr_active[0];
3608
3609 if (!vlapic_vtx->tmr_sync) {
3610 return;
3611 }
3612
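	/* Pack the 256 TMR bits into the four 64-bit EOI-exit bitmap fields */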
3613 vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)tmrs[1] << 32) | tmrs[0]);
3614 vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)tmrs[3] << 32) | tmrs[2]);
3615 vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)tmrs[5] << 32) | tmrs[4]);
3616 vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)tmrs[7] << 32) | tmrs[6]);
3617 vlapic_vtx->tmr_sync = B_FALSE;
3618 }
3619
3620 static void
3621 vmx_enable_x2apic_mode_ts(struct vlapic *vlapic)
3622 {
3623 struct vmx *vmx;
3624 uint32_t proc_ctls;
3625 int vcpuid;
3626
3627 vcpuid = vlapic->vcpuid;
3628 vmx = ((struct vlapic_vtx *)vlapic)->vmx;
3629
3630 proc_ctls = vmx->cap[vcpuid].proc_ctls;
3631 proc_ctls &= ~PROCBASED_USE_TPR_SHADOW;
3632 proc_ctls |= PROCBASED_CR8_LOAD_EXITING;
3633 proc_ctls |= PROCBASED_CR8_STORE_EXITING;
3634 vmx->cap[vcpuid].proc_ctls = proc_ctls;
3635
3636 vmcs_load(vmx->vmcs_pa[vcpuid]);
3637 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
3638 vmcs_clear(vmx->vmcs_pa[vcpuid]);
3639 }
3640
3641 static void
3642 vmx_enable_x2apic_mode_vid(struct vlapic *vlapic)
3643 {
3644 struct vmx *vmx;
3645 uint32_t proc_ctls2;
3646 int vcpuid;
3647
3648 vcpuid = vlapic->vcpuid;
3649 vmx = ((struct vlapic_vtx *)vlapic)->vmx;
3650
3651 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
3652 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
3653 ("%s: invalid proc_ctls2 %x", __func__, proc_ctls2));
3654
3655 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
3656 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
3657 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
3658
3659 vmcs_load(vmx->vmcs_pa[vcpuid]);
3660 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
3661 vmcs_clear(vmx->vmcs_pa[vcpuid]);
3662
3663 vmx_allow_x2apic_msrs(vmx, vcpuid);
3664 }
3665
3666 static void
3667 vmx_apicv_notify(struct vlapic *vlapic, int hostcpu)
3668 {
3669 psm_send_pir_ipi(hostcpu);
3670 }
3671
3672 static void
3673 vmx_apicv_sync(struct vlapic *vlapic)
3674 {
3675 struct vlapic_vtx *vlapic_vtx;
3676 struct pir_desc *pir_desc;
3677 struct LAPIC *lapic;
3678 uint_t i;
3679
3680 vlapic_vtx = (struct vlapic_vtx *)vlapic;
3681 pir_desc = vlapic_vtx->pir_desc;
3682 lapic = vlapic->apic_page;
3683
3684 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
3685 return;
3686 }
3687
3688 vlapic_vtx->pending_prio = 0;
3689
3690 /* Make sure the invalid (0-15) vectors are not set */
3691 ASSERT0(vlapic_vtx->pending_level[0] & 0xffff);
3692 ASSERT0(vlapic_vtx->pending_edge[0] & 0xffff);
3693 ASSERT0(pir_desc->pir[0] & 0xffff);
3694
3695 for (i = 0; i <= 7; i++) {
3696 uint32_t *tmrp = &lapic->tmr0 + (i * 4);
3697 uint32_t *irrp = &lapic->irr0 + (i * 4);
3698
3699 const uint32_t pending_level =
3700 atomic_readandclear_int(&vlapic_vtx->pending_level[i]);
3701 const uint32_t pending_edge =
3702 atomic_readandclear_int(&vlapic_vtx->pending_edge[i]);
3703 const uint32_t pending_inject =
3704 atomic_readandclear_int(&pir_desc->pir[i]);
3705
3706 if (pending_level != 0) {
3707 /*
3708 * Level-triggered interrupts assert their corresponding
3709 * bit in the TMR when queued in IRR.
3710 */
3711 *tmrp |= pending_level;
3712 *irrp |= pending_level;
3713 }
3714 if (pending_edge != 0) {
3715 /*
3716 * When queuing an edge-triggered interrupt in IRR, the
3717 * corresponding bit in the TMR is cleared.
3718 */
3719 *tmrp &= ~pending_edge;
3720 *irrp |= pending_edge;
3721 }
3722 if (pending_inject != 0) {
3723 /*
3724 * Interrupts which do not require a change to the TMR
3725 * (because it already matches the necessary state) can
3726 * simply be queued in IRR.
3727 */
3728 *irrp |= pending_inject;
3729 }
3730
3731 if (*tmrp != vlapic_vtx->tmr_active[i]) {
3732 /* Check if VMX EOI triggers require updating. */
3733 vlapic_vtx->tmr_active[i] = *tmrp;
3734 vlapic_vtx->tmr_sync = B_TRUE;
3735 }
3736 }
3737 }
3738
static void
vmx_tpr_shadow_enter(struct vlapic *vlapic)
{
	/*
	 * When TPR shadowing is enabled, VMX will initiate a guest exit if its
	 * TPR falls below a threshold priority. That threshold is set to the
	 * current TPR priority, since guest interrupt status should be
	 * re-evaluated if its TPR is set lower.
	 */
	vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic));
}

static void
vmx_tpr_shadow_exit(struct vlapic *vlapic)
{
	/*
	 * Unlike full APICv, where changes to the TPR are reflected in the
	 * PPR, with TPR shadowing that duty falls to the VMM. Upon exit, the
	 * PPR is updated to reflect any change in the TPR here.
	 */
	vlapic_sync_tpr(vlapic);
}

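/*
 * Allocate and initialize the vlapic for a vCPU, wiring up accelerated
 * operations (TPR shadowing, APICv, and posted-interrupt notification)
 * according to the capabilities enabled on this VM.
 */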
static struct vlapic *
vmx_vlapic_init(void *arg, int vcpuid)
{
	struct vmx *vmx = arg;
	struct vlapic_vtx *vlapic_vtx;
	struct vlapic *vlapic;

	vlapic_vtx = kmem_zalloc(sizeof (struct vlapic_vtx), KM_SLEEP);
	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
	vlapic_vtx->vmx = vmx;

	vlapic = &vlapic_vtx->vlapic;
	vlapic->vm = vmx->vm;
	vlapic->vcpuid = vcpuid;
	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];

	if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW)) {
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
	}
	if (vmx_cap_en(vmx, VMX_CAP_APICV)) {
		vlapic->ops.set_intr_ready = vmx_apicv_set_ready;
		vlapic->ops.sync_state = vmx_apicv_sync;
		vlapic->ops.intr_accepted = vmx_apicv_accepted;
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;

		if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) {
			vlapic->ops.post_intr = vmx_apicv_notify;
		}
	}

	vlapic_init(vlapic);

	return (vlapic);
}

static void
vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{
	vlapic_cleanup(vlapic);
	kmem_free(vlapic, sizeof (struct vlapic_vtx));
}

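/*
 * Quiesce a vCPU as the instance is being paused: stash any event awaiting
 * injection back into software state so that no window-exiting request needs
 * to remain armed in the VMCS while the vCPU is stopped.
 */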
static void
vmx_pause(void *arg, int vcpuid)
{
	struct vmx *vmx = arg;

	VERIFY(vmx_vmcs_access_ensure(vmx, vcpuid));

	/* Stash any interrupt/exception pending injection. */
	vmx_stash_intinfo(vmx, vcpuid);

	/*
	 * Now that no event is pending injection, interrupt-window exiting and
	 * NMI-window exiting can be disabled. If/when this vCPU is made to run
	 * again, those conditions will be reinstated when the now-queued events
	 * are re-injected.
	 */
	vmx_clear_nmi_window_exiting(vmx, vcpuid);
	vmx_clear_int_window_exiting(vmx, vcpuid);

	vmx_vmcs_access_done(vmx, vcpuid);
}

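/*
 * Save VMX-specific vCPU context as its thread is switched off of a host CPU:
 * flush the VMCS from the processor, restore host MSR state, and reset the
 * host GDTR limit modified by VM exits.
 */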
static void
vmx_savectx(void *arg, int vcpu)
{
	struct vmx *vmx = arg;

	if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) {
		vmcs_clear(vmx->vmcs_pa[vcpu]);
		vmx_msr_guest_exit(vmx, vcpu);
		/*
		 * Now that the VMCS has been VMCLEARed, it can no longer be
		 * re-entered with VMRESUME; it must be VMLAUNCHed again.
		 */
		vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED;
	}

	reset_gdtr_limit();
}

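/*
 * Restore VMX-specific vCPU context as its thread is switched back onto a
 * host CPU: re-enter guest MSR state and reload the VMCS if it had been
 * loaded at the time of the switch-out.
 */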
static void
vmx_restorectx(void *arg, int vcpu)
{
	struct vmx *vmx = arg;

	ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED);

	if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) {
		vmx_msr_guest_enter(vmx, vcpu);
		vmcs_load(vmx->vmcs_pa[vcpu]);
	}
}

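/*
 * Determine the multiplier needed to scale a guest TSC frequency to the host
 * frequency. Hardware TSC scaling is not yet implemented for VMX, so only
 * the case where the guest and host frequencies already match is handled.
 */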
static freqratio_res_t
vmx_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
{
	if (guest_hz == host_hz) {
		*mult = VM_TSCM_NOSCALE;
		return (FR_SCALING_NOT_NEEDED);
	}

	/* VMX support not implemented at this time */
	return (FR_SCALING_NOT_SUPPORTED);
}

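/* Operations vector handed to the VMM layer on Intel (VT-x) hosts. */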
struct vmm_ops vmm_ops_intel = {
	.init		= vmx_init,
	.cleanup	= vmx_cleanup,
	.resume		= vmx_restore,

	.vminit		= vmx_vminit,
	.vmrun		= vmx_run,
	.vmcleanup	= vmx_vmcleanup,
	.vmgetreg	= vmx_getreg,
	.vmsetreg	= vmx_setreg,
	.vmgetdesc	= vmx_getdesc,
	.vmsetdesc	= vmx_setdesc,
	.vmgetcap	= vmx_getcap,
	.vmsetcap	= vmx_setcap,
	.vlapic_init	= vmx_vlapic_init,
	.vlapic_cleanup	= vmx_vlapic_cleanup,
	.vmpause	= vmx_pause,

	.vmsavectx	= vmx_savectx,
	.vmrestorectx	= vmx_restorectx,

	.vmgetmsr	= vmx_msr_get,
	.vmsetmsr	= vmx_msr_set,

	.vmfreqratio	= vmx_freq_ratio,
	.fr_intsize	= INTEL_TSCM_INT_SIZE,
	.fr_fracsize	= INTEL_TSCM_FRAC_SIZE,
};

/* Side-effect free HW validation derived from checks in vmx_init. */
int
vmx_x86_supported(const char **msg)
{
	int error;
	uint32_t tmp;

	ASSERT(msg != NULL);

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING,
	    PROCBASED_CTLS_ZERO_SETTING, &tmp);
	if (error) {
		*msg = "processor does not support desired primary "
		    "processor-based controls";
		return (error);
	}

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING,
	    PROCBASED_CTLS2_ZERO_SETTING, &tmp);
	if (error) {
		*msg = "processor does not support desired secondary "
		    "processor-based controls";
		return (error);
	}

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING,
	    PINBASED_CTLS_ZERO_SETTING, &tmp);
	if (error) {
		*msg = "processor does not support desired pin-based controls";
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	    VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp);
	if (error) {
		*msg = "processor does not support desired exit controls";
		return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp);
	if (error) {
		*msg = "processor does not support desired entry controls";
		return (error);
	}

	/* Unrestricted guest is nominally optional, but not for us. */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp);
	if (error) {
		*msg = "processor does not support desired unrestricted guest "
		    "controls";
		return (error);
	}

	return (0);
}