/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2018 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/reg.h>
#include <sys/smr.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>

#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"
#include "vmx_controls.h"

#define PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	 PINBASED_NMI_EXITING		|	\
	 PINBASED_VIRTUAL_NMI)
#define PINBASED_CTLS_ZERO_SETTING	0

#define PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	 PROCBASED_NMI_WINDOW_EXITING)

#define PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	 PROCBASED_MWAIT_EXITING	|	\
	 PROCBASED_MONITOR_EXITING	|	\
	 PROCBASED_IO_EXITING		|	\
	 PROCBASED_MSR_BITMAPS		|	\
	 PROCBASED_CTLS_WINDOW_SETTING	|	\
	 PROCBASED_CR8_LOAD_EXITING	|	\
	 PROCBASED_CR8_STORE_EXITING)
#define PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING	|	\
	 PROCBASED_CR3_STORE_EXITING	|	\
	 PROCBASED_IO_BITMAPS)

#define PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING	0

#define VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_SAVE_DEBUG_CONTROLS	|	\
	 VM_EXIT_HOST_LMA		|	\
	 VM_EXIT_SAVE_EFER		|	\
	 VM_EXIT_LOAD_EFER		|	\
	 VM_EXIT_ACKNOWLEDGE_INTERRUPT)

#define VM_EXIT_CTLS_ZERO_SETTING	0

#define VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	 VM_ENTRY_LOAD_EFER)

#define VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_INTO_SMM		|	\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
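
/*
 * Note on the *_ONE_SETTING / *_ZERO_SETTING pairs above: the bits in a
 * "one setting" are the VM-execution, VM-exit and VM-entry control bits
 * that this driver requires to be 1, and the bits in a "zero setting" are
 * the ones it requires to be 0.  vmx_modinit() below hands each pair to
 * vmx_set_ctlreg(), which checks them against the corresponding VMX
 * capability MSRs and returns an error (failing module initialization)
 * if the processor cannot provide a required setting.
 */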

#define HANDLED		1
#define UNHANDLED	0

static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
    &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
    &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
    &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
    &cr4_zeros_mask, 0, NULL);

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
    &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Optional capabilities
 */
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

static int cap_halt_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
    "HLT triggers a VM-exit");

static int cap_pause_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
    0, "PAUSE triggers a VM-exit");

static int cap_rdpid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0,
    "Guests are allowed to use RDPID");

static int cap_rdtscp;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0,
    "Guests are allowed to use RDTSCP");

static int cap_unrestricted_guest;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
    &cap_unrestricted_guest, 0, "Unrestricted guests");

static int cap_monitor_trap;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
    &cap_monitor_trap, 0, "Monitor trap flag");

static int cap_invpcid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
    0, "Guests are allowed to use INVPCID");

static int tpr_shadowing;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
    &tpr_shadowing, 0, "TPR shadowing support");

static int virtual_interrupt_delivery;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
    &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");

static int posted_interrupts;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
    &posted_interrupts, 0, "APICv posted interrupt support");

static int pirvec = -1;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
    &pirvec, 0, "APICv posted interrupt vector");

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
    &vpid_alloc_failed, 0, NULL);

int guest_l1d_flush;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
    &guest_l1d_flush, 0, NULL);
int guest_l1d_flush_sw;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
    &guest_l1d_flush_sw, 0, NULL);

static struct msr_entry msr_load_list[1] __aligned(16);

/*
 * The definitions of SDT probes for VMX.
 */

SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
    "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");

SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
    "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
    "struct vmx *", "int", "struct vm_exit *", "int");

/*
 * Use the last page below 4GB as the APIC access address. This address is
 * occupied by the boot firmware so it is guaranteed that it will not conflict
 * with a page in system memory.
 */
#define APIC_ACCESS_ADDRESS	0xFFFFF000

static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);
#ifdef BHYVE_SNAPSHOT
static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now);
#endif

static inline bool
host_has_rdpid(void)
{
	return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0);
}

static inline bool
host_has_rdtscp(void)
{
	return ((amd_feature & AMDID_RDTSCP) != 0);
}

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE_DURING_ENTRY:
		return "mce-during-entry";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC_ACCESS:
		return "apic-access";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	case EXIT_REASON_APIC_WRITE:
		return "apic-write";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}
#endif	/* KTR */

static int
vmx_allow_x2apic_msrs(struct vmx *vmx)
{
	int i, error;

	error = 0;

	/*
	 * Allow readonly access to the following x2APIC MSRs from the guest.
	 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures. This prevents potential retention of
		 * cached information in the TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_modcleanup(void)
{

	if (pirvec >= 0)
		lapic_ipi_free(pirvec);

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	if (nmi_flush_l1d_sw == 1)
		nmi_flush_l1d_sw = 0;

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;
	uint64_t feature_control;

	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		wrmsr(MSR_IA32_FEATURE_CONTROL,
		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
		    IA32_FEATURE_CONTROL_LOCK);
	}

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static void
vmx_modresume(void)
{

	if (vmxon_enabled[curcpu])
		vmxon(vmxon_region[curcpu]);
}

static int
vmx_modinit(int ipinum)
{
	int error;
	uint64_t basic, fixed0, fixed1, feature_control;
	uint32_t tmp, procbased2_vid_bits;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_modinit: processor does not support VMX "
		    "operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_modinit: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/*
	 * Verify capabilities MSR_VMX_BASIC:
	 * - bit 54 indicates support for INS/OUTS decoding
	 */
	basic = rdmsr(MSR_VMX_BASIC);
	if ((basic & (1UL << 54)) == 0) {
		printf("vmx_modinit: processor does not support desired basic "
		    "capabilities\n");
		return (EINVAL);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_CTLS_ONE_SETTING,
	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_modinit: processor does not support desired "
		    "primary processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED_CTLS2_ONE_SETTING,
	    PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_modinit: processor does not support desired "
		    "secondary processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	    MSR_VMX_TRUE_PINBASED_CTLS,
	    PINBASED_CTLS_ONE_SETTING,
	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_modinit: processor does not support desired "
		    "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	    VM_EXIT_CTLS_ONE_SETTING,
	    VM_EXIT_CTLS_ZERO_SETTING,
	    &exit_ctls);
	if (error) {
		printf("vmx_modinit: processor does not support desired "
		    "exit controls\n");
		return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
	    &entry_ctls);
	if (error) {
		printf("vmx_modinit: processor does not support desired "
		    "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_HLT_EXITING, 0,
	    &tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_PROCBASED_CTLS,
	    PROCBASED_MTF, 0,
	    &tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_PAUSE_EXITING, 0,
	    &tmp) == 0);

	/*
	 * Check support for RDPID and/or RDTSCP.
	 *
	 * Support a pass-through-based implementation of these via the
	 * "enable RDTSCP" VM-execution control and the "RDTSC exiting"
	 * VM-execution control.
	 *
	 * The "enable RDTSCP" VM-execution control applies to both RDPID
	 * and RDTSCP (see SDM volume 3, section 25.3, "Changes to
	 * Instruction Behavior in VMX Non-root operation"); this is why
	 * only this VM-execution control needs to be enabled in order to
	 * enable passing through whichever of RDPID and/or RDTSCP are
	 * supported by the host.
	 *
	 * The "RDTSC exiting" VM-execution control applies to both RDTSC
	 * and RDTSCP (again, per SDM volume 3, section 25.3), and is
	 * already set up for RDTSC and RDTSCP pass-through by the current
	 * implementation of RDTSC.
	 *
	 * Although RDPID and RDTSCP are optional capabilities, since there
	 * does not currently seem to be a use case for enabling/disabling
	 * these via libvmmapi, choose not to support this and, instead,
	 * just statically always enable or always disable this support
	 * across all vCPUs on all VMs. (Note that there may be some
	 * complications to providing this functionality, e.g., the MSR
	 * bitmap is currently per-VM rather than per-vCPU while the
	 * capability API wants to be able to control capabilities on a
	 * per-vCPU basis).
	 */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_RDTSCP, 0, &tmp);
	cap_rdpid = error == 0 && host_has_rdpid();
	cap_rdtscp = error == 0 && host_has_rdtscp();
	if (cap_rdpid || cap_rdtscp)
		procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP;

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_UNRESTRICTED_GUEST, 0,
	    &tmp) == 0);

	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
	    &tmp) == 0);

	/*
	 * Check support for TPR shadow.
	 */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
	    &tmp);
	if (error == 0) {
		tpr_shadowing = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
		    &tpr_shadowing);
	}

	if (tpr_shadowing) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
	}

	/*
	 * Check support for virtual interrupt delivery.
	 */
	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);

	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    procbased2_vid_bits, 0, &tmp);
	if (error == 0 && tpr_shadowing) {
		virtual_interrupt_delivery = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
		    &virtual_interrupt_delivery);
	}

	if (virtual_interrupt_delivery) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls2 |= procbased2_vid_bits;
		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;

		/*
		 * Check for Posted Interrupts only if Virtual Interrupt
		 * Delivery is enabled.
		 */
		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
		    &tmp);
		if (error == 0) {
			pirvec = lapic_ipi_alloc(pti ?
			    &IDTVEC(justreturn1_pti) :
			    &IDTVEC(justreturn));
			if (pirvec < 0) {
				if (bootverbose) {
					printf("vmx_modinit: unable to "
					    "allocate posted interrupt "
					    "vector\n");
				}
			} else {
				posted_interrupts = 1;
				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
				    &posted_interrupts);
			}
		}
	}

	if (posted_interrupts)
		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;

	/* Initialize EPT */
	error = ept_init(ipinum);
	if (error) {
		printf("vmx_modinit: ept initialization failed (%d)\n", error);
		return (error);
	}

	guest_l1d_flush = (cpu_ia32_arch_caps &
	    IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);

	/*
	 * L1D cache flush is enabled.  Use IA32_FLUSH_CMD MSR when
	 * available.  Otherwise fall back to the software flush
	 * method which loads enough data from the kernel text to
	 * flush existing L1D content, both on VMX entry and on NMI
	 * return.
	 */
	if (guest_l1d_flush) {
		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
			guest_l1d_flush_sw = 1;
			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
			    &guest_l1d_flush_sw);
		}
		if (guest_l1d_flush_sw) {
			if (nmi_flush_l1d_sw <= 1)
				nmi_flush_l1d_sw = 1;
		} else {
			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
		}
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;
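
	/*
	 * Illustrative note: the FIXED0/FIXED1 MSR pairs read above encode
	 * which CR0/CR4 bits are forced while in VMX operation.  A bit that
	 * reads as 1 in both MSRs must be 1 and ends up in the "ones" mask;
	 * a bit that reads as 0 in both must be 0 and ends up in the "zeros"
	 * mask.  With the (hypothetical) values MSR_VMX_CR0_FIXED0 ==
	 * 0x80000021 and MSR_VMX_CR0_FIXED1 == 0xffffffff, the low 32 bits
	 * of cr0_ones_mask would be 0x80000021 (CR0_PG | CR0_NE | CR0_PE),
	 * so vmx_fix_cr0() would force those bits on in any value passed to
	 * it.  The unrestricted-guest adjustment above then relaxes CR0_PG
	 * and CR0_PE when the processor allows it.
	 */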

	vpid_init();

	vmx_msr_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

static void
vmx_trigger_hostintr(int vector)
{
	uintptr_t func;
	struct gate_descriptor *gd;

	gd = &idt[vector];

	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
	    "invalid vector %d", vector));
	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
	    vector));
	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
	    "has invalid type %d", vector, gd->gd_type));
	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
	    "has invalid dpl %d", vector, gd->gd_dpl));
	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
	    "IST %d", vector, gd->gd_ist));

	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
	vmx_call_isr(func);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
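
/*
 * A note on the CR0/CR4 shadowing set up above: the bits set in the
 * guest/host masks (the union of the "ones" and "zeros" masks) are owned
 * by the hypervisor.  While a mask bit is set, guest reads of that bit
 * return the value from the read shadow rather than the real control
 * register, and a guest write whose value differs from the read shadow in
 * any masked bit position causes a control-register-access VM-exit, which
 * is handled by vmx_emulate_cr0_access()/vmx_emulate_cr4_access() later
 * in this file.
 */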

static void *
vmx_init(struct vm *vm, pmap_t pmap)
{
	uint16_t vpid[VM_MAXCPU];
	int i, error;
	struct vmx *vmx;
	struct vmcs *vmcs;
	uint32_t exc_bitmap;
	uint16_t maxcpus;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		    PAGE_SIZE);
	}
	vmx->vm = vm;

	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vmx->eptp);

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * The TSC MSR is exposed read-only. Writes are disallowed as
	 * that will impact the host TSC.  If the guest does a write
	 * the "use TSC offsetting" execution control is enabled and the
	 * difference between the host TSC and the guest TSC is written
	 * into the TSC offset in the VMCS.
	 *
	 * Guest TSC_AUX support is enabled if any of guest RDPID and/or
	 * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM
	 * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are
	 * supported). If guest TSC_AUX support is enabled, TSC_AUX is
	 * exposed read-only so that the VMM can do one fewer MSR read per
	 * exit than if this register were exposed read-write; the guest
	 * restore value can be updated during guest writes (expected to be
	 * rare) instead of during all exits (common).
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_EFER) ||
	    guest_msr_ro(vmx, MSR_TSC) ||
	    ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX)))
		panic("vmx_init: error setting guest msr access");

	vpid_alloc(vpid, VM_MAXCPU);

	if (virtual_interrupt_delivery) {
		error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
		    APIC_ACCESS_ADDRESS);
		/* XXX this should really return an error to the caller */
		KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
	}

	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vmcs = &vmx->vmcs[i];
		vmcs->identifier = vmx_revision();
		error = vmclear(vmcs);
		if (error != 0) {
			panic("vmx_init: vmclear error %d on vcpu %d\n",
			    error, i);
		}

		vmx_msr_guest_init(vmx, i);

		error = vmcs_init(vmcs);
		KASSERT(error == 0, ("vmcs_init error %d", error));

		VMPTRLD(vmcs);
		error = 0;
		error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
		error += vmwrite(VMCS_EPTP, vmx->eptp);
		error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
		error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
		error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
		error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
		error += vmwrite(VMCS_VPID, vpid[i]);

		if (guest_l1d_flush && !guest_l1d_flush_sw) {
			vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
			    (vm_offset_t)&msr_load_list[0]));
			vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
			    nitems(msr_load_list));
			vmcs_write(VMCS_EXIT_MSR_STORE, 0);
			vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
		}

		/* exception bitmap */
		if (vcpu_trace_exceptions(vm, i))
			exc_bitmap = 0xffffffff;
		else
			exc_bitmap = 1 << IDT_MC;
		error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);

		vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
		error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);

		if (tpr_shadowing) {
			error += vmwrite(VMCS_VIRTUAL_APIC,
			    vtophys(&vmx->apic_page[i]));
		}

		if (virtual_interrupt_delivery) {
			error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
			error += vmwrite(VMCS_EOI_EXIT0, 0);
			error += vmwrite(VMCS_EOI_EXIT1, 0);
			error += vmwrite(VMCS_EOI_EXIT2, 0);
			error += vmwrite(VMCS_EOI_EXIT3, 0);
		}
		if (posted_interrupts) {
			error += vmwrite(VMCS_PIR_VECTOR, pirvec);
			error += vmwrite(VMCS_PIR_DESC,
			    vtophys(&vmx->pir_desc[i]));
		}
		VMCLEAR(vmcs);
		KASSERT(error == 0, ("vmx_init: error customizing the vmcs"));

		vmx->cap[i].set = 0;
		vmx->cap[i].set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0;
		vmx->cap[i].set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0;
		vmx->cap[i].proc_ctls = procbased_ctls;
		vmx->cap[i].proc_ctls2 = procbased_ctls2;
		vmx->cap[i].exc_bitmap = exc_bitmap;

		vmx->state[i].nextrip = ~0;
		vmx->state[i].lastcpu = NOCPU;
		vmx->state[i].vpid = vpid[i];

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(vmcs, 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);

		vmx->ctx[i].pmap = pmap;
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled;

	handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax,
	    (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx,
	    (uint64_t *)&vmxctx->guest_rdx);
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
    int handled)
{
#ifdef KTR
	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
	    handled ? "handled" : "unhandled",
	    exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");

/*
 * Invalidate guest mappings identified by its vpid from the TLB.
 */
static __inline void
vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc;

	vmxstate = &vmx->state[vcpu];
	if (vmxstate->vpid == 0)
		return;

	if (!running) {
		/*
		 * Set the 'lastcpu' to an invalid host cpu.
		 *
		 * This will invalidate TLB entries tagged with the vcpu's
		 * vpid the next time it runs via vmx_set_pcpu_defaults().
		 */
		vmxstate->lastcpu = NOCPU;
		return;
	}

	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
	    "critical section", __func__, vcpu));

	/*
	 * Invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (atomic_load_long(&pmap->pm_eptgen) == vmx->eptgen[curcpu]) {
		invvpid_desc._res1 = 0;
		invvpid_desc._res2 = 0;
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid_desc.linear_addr = 0;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
	} else {
		/*
		 * The invvpid can be skipped if an invept is going to
		 * be performed before entering the guest. The invept
		 * will invalidate combined mappings tagged with
		 * 'vmx->eptp' for all vpids.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
	}
}

static void
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
{
	struct vmxstate *vmxstate;

	vmxstate = &vmx->state[vcpu];
	if (vmxstate->lastcpu == curcpu)
		return;

	vmxstate->lastcpu = curcpu;

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
	vmx_invvpid(vmx, vcpu, pmap, 1);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
	}
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{

	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
	    ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	}
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{

	KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
	    ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
}

int
vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
{
	int error;

	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
		vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
		vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
		VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
	}

	error = vmwrite(VMCS_TSC_OFFSET, offset);
#ifdef BHYVE_SNAPSHOT
	if (error == 0)
		error = vm_set_tsc_offset(vmx->vm, vcpu, offset);
#endif
	return (error);
}

#define	NMI_BLOCKING	(VMCS_INTERRUPTIBILITY_NMI_BLOCKING |		\
			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
#define	HWINTR_BLOCKING	(VMCS_INTERRUPTIBILITY_STI_BLOCKING |		\
			 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)

static void
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	uint32_t gi __diagused, info;

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
	    "interruptibility-state %#x", gi));

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
	    "VM-entry interruption information %#x", info));

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);

	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
    uint64_t guestrip)
{
	int vector, need_nmi_exiting, extint_pending;
	uint64_t rflags, entryinfo;
	uint32_t gi, info;

	if (vmx->state[vcpu].nextrip != guestrip) {
		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		if (gi & HWINTR_BLOCKING) {
			VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
			    "cleared due to rip change: %#lx/%#lx",
			    vmx->state[vcpu].nextrip, guestrip);
			gi &= ~HWINTR_BLOCKING;
			vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
		}
	}

	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
		    "intinfo is not valid: %#lx", __func__, entryinfo));

		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
		    "pending exception: %#lx/%#x", __func__, entryinfo, info));

		info = entryinfo;
		vector = info & 0xff;
		if (vector == IDT_BP || vector == IDT_OF) {
			/*
			 * VT-x requires #BP and #OF to be injected as software
			 * exceptions.
			 */
			info &= ~VMCS_INTR_T_MASK;
			info |= VMCS_INTR_T_SWEXCEPTION;
		}

		if (info & VMCS_INTR_DEL_ERRCODE)
			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);

		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
	}

	if (vm_nmi_pending(vmx->vm, vcpu)) {
		/*
		 * If there are no conditions blocking NMI injection then
		 * inject it directly here otherwise enable "NMI window
		 * exiting" to inject it as soon as we can.
		 *
		 * We also check for STI_BLOCKING because some implementations
		 * don't allow NMI injection in this case. If we are running
		 * on a processor that doesn't have this restriction it will
		 * immediately exit and the NMI will be injected in the
		 * "NMI window exiting" handler.
		 */
		need_nmi_exiting = 1;
		gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
		if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
			info = vmcs_read(VMCS_ENTRY_INTR_INFO);
			if ((info & VMCS_INTR_VALID) == 0) {
				vmx_inject_nmi(vmx, vcpu);
				need_nmi_exiting = 0;
			} else {
				VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
				    "due to VM-entry intr info %#x", info);
			}
		} else {
			VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
			    "Guest Interruptibility-state %#x", gi);
		}

		if (need_nmi_exiting)
			vmx_set_nmi_window_exiting(vmx, vcpu);
	}

	extint_pending = vm_extint_pending(vmx->vm, vcpu);

	if (!extint_pending && virtual_interrupt_delivery) {
		vmx_inject_pir(vlapic);
		return;
	}

	/*
	 * If interrupt-window exiting is already in effect then don't bother
	 * checking for pending interrupts. This is just an optimization and
	 * not needed for correctness.
	 */
	if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
		VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
		    "pending int_window_exiting");
		return;
	}

	if (!extint_pending) {
		/* Ask the local apic for a vector to inject */
		if (!vlapic_pending_intr(vlapic, &vector))
			return;

		/*
		 * From the Intel SDM, Volume 3, Section "Maskable
		 * Hardware Interrupts":
		 * - maskable interrupt vectors [16,255] can be delivered
		 *   through the local APIC.
		 */
		KASSERT(vector >= 16 && vector <= 255,
		    ("invalid vector %d from local APIC", vector));
	} else {
		/* Ask the legacy pic for a vector to inject */
		vatpic_pending_intr(vmx->vm, &vector);

		/*
		 * From the Intel SDM, Volume 3, Section "Maskable
		 * Hardware Interrupts":
		 * - maskable interrupt vectors [0,255] can be delivered
		 *   through the INTR pin.
		 */
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from INTR", vector));
	}

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
	if ((rflags & PSL_I) == 0) {
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "rflags %#lx", vector, rflags);
		goto cantinject;
	}

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	if (gi & HWINTR_BLOCKING) {
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "Guest Interruptibility-state %#x", vector, gi);
		goto cantinject;
	}

	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
	if (info & VMCS_INTR_VALID) {
		/*
		 * This is expected and could happen for multiple reasons:
		 * - A vectoring VM-entry was aborted due to astpending
		 * - A VM-exit happened during event injection.
		 * - An exception was injected above.
		 * - An NMI was injected above or after "NMI window exiting"
		 */
		VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
		    "VM-entry intr info %#x", vector, info);
		goto cantinject;
	}

	/* Inject the interrupt */
	info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
	info |= vector;
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);

	if (!extint_pending) {
		/* Update the Local APIC ISR */
		vlapic_intr_accepted(vlapic, vector);
	} else {
		vm_extint_clear(vmx->vm, vcpu);
		vatpic_intr_accepted(vmx->vm, vector);

		/*
		 * After we accepted the current ExtINT the PIC may
		 * have posted another one.  If that is the case, set
		 * the Interrupt Window Exiting execution control so
		 * we can inject that one too.
		 *
		 * Also, interrupt window exiting allows us to inject any
		 * pending APIC vector that was preempted by the ExtINT
		 * as soon as possible. This applies both for the software
		 * emulated vlapic and the hardware assisted virtual APIC.
		 */
		vmx_set_int_window_exiting(vmx, vcpu);
	}

	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);
}

/*
 * If the Virtual NMIs execution control is '1' then the logical processor
 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
 * the VMCS.  An IRET instruction in VMX non-root operation will remove any
 * virtual-NMI blocking.
 *
 * This unblocking occurs even if the IRET causes a fault. In this case the
 * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
 */
static void
vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static void
vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static void
vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi __diagused;

	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
	    ("NMI blocking is not in effect %#x", gi));
}

static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	struct vmxctx *vmxctx;
	uint64_t xcrval;
	const struct xsave_limits *limits;

	vmxctx = &vmx->ctx[vcpu];
	limits = vmm_get_xsave_limits();

	/*
	 * Note that the processor raises a GP# fault on its own if
	 * xsetbv is executed for CPL != 0, so we do not have to
	 * emulate that fault here.
	 */

	/* Only xcr0 is supported. */
	if (vmxctx->guest_rcx != 0) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
		vm_inject_ud(vmx->vm, vcpu);
		return (HANDLED);
	}

	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
	if ((xcrval & ~limits->xcr0_allowed) != 0) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	if (!(xcrval & XFEATURE_ENABLED_X87)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/* AVX (YMM_Hi128) requires SSE. */
	if (xcrval & XFEATURE_ENABLED_AVX &&
	    (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
	 * ZMM_Hi256, and Hi16_ZMM.
	 */
	if (xcrval & XFEATURE_AVX512 &&
	    (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
	    (XFEATURE_AVX512 | XFEATURE_AVX)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * Intel MPX requires both bound register state flags to be
	 * set.
	 */
	if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
	    ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
		vm_inject_gp(vmx->vm, vcpu);
		return (HANDLED);
	}

	/*
	 * This runs "inside" vmrun() with the guest's FPU state, so
	 * modifying xcr0 directly modifies the guest's xcr0, not the
	 * host's.
	 */
	load_xcr(0, xcrval);
	return (HANDLED);
}

static uint64_t
vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
{
	const struct vmxctx *vmxctx;

	vmxctx = &vmx->ctx[vcpu];

	switch (ident) {
	case 0:
		return (vmxctx->guest_rax);
	case 1:
		return (vmxctx->guest_rcx);
	case 2:
		return (vmxctx->guest_rdx);
	case 3:
		return (vmxctx->guest_rbx);
	case 4:
		return (vmcs_read(VMCS_GUEST_RSP));
	case 5:
		return (vmxctx->guest_rbp);
	case 6:
		return (vmxctx->guest_rsi);
	case 7:
		return (vmxctx->guest_rdi);
	case 8:
		return (vmxctx->guest_r8);
	case 9:
		return (vmxctx->guest_r9);
	case 10:
		return (vmxctx->guest_r10);
	case 11:
		return (vmxctx->guest_r11);
	case 12:
		return (vmxctx->guest_r12);
	case 13:
		return (vmxctx->guest_r13);
	case 14:
		return (vmxctx->guest_r14);
	case 15:
		return (vmxctx->guest_r15);
	default:
		panic("invalid vmx register %d", ident);
	}
}

static void
vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
{
	struct vmxctx *vmxctx;

	vmxctx = &vmx->ctx[vcpu];

	switch (ident) {
	case 0:
		vmxctx->guest_rax = regval;
		break;
	case 1:
		vmxctx->guest_rcx = regval;
		break;
	case 2:
		vmxctx->guest_rdx = regval;
		break;
	case 3:
		vmxctx->guest_rbx = regval;
		break;
	case 4:
		vmcs_write(VMCS_GUEST_RSP, regval);
		break;
	case 5:
		vmxctx->guest_rbp = regval;
		break;
	case 6:
		vmxctx->guest_rsi = regval;
		break;
	case 7:
		vmxctx->guest_rdi = regval;
		break;
	case 8:
		vmxctx->guest_r8 = regval;
		break;
	case 9:
		vmxctx->guest_r9 = regval;
		break;
	case 10:
		vmxctx->guest_r10 = regval;
		break;
	case 11:
		vmxctx->guest_r11 = regval;
		break;
	case 12:
		vmxctx->guest_r12 = regval;
		break;
	case 13:
		vmxctx->guest_r13 = regval;
		break;
	case 14:
		vmxctx->guest_r14 = regval;
		break;
	case 15:
		vmxctx->guest_r15 = regval;
		break;
	default:
		panic("invalid vmx register %d", ident);
	}
}
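
/*
 * Note on the CR-access emulation below: the handlers decode the VM-exit
 * qualification as laid out in the SDM table "Exit Qualification for
 * Control-Register Accesses": bits 3:0 identify the control register,
 * bits 5:4 the access type (0 = MOV to CR, 1 = MOV from CR), and bits
 * 11:8 the general-purpose register operand, which is mapped to guest
 * state via vmx_get_guest_reg()/vmx_set_guest_reg() above.
 */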
1850 */ 1851 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1852 if (efer & EFER_LME) { 1853 efer |= EFER_LMA; 1854 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1855 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1856 entry_ctls |= VM_ENTRY_GUEST_LMA; 1857 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1858 } 1859 } 1860 1861 return (HANDLED); 1862 } 1863 1864 static int 1865 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1866 { 1867 uint64_t crval, regval; 1868 1869 /* We only handle mov to %cr4 at this time */ 1870 if ((exitqual & 0xf0) != 0x00) 1871 return (UNHANDLED); 1872 1873 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1874 1875 vmcs_write(VMCS_CR4_SHADOW, regval); 1876 1877 crval = regval | cr4_ones_mask; 1878 crval &= ~cr4_zeros_mask; 1879 vmcs_write(VMCS_GUEST_CR4, crval); 1880 1881 return (HANDLED); 1882 } 1883 1884 static int 1885 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1886 { 1887 struct vlapic *vlapic; 1888 uint64_t cr8; 1889 int regnum; 1890 1891 /* We only handle mov %cr8 to/from a register at this time. */ 1892 if ((exitqual & 0xe0) != 0x00) { 1893 return (UNHANDLED); 1894 } 1895 1896 vlapic = vm_lapic(vmx->vm, vcpu); 1897 regnum = (exitqual >> 8) & 0xf; 1898 if (exitqual & 0x10) { 1899 cr8 = vlapic_get_cr8(vlapic); 1900 vmx_set_guest_reg(vmx, vcpu, regnum, cr8); 1901 } else { 1902 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); 1903 vlapic_set_cr8(vlapic, cr8); 1904 } 1905 1906 return (HANDLED); 1907 } 1908 1909 /* 1910 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL 1911 */ 1912 static int 1913 vmx_cpl(void) 1914 { 1915 uint32_t ssar; 1916 1917 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); 1918 return ((ssar >> 5) & 0x3); 1919 } 1920 1921 static enum vm_cpu_mode 1922 vmx_cpu_mode(void) 1923 { 1924 uint32_t csar; 1925 1926 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { 1927 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1928 if (csar & 0x2000) 1929 return (CPU_MODE_64BIT); /* CS.L = 1 */ 1930 else 1931 return (CPU_MODE_COMPATIBILITY); 1932 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { 1933 return (CPU_MODE_PROTECTED); 1934 } else { 1935 return (CPU_MODE_REAL); 1936 } 1937 } 1938 1939 static enum vm_paging_mode 1940 vmx_paging_mode(void) 1941 { 1942 uint64_t cr4; 1943 1944 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1945 return (PAGING_MODE_FLAT); 1946 cr4 = vmcs_read(VMCS_GUEST_CR4); 1947 if (!(cr4 & CR4_PAE)) 1948 return (PAGING_MODE_32); 1949 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) { 1950 if (!(cr4 & CR4_LA57)) 1951 return (PAGING_MODE_64); 1952 return (PAGING_MODE_64_LA57); 1953 } else 1954 return (PAGING_MODE_PAE); 1955 } 1956 1957 static uint64_t 1958 inout_str_index(struct vmx *vmx, int vcpuid, int in) 1959 { 1960 uint64_t val; 1961 int error __diagused; 1962 enum vm_reg_name reg; 1963 1964 reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 1965 error = vmx_getreg(vmx, vcpuid, reg, &val); 1966 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 1967 return (val); 1968 } 1969 1970 static uint64_t 1971 inout_str_count(struct vmx *vmx, int vcpuid, int rep) 1972 { 1973 uint64_t val; 1974 int error __diagused; 1975 1976 if (rep) { 1977 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); 1978 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 1979 } else { 1980 val = 1; 1981 } 1982 return (val); 1983 } 1984 1985 static int 1986 inout_str_addrsize(uint32_t inst_info) 1987 { 1988 uint32_t size; 1989 1990 size = (inst_info >> 7) & 0x7; 1991 switch (size) { 1992 case 0: 1993 return (2); /* 16 bit */ 1994 case 1: 1995 return (4); /* 32 bit */ 1996 case 2: 1997 return (8); /* 64 bit */ 1998 default: 1999 panic("%s: invalid size encoding %d", __func__, size); 2000 } 2001 } 2002 2003 static void 2004 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, 2005 struct vm_inout_str *vis) 2006 { 2007 int error __diagused, s; 2008 2009 if (in) { 2010 vis->seg_name = VM_REG_GUEST_ES; 2011 } else { 2012 s = (inst_info >> 15) & 0x7; 2013 vis->seg_name = vm_segment_name(s); 2014 } 2015 2016 error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); 2017 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 2018 } 2019 2020 static void 2021 vmx_paging_info(struct vm_guest_paging *paging) 2022 { 2023 paging->cr3 = vmcs_guest_cr3(); 2024 paging->cpl = vmx_cpl(); 2025 paging->cpu_mode = vmx_cpu_mode(); 2026 paging->paging_mode = vmx_paging_mode(); 2027 } 2028 2029 static void 2030 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 2031 { 2032 struct vm_guest_paging *paging; 2033 uint32_t csar; 2034 2035 paging = &vmexit->u.inst_emul.paging; 2036 2037 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 2038 vmexit->inst_length = 0; 2039 vmexit->u.inst_emul.gpa = gpa; 2040 vmexit->u.inst_emul.gla = gla; 2041 vmx_paging_info(paging); 2042 switch (paging->cpu_mode) { 2043 case CPU_MODE_REAL: 2044 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2045 vmexit->u.inst_emul.cs_d = 0; 2046 break; 2047 case CPU_MODE_PROTECTED: 2048 case CPU_MODE_COMPATIBILITY: 2049 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2050 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 2051 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 2052 break; 2053 default: 2054 vmexit->u.inst_emul.cs_base = 0; 2055 vmexit->u.inst_emul.cs_d = 0; 2056 break; 2057 } 2058 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 2059 } 2060 2061 static int 2062 ept_fault_type(uint64_t ept_qual) 2063 { 2064 int fault_type; 2065 2066 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 2067 fault_type = VM_PROT_WRITE; 2068 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 2069 fault_type = VM_PROT_EXECUTE; 2070 else 2071 fault_type= VM_PROT_READ; 2072 2073 return (fault_type); 2074 } 2075 2076 static bool 2077 ept_emulation_fault(uint64_t ept_qual) 2078 { 2079 int read, write; 2080 2081 /* EPT fault on an instruction fetch doesn't make sense here */ 2082 if (ept_qual & EPT_VIOLATION_INST_FETCH) 2083 return (false); 2084 2085 /* EPT fault must be a read fault or a write fault */ 2086 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 2087 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 
1 : 0; 2088 if ((read | write) == 0) 2089 return (false); 2090 2091 /* 2092 * The EPT violation must have been caused by accessing a 2093 * guest-physical address that is a translation of a guest-linear 2094 * address. 2095 */ 2096 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 2097 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 2098 return (false); 2099 } 2100 2101 return (true); 2102 } 2103 2104 static __inline int 2105 apic_access_virtualization(struct vmx *vmx, int vcpuid) 2106 { 2107 uint32_t proc_ctls2; 2108 2109 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2110 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 2111 } 2112 2113 static __inline int 2114 x2apic_virtualization(struct vmx *vmx, int vcpuid) 2115 { 2116 uint32_t proc_ctls2; 2117 2118 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2119 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 2120 } 2121 2122 static int 2123 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 2124 uint64_t qual) 2125 { 2126 int error, handled, offset; 2127 uint32_t *apic_regs, vector; 2128 bool retu; 2129 2130 handled = HANDLED; 2131 offset = APIC_WRITE_OFFSET(qual); 2132 2133 if (!apic_access_virtualization(vmx, vcpuid)) { 2134 /* 2135 * In general there should not be any APIC write VM-exits 2136 * unless APIC-access virtualization is enabled. 2137 * 2138 * However self-IPI virtualization can legitimately trigger 2139 * an APIC-write VM-exit so treat it specially. 2140 */ 2141 if (x2apic_virtualization(vmx, vcpuid) && 2142 offset == APIC_OFFSET_SELF_IPI) { 2143 apic_regs = (uint32_t *)(vlapic->apic_page); 2144 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2145 vlapic_self_ipi_handler(vlapic, vector); 2146 return (HANDLED); 2147 } else 2148 return (UNHANDLED); 2149 } 2150 2151 switch (offset) { 2152 case APIC_OFFSET_ID: 2153 vlapic_id_write_handler(vlapic); 2154 break; 2155 case APIC_OFFSET_LDR: 2156 vlapic_ldr_write_handler(vlapic); 2157 break; 2158 case APIC_OFFSET_DFR: 2159 vlapic_dfr_write_handler(vlapic); 2160 break; 2161 case APIC_OFFSET_SVR: 2162 vlapic_svr_write_handler(vlapic); 2163 break; 2164 case APIC_OFFSET_ESR: 2165 vlapic_esr_write_handler(vlapic); 2166 break; 2167 case APIC_OFFSET_ICR_LOW: 2168 retu = false; 2169 error = vlapic_icrlo_write_handler(vlapic, &retu); 2170 if (error != 0 || retu) 2171 handled = UNHANDLED; 2172 break; 2173 case APIC_OFFSET_CMCI_LVT: 2174 case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: 2175 vlapic_lvt_write_handler(vlapic, offset); 2176 break; 2177 case APIC_OFFSET_TIMER_ICR: 2178 vlapic_icrtmr_write_handler(vlapic); 2179 break; 2180 case APIC_OFFSET_TIMER_DCR: 2181 vlapic_dcr_write_handler(vlapic); 2182 break; 2183 default: 2184 handled = UNHANDLED; 2185 break; 2186 } 2187 return (handled); 2188 } 2189 2190 static bool 2191 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2192 { 2193 2194 if (apic_access_virtualization(vmx, vcpuid) && 2195 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2196 return (true); 2197 else 2198 return (false); 2199 } 2200 2201 static int 2202 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2203 { 2204 uint64_t qual; 2205 int access_type, offset, allowed; 2206 2207 if (!apic_access_virtualization(vmx, vcpuid)) 2208 return (UNHANDLED); 2209 2210 qual = vmexit->u.vmx.exit_qualification; 2211 access_type = APIC_ACCESS_TYPE(qual); 2212 offset = APIC_ACCESS_OFFSET(qual); 2213 2214 allowed = 0; 2215 if (access_type == 0) { 2216 /* 2217 * Read data access to the following registers is expected. 2218 */ 2219 switch (offset) { 2220 case APIC_OFFSET_APR: 2221 case APIC_OFFSET_PPR: 2222 case APIC_OFFSET_RRR: 2223 case APIC_OFFSET_CMCI_LVT: 2224 case APIC_OFFSET_TIMER_CCR: 2225 allowed = 1; 2226 break; 2227 default: 2228 break; 2229 } 2230 } else if (access_type == 1) { 2231 /* 2232 * Write data access to the following registers is expected. 2233 */ 2234 switch (offset) { 2235 case APIC_OFFSET_VER: 2236 case APIC_OFFSET_APR: 2237 case APIC_OFFSET_PPR: 2238 case APIC_OFFSET_RRR: 2239 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2240 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2241 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2242 case APIC_OFFSET_CMCI_LVT: 2243 case APIC_OFFSET_TIMER_CCR: 2244 allowed = 1; 2245 break; 2246 default: 2247 break; 2248 } 2249 } 2250 2251 if (allowed) { 2252 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2253 VIE_INVALID_GLA); 2254 } 2255 2256 /* 2257 * Regardless of whether the APIC-access is allowed this handler 2258 * always returns UNHANDLED: 2259 * - if the access is allowed then it is handled by emulating the 2260 * instruction that caused the VM-exit (outside the critical section) 2261 * - if the access is not allowed then it will be converted to an 2262 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
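 * Returning UNHANDLED here therefore only means "not completed in this
 * handler"; for the allowed offsets the exit has been converted to
 * VM_EXITCODE_INST_EMUL above and is normally completed by the
 * in-kernel instruction emulation path rather than by the userland
 * hypervisor.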
2263 */ 2264 return (UNHANDLED); 2265 } 2266 2267 static enum task_switch_reason 2268 vmx_task_switch_reason(uint64_t qual) 2269 { 2270 int reason; 2271 2272 reason = (qual >> 30) & 0x3; 2273 switch (reason) { 2274 case 0: 2275 return (TSR_CALL); 2276 case 1: 2277 return (TSR_IRET); 2278 case 2: 2279 return (TSR_JMP); 2280 case 3: 2281 return (TSR_IDT_GATE); 2282 default: 2283 panic("%s: invalid reason %d", __func__, reason); 2284 } 2285 } 2286 2287 static int 2288 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2289 { 2290 int error; 2291 2292 if (lapic_msr(num)) 2293 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2294 else 2295 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2296 2297 return (error); 2298 } 2299 2300 static int 2301 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2302 { 2303 struct vmxctx *vmxctx; 2304 uint64_t result; 2305 uint32_t eax, edx; 2306 int error; 2307 2308 if (lapic_msr(num)) 2309 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2310 else 2311 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2312 2313 if (error == 0) { 2314 eax = result; 2315 vmxctx = &vmx->ctx[vcpuid]; 2316 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2317 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2318 2319 edx = result >> 32; 2320 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2321 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2322 } 2323 2324 return (error); 2325 } 2326 2327 static int 2328 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2329 { 2330 int error, errcode, errcode_valid, handled, in; 2331 struct vmxctx *vmxctx; 2332 struct vlapic *vlapic; 2333 struct vm_inout_str *vis; 2334 struct vm_task_switch *ts; 2335 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2336 uint32_t intr_type, intr_vec, reason; 2337 uint64_t exitintinfo, qual, gpa; 2338 bool retu; 2339 2340 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2341 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2342 2343 handled = UNHANDLED; 2344 vmxctx = &vmx->ctx[vcpu]; 2345 2346 qual = vmexit->u.vmx.exit_qualification; 2347 reason = vmexit->u.vmx.exit_reason; 2348 vmexit->exitcode = VM_EXITCODE_BOGUS; 2349 2350 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2351 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2352 2353 /* 2354 * VM-entry failures during or after loading guest state. 2355 * 2356 * These VM-exits are uncommon but must be handled specially 2357 * as most VM-exit fields are not populated as usual. 2358 */ 2359 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2360 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2361 __asm __volatile("int $18"); 2362 return (1); 2363 } 2364 2365 /* 2366 * VM exits that can be triggered during event delivery need to 2367 * be handled specially by re-injecting the event if the IDT 2368 * vectoring information field's valid bit is set. 2369 * 2370 * See "Information for VM Exits During Event Delivery" in Intel SDM 2371 * for details. 
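 * A typical example is an external interrupt whose delivery touches a
 * guest stack page that has no EPT mapping yet: the resulting EPT
 * violation reports the interrupt in the IDT-vectoring information, and
 * the code below records it with vm_exit_intinfo() so that it is
 * injected again on the next VM-entry.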
2372 */ 2373 idtvec_info = vmcs_idt_vectoring_info(); 2374 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2375 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2376 exitintinfo = idtvec_info; 2377 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2378 idtvec_err = vmcs_idt_vectoring_err(); 2379 exitintinfo |= (uint64_t)idtvec_err << 32; 2380 } 2381 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2382 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2383 __func__, error)); 2384 2385 /* 2386 * If 'virtual NMIs' are being used and the VM-exit 2387 * happened while injecting an NMI during the previous 2388 * VM-entry, then clear "blocking by NMI" in the 2389 * Guest Interruptibility-State so the NMI can be 2390 * reinjected on the subsequent VM-entry. 2391 * 2392 * However, if the NMI was being delivered through a task 2393 * gate, then the new task must start execution with NMIs 2394 * blocked so don't clear NMI blocking in this case. 2395 */ 2396 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2397 if (intr_type == VMCS_INTR_T_NMI) { 2398 if (reason != EXIT_REASON_TASK_SWITCH) 2399 vmx_clear_nmi_blocking(vmx, vcpu); 2400 else 2401 vmx_assert_nmi_blocking(vmx, vcpu); 2402 } 2403 2404 /* 2405 * Update VM-entry instruction length if the event being 2406 * delivered was a software interrupt or software exception. 2407 */ 2408 if (intr_type == VMCS_INTR_T_SWINTR || 2409 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2410 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2411 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2412 } 2413 } 2414 2415 switch (reason) { 2416 case EXIT_REASON_TASK_SWITCH: 2417 ts = &vmexit->u.task_switch; 2418 ts->tsssel = qual & 0xffff; 2419 ts->reason = vmx_task_switch_reason(qual); 2420 ts->ext = 0; 2421 ts->errcode_valid = 0; 2422 vmx_paging_info(&ts->paging); 2423 /* 2424 * If the task switch was due to a CALL, JMP, IRET, software 2425 * interrupt (INT n) or software exception (INT3, INTO), 2426 * then the saved %rip references the instruction that caused 2427 * the task switch. The instruction length field in the VMCS 2428 * is valid in this case. 2429 * 2430 * In all other cases (e.g., NMI, hardware exception) the 2431 * saved %rip is one that would have been saved in the old TSS 2432 * had the task switch completed normally so the instruction 2433 * length field is not needed in this case and is explicitly 2434 * set to 0. 2435 */ 2436 if (ts->reason == TSR_IDT_GATE) { 2437 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2438 ("invalid idtvec_info %#x for IDT task switch", 2439 idtvec_info)); 2440 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2441 if (intr_type != VMCS_INTR_T_SWINTR && 2442 intr_type != VMCS_INTR_T_SWEXCEPTION && 2443 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2444 /* Task switch triggered by external event */ 2445 ts->ext = 1; 2446 vmexit->inst_length = 0; 2447 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2448 ts->errcode_valid = 1; 2449 ts->errcode = vmcs_idt_vectoring_err(); 2450 } 2451 } 2452 } 2453 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2454 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2455 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2456 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2457 ts->ext ? 
"external" : "internal", 2458 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2459 break; 2460 case EXIT_REASON_CR_ACCESS: 2461 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2462 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2463 switch (qual & 0xf) { 2464 case 0: 2465 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2466 break; 2467 case 4: 2468 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2469 break; 2470 case 8: 2471 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2472 break; 2473 } 2474 break; 2475 case EXIT_REASON_RDMSR: 2476 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2477 retu = false; 2478 ecx = vmxctx->guest_rcx; 2479 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2480 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2481 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2482 if (error) { 2483 vmexit->exitcode = VM_EXITCODE_RDMSR; 2484 vmexit->u.msr.code = ecx; 2485 } else if (!retu) { 2486 handled = HANDLED; 2487 } else { 2488 /* Return to userspace with a valid exitcode */ 2489 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2490 ("emulate_rdmsr retu with bogus exitcode")); 2491 } 2492 break; 2493 case EXIT_REASON_WRMSR: 2494 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2495 retu = false; 2496 eax = vmxctx->guest_rax; 2497 ecx = vmxctx->guest_rcx; 2498 edx = vmxctx->guest_rdx; 2499 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2500 ecx, (uint64_t)edx << 32 | eax); 2501 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2502 (uint64_t)edx << 32 | eax); 2503 error = emulate_wrmsr(vmx, vcpu, ecx, 2504 (uint64_t)edx << 32 | eax, &retu); 2505 if (error) { 2506 vmexit->exitcode = VM_EXITCODE_WRMSR; 2507 vmexit->u.msr.code = ecx; 2508 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2509 } else if (!retu) { 2510 handled = HANDLED; 2511 } else { 2512 /* Return to userspace with a valid exitcode */ 2513 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2514 ("emulate_wrmsr retu with bogus exitcode")); 2515 } 2516 break; 2517 case EXIT_REASON_HLT: 2518 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2519 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2520 vmexit->exitcode = VM_EXITCODE_HLT; 2521 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2522 if (virtual_interrupt_delivery) 2523 vmexit->u.hlt.intr_status = 2524 vmcs_read(VMCS_GUEST_INTR_STATUS); 2525 else 2526 vmexit->u.hlt.intr_status = 0; 2527 break; 2528 case EXIT_REASON_MTF: 2529 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2530 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2531 vmexit->exitcode = VM_EXITCODE_MTRAP; 2532 vmexit->inst_length = 0; 2533 break; 2534 case EXIT_REASON_PAUSE: 2535 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 2536 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2537 vmexit->exitcode = VM_EXITCODE_PAUSE; 2538 break; 2539 case EXIT_REASON_INTR_WINDOW: 2540 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2541 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2542 vmx_clear_int_window_exiting(vmx, vcpu); 2543 return (1); 2544 case EXIT_REASON_EXT_INTR: 2545 /* 2546 * External interrupts serve only to cause VM exits and allow 2547 * the host interrupt handler to run. 2548 * 2549 * If this external interrupt triggers a virtual interrupt 2550 * to a VM, then that state will be recorded by the 2551 * host interrupt handler in the VM's softc. We will inject 2552 * this virtual interrupt during the subsequent VM enter. 
2553 */ 2554 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2555 SDT_PROBE4(vmm, vmx, exit, interrupt, 2556 vmx, vcpu, vmexit, intr_info); 2557 2558 /* 2559 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2560 * This appears to be a bug in VMware Fusion? 2561 */ 2562 if (!(intr_info & VMCS_INTR_VALID)) 2563 return (1); 2564 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2565 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2566 ("VM exit interruption info invalid: %#x", intr_info)); 2567 vmx_trigger_hostintr(intr_info & 0xff); 2568 2569 /* 2570 * This is special. We want to treat this as a 'handled' 2571 * VM-exit but not increment the instruction pointer. 2572 */ 2573 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2574 return (1); 2575 case EXIT_REASON_NMI_WINDOW: 2576 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2577 /* Exit to allow the pending virtual NMI to be injected */ 2578 if (vm_nmi_pending(vmx->vm, vcpu)) 2579 vmx_inject_nmi(vmx, vcpu); 2580 vmx_clear_nmi_window_exiting(vmx, vcpu); 2581 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2582 return (1); 2583 case EXIT_REASON_INOUT: 2584 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2585 vmexit->exitcode = VM_EXITCODE_INOUT; 2586 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2587 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2588 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2589 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 2590 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2591 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2592 if (vmexit->u.inout.string) { 2593 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2594 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2595 vis = &vmexit->u.inout_str; 2596 vmx_paging_info(&vis->paging); 2597 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2598 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2599 vis->index = inout_str_index(vmx, vcpu, in); 2600 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2601 vis->addrsize = inout_str_addrsize(inst_info); 2602 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2603 } 2604 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2605 break; 2606 case EXIT_REASON_CPUID: 2607 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2608 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2609 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2610 break; 2611 case EXIT_REASON_EXCEPTION: 2612 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2613 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2614 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2615 ("VM exit interruption info invalid: %#x", intr_info)); 2616 2617 intr_vec = intr_info & 0xff; 2618 intr_type = intr_info & VMCS_INTR_T_MASK; 2619 2620 /* 2621 * If Virtual NMIs control is 1 and the VM-exit is due to a 2622 * fault encountered during the execution of IRET then we must 2623 * restore the state of "virtual-NMI blocking" before resuming 2624 * the guest. 2625 * 2626 * See "Resuming Guest Software after Handling an Exception". 2627 * See "Information for VM Exits Due to Vectored Events". 2628 */ 2629 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2630 (intr_vec != IDT_DF) && 2631 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2632 vmx_restore_nmi_blocking(vmx, vcpu); 2633 2634 /* 2635 * The NMI has already been handled in vmx_exit_handle_nmi(). 2636 */ 2637 if (intr_type == VMCS_INTR_T_NMI) 2638 return (1); 2639 2640 /* 2641 * Call the machine check handler by hand. Also don't reflect 2642 * the machine check back into the guest.
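 * "int $18" vectors directly to the host's #MC handler (vector 18 is
 * IDT_MC), the same technique used for the VM-entry failure case above.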
2643 */ 2644 if (intr_vec == IDT_MC) { 2645 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2646 __asm __volatile("int $18"); 2647 return (1); 2648 } 2649 2650 /* 2651 * If the hypervisor has requested user exits for 2652 * debug exceptions, bounce them out to userland. 2653 */ 2654 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && 2655 (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { 2656 vmexit->exitcode = VM_EXITCODE_BPT; 2657 vmexit->u.bpt.inst_length = vmexit->inst_length; 2658 vmexit->inst_length = 0; 2659 break; 2660 } 2661 2662 if (intr_vec == IDT_PF) { 2663 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2664 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2665 __func__, error)); 2666 } 2667 2668 /* 2669 * Software exceptions exhibit trap-like behavior. This in 2670 * turn requires populating the VM-entry instruction length 2671 * so that the %rip in the trap frame is past the INT3/INTO 2672 * instruction. 2673 */ 2674 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2675 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2676 2677 /* Reflect all other exceptions back into the guest */ 2678 errcode_valid = errcode = 0; 2679 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2680 errcode_valid = 1; 2681 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2682 } 2683 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2684 "the guest", intr_vec, errcode); 2685 SDT_PROBE5(vmm, vmx, exit, exception, 2686 vmx, vcpu, vmexit, intr_vec, errcode); 2687 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2688 errcode_valid, errcode, 0); 2689 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2690 __func__, error)); 2691 return (1); 2692 2693 case EXIT_REASON_EPT_FAULT: 2694 /* 2695 * If 'gpa' lies within the address space allocated to 2696 * memory then this must be a nested page fault otherwise 2697 * this must be an instruction that accesses MMIO space. 2698 */ 2699 gpa = vmcs_gpa(); 2700 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2701 apic_access_fault(vmx, vcpu, gpa)) { 2702 vmexit->exitcode = VM_EXITCODE_PAGING; 2703 vmexit->inst_length = 0; 2704 vmexit->u.paging.gpa = gpa; 2705 vmexit->u.paging.fault_type = ept_fault_type(qual); 2706 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2707 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2708 vmx, vcpu, vmexit, gpa, qual); 2709 } else if (ept_emulation_fault(qual)) { 2710 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2711 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2712 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2713 vmx, vcpu, vmexit, gpa); 2714 } 2715 /* 2716 * If Virtual NMIs control is 1 and the VM-exit is due to an 2717 * EPT fault during the execution of IRET then we must restore 2718 * the state of "virtual-NMI blocking" before resuming. 2719 * 2720 * See description of "NMI unblocking due to IRET" in 2721 * "Exit Qualification for EPT Violations". 
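 * The exit qualification reports this condition via its "NMI unblocking
 * due to IRET" bit (EXIT_QUAL_NMIUDTI), which is what the check below
 * tests before re-asserting the blocking bit in the guest
 * interruptibility state.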
2722 */ 2723 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2724 (qual & EXIT_QUAL_NMIUDTI) != 0) 2725 vmx_restore_nmi_blocking(vmx, vcpu); 2726 break; 2727 case EXIT_REASON_VIRTUALIZED_EOI: 2728 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2729 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2730 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2731 vmexit->inst_length = 0; /* trap-like */ 2732 break; 2733 case EXIT_REASON_APIC_ACCESS: 2734 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2735 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2736 break; 2737 case EXIT_REASON_APIC_WRITE: 2738 /* 2739 * APIC-write VM exit is trap-like so the %rip is already 2740 * pointing to the next instruction. 2741 */ 2742 vmexit->inst_length = 0; 2743 vlapic = vm_lapic(vmx->vm, vcpu); 2744 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2745 vmx, vcpu, vmexit, vlapic); 2746 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2747 break; 2748 case EXIT_REASON_XSETBV: 2749 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2750 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2751 break; 2752 case EXIT_REASON_MONITOR: 2753 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2754 vmexit->exitcode = VM_EXITCODE_MONITOR; 2755 break; 2756 case EXIT_REASON_MWAIT: 2757 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2758 vmexit->exitcode = VM_EXITCODE_MWAIT; 2759 break; 2760 case EXIT_REASON_TPR: 2761 vlapic = vm_lapic(vmx->vm, vcpu); 2762 vlapic_sync_tpr(vlapic); 2763 vmexit->inst_length = 0; 2764 handled = HANDLED; 2765 break; 2766 case EXIT_REASON_VMCALL: 2767 case EXIT_REASON_VMCLEAR: 2768 case EXIT_REASON_VMLAUNCH: 2769 case EXIT_REASON_VMPTRLD: 2770 case EXIT_REASON_VMPTRST: 2771 case EXIT_REASON_VMREAD: 2772 case EXIT_REASON_VMRESUME: 2773 case EXIT_REASON_VMWRITE: 2774 case EXIT_REASON_VMXOFF: 2775 case EXIT_REASON_VMXON: 2776 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); 2777 vmexit->exitcode = VM_EXITCODE_VMINSN; 2778 break; 2779 default: 2780 SDT_PROBE4(vmm, vmx, exit, unknown, 2781 vmx, vcpu, vmexit, reason); 2782 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2783 break; 2784 } 2785 2786 if (handled) { 2787 /* 2788 * It is possible that control is returned to userland 2789 * even though we were able to handle the VM exit in the 2790 * kernel. 2791 * 2792 * In such a case we want to make sure that the userland 2793 * restarts guest execution at the instruction *after* 2794 * the one we just processed. Therefore we update the 2795 * guest rip in the VMCS and in 'vmexit'. 2796 */ 2797 vmexit->rip += vmexit->inst_length; 2798 vmexit->inst_length = 0; 2799 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2800 } else { 2801 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2802 /* 2803 * If this VM exit was not claimed by anybody then 2804 * treat it as a generic VMX exit. 2805 */ 2806 vmexit->exitcode = VM_EXITCODE_VMX; 2807 vmexit->u.vmx.status = VM_SUCCESS; 2808 vmexit->u.vmx.inst_type = 0; 2809 vmexit->u.vmx.inst_error = 0; 2810 } else { 2811 /* 2812 * The exitcode and collateral have been populated. 2813 * The VM exit will be processed further in userland. 
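 * (For example a VM_EXITCODE_INOUT for a port that is emulated by a
 * device model in the bhyve(8) process.)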
2814 */ 2815 } 2816 } 2817 2818 SDT_PROBE4(vmm, vmx, exit, return, 2819 vmx, vcpu, vmexit, handled); 2820 return (handled); 2821 } 2822 2823 static __inline void 2824 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2825 { 2826 2827 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2828 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2829 vmxctx->inst_fail_status)); 2830 2831 vmexit->inst_length = 0; 2832 vmexit->exitcode = VM_EXITCODE_VMX; 2833 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2834 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2835 vmexit->u.vmx.exit_reason = ~0; 2836 vmexit->u.vmx.exit_qualification = ~0; 2837 2838 switch (rc) { 2839 case VMX_VMRESUME_ERROR: 2840 case VMX_VMLAUNCH_ERROR: 2841 vmexit->u.vmx.inst_type = rc; 2842 break; 2843 default: 2844 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2845 } 2846 } 2847 2848 /* 2849 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2850 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2851 * sufficient to simply vector to the NMI handler via a software interrupt. 2852 * However, this must be done before maskable interrupts are enabled 2853 * otherwise the "iret" issued by an interrupt handler will incorrectly 2854 * clear NMI blocking. 2855 */ 2856 static __inline void 2857 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2858 { 2859 uint32_t intr_info; 2860 2861 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2862 2863 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2864 return; 2865 2866 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2867 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2868 ("VM exit interruption info invalid: %#x", intr_info)); 2869 2870 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2871 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2872 "to NMI has invalid vector: %#x", intr_info)); 2873 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2874 __asm __volatile("int $2"); 2875 } 2876 } 2877 2878 static __inline void 2879 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2880 { 2881 register_t rflags; 2882 2883 /* Save host control debug registers. */ 2884 vmxctx->host_dr7 = rdr7(); 2885 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2886 2887 /* 2888 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2889 * exceptions in the host based on the guest DRx values. The 2890 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2891 */ 2892 load_dr7(0); 2893 wrmsr(MSR_DEBUGCTLMSR, 0); 2894 2895 /* 2896 * Disable single stepping the kernel to avoid corrupting the 2897 * guest DR6. A debugger might still be able to corrupt the 2898 * guest DR6 by setting a breakpoint after this point and then 2899 * single stepping. 2900 */ 2901 rflags = read_rflags(); 2902 vmxctx->host_tf = rflags & PSL_T; 2903 write_rflags(rflags & ~PSL_T); 2904 2905 /* Save host debug registers. */ 2906 vmxctx->host_dr0 = rdr0(); 2907 vmxctx->host_dr1 = rdr1(); 2908 vmxctx->host_dr2 = rdr2(); 2909 vmxctx->host_dr3 = rdr3(); 2910 vmxctx->host_dr6 = rdr6(); 2911 2912 /* Restore guest debug registers. */ 2913 load_dr0(vmxctx->guest_dr0); 2914 load_dr1(vmxctx->guest_dr1); 2915 load_dr2(vmxctx->guest_dr2); 2916 load_dr3(vmxctx->guest_dr3); 2917 load_dr6(vmxctx->guest_dr6); 2918 } 2919 2920 static __inline void 2921 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2922 { 2923 2924 /* Save guest debug registers. 
*/ 2925 vmxctx->guest_dr0 = rdr0(); 2926 vmxctx->guest_dr1 = rdr1(); 2927 vmxctx->guest_dr2 = rdr2(); 2928 vmxctx->guest_dr3 = rdr3(); 2929 vmxctx->guest_dr6 = rdr6(); 2930 2931 /* 2932 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2933 * PSL_T last. 2934 */ 2935 load_dr0(vmxctx->host_dr0); 2936 load_dr1(vmxctx->host_dr1); 2937 load_dr2(vmxctx->host_dr2); 2938 load_dr3(vmxctx->host_dr3); 2939 load_dr6(vmxctx->host_dr6); 2940 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2941 load_dr7(vmxctx->host_dr7); 2942 write_rflags(read_rflags() | vmxctx->host_tf); 2943 } 2944 2945 static __inline void 2946 vmx_pmap_activate(struct vmx *vmx, pmap_t pmap) 2947 { 2948 long eptgen; 2949 int cpu; 2950 2951 cpu = curcpu; 2952 2953 CPU_SET_ATOMIC(cpu, &pmap->pm_active); 2954 smr_enter(pmap->pm_eptsmr); 2955 eptgen = atomic_load_long(&pmap->pm_eptgen); 2956 if (eptgen != vmx->eptgen[cpu]) { 2957 vmx->eptgen[cpu] = eptgen; 2958 invept(INVEPT_TYPE_SINGLE_CONTEXT, 2959 (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 }); 2960 } 2961 } 2962 2963 static __inline void 2964 vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap) 2965 { 2966 smr_exit(pmap->pm_eptsmr); 2967 CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); 2968 } 2969 2970 static int 2971 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2972 struct vm_eventinfo *evinfo) 2973 { 2974 int rc, handled, launched; 2975 struct vmx *vmx; 2976 struct vm *vm; 2977 struct vmxctx *vmxctx; 2978 struct vmcs *vmcs; 2979 struct vm_exit *vmexit; 2980 struct vlapic *vlapic; 2981 uint32_t exit_reason; 2982 struct region_descriptor gdtr, idtr; 2983 uint16_t ldt_sel; 2984 2985 vmx = arg; 2986 vm = vmx->vm; 2987 vmcs = &vmx->vmcs[vcpu]; 2988 vmxctx = &vmx->ctx[vcpu]; 2989 vlapic = vm_lapic(vm, vcpu); 2990 vmexit = vm_exitinfo(vm, vcpu); 2991 launched = 0; 2992 2993 KASSERT(vmxctx->pmap == pmap, 2994 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2995 2996 vmx_msr_guest_enter(vmx, vcpu); 2997 2998 VMPTRLD(vmcs); 2999 3000 /* 3001 * XXX 3002 * We do this every time because we may setup the virtual machine 3003 * from a different process than the one that actually runs it. 3004 * 3005 * If the life of a virtual machine was spent entirely in the context 3006 * of a single process we could do this once in vmx_init(). 3007 */ 3008 vmcs_write(VMCS_HOST_CR3, rcr3()); 3009 3010 vmcs_write(VMCS_GUEST_RIP, rip); 3011 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 3012 do { 3013 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 3014 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 3015 3016 handled = UNHANDLED; 3017 /* 3018 * Interrupts are disabled from this point on until the 3019 * guest starts executing. This is done for the following 3020 * reasons: 3021 * 3022 * If an AST is asserted on this thread after the check below, 3023 * then the IPI_AST notification will not be lost, because it 3024 * will cause a VM exit due to external interrupt as soon as 3025 * the guest state is loaded. 3026 * 3027 * A posted interrupt after 'vmx_inject_interrupts()' will 3028 * not be "lost" because it will be held pending in the host 3029 * APIC because interrupts are disabled. The pending interrupt 3030 * will be recognized as soon as the guest state is loaded. 3031 * 3032 * The same reasoning applies to the IPI generated by 3033 * pmap_invalidate_ept(). 
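 * In short, any notification that arrives after the checks below either
 * stays pending in the local APIC or forces an immediate VM-exit once
 * the guest is entered, so it cannot be lost in the window between the
 * checks and the actual VM-entry.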
3034 */ 3035 disable_intr(); 3036 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 3037 3038 /* 3039 * Check for vcpu suspension after injecting events because 3040 * vmx_inject_interrupts() can suspend the vcpu due to a 3041 * triple fault. 3042 */ 3043 if (vcpu_suspended(evinfo)) { 3044 enable_intr(); 3045 vm_exit_suspended(vmx->vm, vcpu, rip); 3046 break; 3047 } 3048 3049 if (vcpu_rendezvous_pending(evinfo)) { 3050 enable_intr(); 3051 vm_exit_rendezvous(vmx->vm, vcpu, rip); 3052 break; 3053 } 3054 3055 if (vcpu_reqidle(evinfo)) { 3056 enable_intr(); 3057 vm_exit_reqidle(vmx->vm, vcpu, rip); 3058 break; 3059 } 3060 3061 if (vcpu_should_yield(vm, vcpu)) { 3062 enable_intr(); 3063 vm_exit_astpending(vmx->vm, vcpu, rip); 3064 vmx_astpending_trace(vmx, vcpu, rip); 3065 handled = HANDLED; 3066 break; 3067 } 3068 3069 if (vcpu_debugged(vm, vcpu)) { 3070 enable_intr(); 3071 vm_exit_debug(vmx->vm, vcpu, rip); 3072 break; 3073 } 3074 3075 /* 3076 * If TPR Shadowing is enabled, the TPR Threshold 3077 * must be updated right before entering the guest. 3078 */ 3079 if (tpr_shadowing && !virtual_interrupt_delivery) { 3080 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { 3081 vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); 3082 } 3083 } 3084 3085 /* 3086 * VM exits restore the base address but not the 3087 * limits of GDTR and IDTR. The VMCS only stores the 3088 * base address, so VM exits set the limits to 0xffff. 3089 * Save and restore the full GDTR and IDTR to restore 3090 * the limits. 3091 * 3092 * The VMCS does not save the LDTR at all, and VM 3093 * exits clear LDTR as if a NULL selector were loaded. 3094 * The userspace hypervisor probably doesn't use a 3095 * LDT, but save and restore it to be safe. 3096 */ 3097 sgdt(&gdtr); 3098 sidt(&idtr); 3099 ldt_sel = sldt(); 3100 3101 /* 3102 * The TSC_AUX MSR must be saved/restored while interrupts 3103 * are disabled so that it is not possible for the guest 3104 * TSC_AUX MSR value to be overwritten by the resume 3105 * portion of the IPI_SUSPEND codepath. This is why the 3106 * transition of this MSR is handled separately from those 3107 * handled by vmx_msr_guest_{enter,exit}(), which are ok to 3108 * be transitioned with preemption disabled but interrupts 3109 * enabled. 3110 * 3111 * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be 3112 * anywhere in this loop so long as they happen with 3113 * interrupts disabled. This location is chosen for 3114 * simplicity. 3115 */ 3116 vmx_msr_guest_enter_tsc_aux(vmx, vcpu); 3117 3118 vmx_dr_enter_guest(vmxctx); 3119 3120 /* 3121 * Mark the EPT as active on this host CPU and invalidate 3122 * EPTP-tagged TLB entries if required. 
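 * vmx_pmap_activate() compares the pmap's EPT generation count with the
 * copy cached per-CPU in the softc and issues a single-context INVEPT
 * only when they differ, i.e. when the nested page tables were modified
 * since this CPU last entered the guest.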
3123 */ 3124 vmx_pmap_activate(vmx, pmap); 3125 3126 vmx_run_trace(vmx, vcpu); 3127 rc = vmx_enter_guest(vmxctx, vmx, launched); 3128 3129 vmx_pmap_deactivate(vmx, pmap); 3130 vmx_dr_leave_guest(vmxctx); 3131 vmx_msr_guest_exit_tsc_aux(vmx, vcpu); 3132 3133 bare_lgdt(&gdtr); 3134 lidt(&idtr); 3135 lldt(ldt_sel); 3136 3137 /* Collect some information for VM exit processing */ 3138 vmexit->rip = rip = vmcs_guest_rip(); 3139 vmexit->inst_length = vmexit_instruction_length(); 3140 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 3141 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 3142 3143 /* Update 'nextrip' */ 3144 vmx->state[vcpu].nextrip = rip; 3145 3146 if (rc == VMX_GUEST_VMEXIT) { 3147 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 3148 enable_intr(); 3149 handled = vmx_exit_process(vmx, vcpu, vmexit); 3150 } else { 3151 enable_intr(); 3152 vmx_exit_inst_error(vmxctx, rc, vmexit); 3153 } 3154 launched = 1; 3155 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 3156 rip = vmexit->rip; 3157 } while (handled); 3158 3159 /* 3160 * If a VM exit has been handled then the exitcode must be BOGUS 3161 * If a VM exit is not handled then the exitcode must not be BOGUS 3162 */ 3163 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 3164 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 3165 panic("Mismatch between handled (%d) and exitcode (%d)", 3166 handled, vmexit->exitcode); 3167 } 3168 3169 if (!handled) 3170 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 3171 3172 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 3173 vmexit->exitcode); 3174 3175 VMCLEAR(vmcs); 3176 vmx_msr_guest_exit(vmx, vcpu); 3177 3178 return (0); 3179 } 3180 3181 static void 3182 vmx_cleanup(void *arg) 3183 { 3184 int i; 3185 struct vmx *vmx = arg; 3186 uint16_t maxcpus; 3187 3188 if (apic_access_virtualization(vmx, 0)) 3189 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3190 3191 maxcpus = vm_get_maxcpus(vmx->vm); 3192 for (i = 0; i < maxcpus; i++) 3193 vpid_free(vmx->state[i].vpid); 3194 3195 free(vmx, M_VMX); 3196 3197 return; 3198 } 3199 3200 static register_t * 3201 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 3202 { 3203 3204 switch (reg) { 3205 case VM_REG_GUEST_RAX: 3206 return (&vmxctx->guest_rax); 3207 case VM_REG_GUEST_RBX: 3208 return (&vmxctx->guest_rbx); 3209 case VM_REG_GUEST_RCX: 3210 return (&vmxctx->guest_rcx); 3211 case VM_REG_GUEST_RDX: 3212 return (&vmxctx->guest_rdx); 3213 case VM_REG_GUEST_RSI: 3214 return (&vmxctx->guest_rsi); 3215 case VM_REG_GUEST_RDI: 3216 return (&vmxctx->guest_rdi); 3217 case VM_REG_GUEST_RBP: 3218 return (&vmxctx->guest_rbp); 3219 case VM_REG_GUEST_R8: 3220 return (&vmxctx->guest_r8); 3221 case VM_REG_GUEST_R9: 3222 return (&vmxctx->guest_r9); 3223 case VM_REG_GUEST_R10: 3224 return (&vmxctx->guest_r10); 3225 case VM_REG_GUEST_R11: 3226 return (&vmxctx->guest_r11); 3227 case VM_REG_GUEST_R12: 3228 return (&vmxctx->guest_r12); 3229 case VM_REG_GUEST_R13: 3230 return (&vmxctx->guest_r13); 3231 case VM_REG_GUEST_R14: 3232 return (&vmxctx->guest_r14); 3233 case VM_REG_GUEST_R15: 3234 return (&vmxctx->guest_r15); 3235 case VM_REG_GUEST_CR2: 3236 return (&vmxctx->guest_cr2); 3237 case VM_REG_GUEST_DR0: 3238 return (&vmxctx->guest_dr0); 3239 case VM_REG_GUEST_DR1: 3240 return (&vmxctx->guest_dr1); 3241 case VM_REG_GUEST_DR2: 3242 return (&vmxctx->guest_dr2); 3243 case VM_REG_GUEST_DR3: 3244 return (&vmxctx->guest_dr3); 3245 case VM_REG_GUEST_DR6: 3246 return (&vmxctx->guest_dr6); 3247 default: 3248 break; 3249 } 3250 return 
(NULL); 3251 } 3252 3253 static int 3254 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 3255 { 3256 register_t *regp; 3257 3258 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3259 *retval = *regp; 3260 return (0); 3261 } else 3262 return (EINVAL); 3263 } 3264 3265 static int 3266 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3267 { 3268 register_t *regp; 3269 3270 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3271 *regp = val; 3272 return (0); 3273 } else 3274 return (EINVAL); 3275 } 3276 3277 static int 3278 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3279 { 3280 uint64_t gi; 3281 int error; 3282 3283 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3284 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3285 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3286 return (error); 3287 } 3288 3289 static int 3290 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3291 { 3292 struct vmcs *vmcs; 3293 uint64_t gi; 3294 int error, ident; 3295 3296 /* 3297 * Forcing the vcpu into an interrupt shadow is not supported. 3298 */ 3299 if (val) { 3300 error = EINVAL; 3301 goto done; 3302 } 3303 3304 vmcs = &vmx->vmcs[vcpu]; 3305 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3306 error = vmcs_getreg(vmcs, running, ident, &gi); 3307 if (error == 0) { 3308 gi &= ~HWINTR_BLOCKING; 3309 error = vmcs_setreg(vmcs, running, ident, gi); 3310 } 3311 done: 3312 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3313 error ? "failed" : "succeeded"); 3314 return (error); 3315 } 3316 3317 static int 3318 vmx_shadow_reg(int reg) 3319 { 3320 int shreg; 3321 3322 shreg = -1; 3323 3324 switch (reg) { 3325 case VM_REG_GUEST_CR0: 3326 shreg = VMCS_CR0_SHADOW; 3327 break; 3328 case VM_REG_GUEST_CR4: 3329 shreg = VMCS_CR4_SHADOW; 3330 break; 3331 default: 3332 break; 3333 } 3334 3335 return (shreg); 3336 } 3337 3338 static int 3339 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3340 { 3341 int running, hostcpu; 3342 struct vmx *vmx = arg; 3343 3344 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3345 if (running && hostcpu != curcpu) 3346 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3347 3348 if (reg == VM_REG_GUEST_INTR_SHADOW) 3349 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3350 3351 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3352 return (0); 3353 3354 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3355 } 3356 3357 static int 3358 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3359 { 3360 int error, hostcpu, running, shadow; 3361 uint64_t ctls; 3362 pmap_t pmap; 3363 struct vmx *vmx = arg; 3364 3365 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3366 if (running && hostcpu != curcpu) 3367 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3368 3369 if (reg == VM_REG_GUEST_INTR_SHADOW) 3370 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3371 3372 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3373 return (0); 3374 3375 /* Do not permit user write access to VMCS fields by offset. */ 3376 if (reg < 0) 3377 return (EINVAL); 3378 3379 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3380 3381 if (error == 0) { 3382 /* 3383 * If the "load EFER" VM-entry control is 1 then the 3384 * value of EFER.LMA must be identical to "IA-32e mode guest" 3385 * bit in the VM-entry control. 
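 * Otherwise a VM-entry consistency check would fail.  This matters when
 * userland writes EFER directly (e.g. bhyveload(8) starting a 64-bit
 * guest, or the snapshot restore path) instead of the guest reaching
 * long mode through the CR0 emulation above.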
3386 */ 3387 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3388 (reg == VM_REG_GUEST_EFER)) { 3389 vmcs_getreg(&vmx->vmcs[vcpu], running, 3390 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3391 if (val & EFER_LMA) 3392 ctls |= VM_ENTRY_GUEST_LMA; 3393 else 3394 ctls &= ~VM_ENTRY_GUEST_LMA; 3395 vmcs_setreg(&vmx->vmcs[vcpu], running, 3396 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3397 } 3398 3399 shadow = vmx_shadow_reg(reg); 3400 if (shadow > 0) { 3401 /* 3402 * Store the unmodified value in the shadow 3403 */ 3404 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3405 VMCS_IDENT(shadow), val); 3406 } 3407 3408 if (reg == VM_REG_GUEST_CR3) { 3409 /* 3410 * Invalidate the guest vcpu's TLB mappings to emulate 3411 * the behavior of updating %cr3. 3412 * 3413 * XXX the processor retains global mappings when %cr3 3414 * is updated but vmx_invvpid() does not. 3415 */ 3416 pmap = vmx->ctx[vcpu].pmap; 3417 vmx_invvpid(vmx, vcpu, pmap, running); 3418 } 3419 } 3420 3421 return (error); 3422 } 3423 3424 static int 3425 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3426 { 3427 int hostcpu, running; 3428 struct vmx *vmx = arg; 3429 3430 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3431 if (running && hostcpu != curcpu) 3432 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3433 3434 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3435 } 3436 3437 static int 3438 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3439 { 3440 int hostcpu, running; 3441 struct vmx *vmx = arg; 3442 3443 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3444 if (running && hostcpu != curcpu) 3445 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3446 3447 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3448 } 3449 3450 static int 3451 vmx_getcap(void *arg, int vcpu, int type, int *retval) 3452 { 3453 struct vmx *vmx = arg; 3454 int vcap; 3455 int ret; 3456 3457 ret = ENOENT; 3458 3459 vcap = vmx->cap[vcpu].set; 3460 3461 switch (type) { 3462 case VM_CAP_HALT_EXIT: 3463 if (cap_halt_exit) 3464 ret = 0; 3465 break; 3466 case VM_CAP_PAUSE_EXIT: 3467 if (cap_pause_exit) 3468 ret = 0; 3469 break; 3470 case VM_CAP_MTRAP_EXIT: 3471 if (cap_monitor_trap) 3472 ret = 0; 3473 break; 3474 case VM_CAP_RDPID: 3475 if (cap_rdpid) 3476 ret = 0; 3477 break; 3478 case VM_CAP_RDTSCP: 3479 if (cap_rdtscp) 3480 ret = 0; 3481 break; 3482 case VM_CAP_UNRESTRICTED_GUEST: 3483 if (cap_unrestricted_guest) 3484 ret = 0; 3485 break; 3486 case VM_CAP_ENABLE_INVPCID: 3487 if (cap_invpcid) 3488 ret = 0; 3489 break; 3490 case VM_CAP_BPT_EXIT: 3491 ret = 0; 3492 break; 3493 default: 3494 break; 3495 } 3496 3497 if (ret == 0) 3498 *retval = (vcap & (1 << type)) ? 
1 : 0; 3499 3500 return (ret); 3501 } 3502 3503 static int 3504 vmx_setcap(void *arg, int vcpu, int type, int val) 3505 { 3506 struct vmx *vmx = arg; 3507 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3508 uint32_t baseval; 3509 uint32_t *pptr; 3510 int error; 3511 int flag; 3512 int reg; 3513 int retval; 3514 3515 retval = ENOENT; 3516 pptr = NULL; 3517 3518 switch (type) { 3519 case VM_CAP_HALT_EXIT: 3520 if (cap_halt_exit) { 3521 retval = 0; 3522 pptr = &vmx->cap[vcpu].proc_ctls; 3523 baseval = *pptr; 3524 flag = PROCBASED_HLT_EXITING; 3525 reg = VMCS_PRI_PROC_BASED_CTLS; 3526 } 3527 break; 3528 case VM_CAP_MTRAP_EXIT: 3529 if (cap_monitor_trap) { 3530 retval = 0; 3531 pptr = &vmx->cap[vcpu].proc_ctls; 3532 baseval = *pptr; 3533 flag = PROCBASED_MTF; 3534 reg = VMCS_PRI_PROC_BASED_CTLS; 3535 } 3536 break; 3537 case VM_CAP_PAUSE_EXIT: 3538 if (cap_pause_exit) { 3539 retval = 0; 3540 pptr = &vmx->cap[vcpu].proc_ctls; 3541 baseval = *pptr; 3542 flag = PROCBASED_PAUSE_EXITING; 3543 reg = VMCS_PRI_PROC_BASED_CTLS; 3544 } 3545 break; 3546 case VM_CAP_RDPID: 3547 case VM_CAP_RDTSCP: 3548 if (cap_rdpid || cap_rdtscp) 3549 /* 3550 * Choose not to support enabling/disabling 3551 * RDPID/RDTSCP via libvmmapi since, as per the 3552 * discussion in vmx_modinit(), RDPID/RDTSCP are 3553 * either always enabled or always disabled. 3554 */ 3555 error = EOPNOTSUPP; 3556 break; 3557 case VM_CAP_UNRESTRICTED_GUEST: 3558 if (cap_unrestricted_guest) { 3559 retval = 0; 3560 pptr = &vmx->cap[vcpu].proc_ctls2; 3561 baseval = *pptr; 3562 flag = PROCBASED2_UNRESTRICTED_GUEST; 3563 reg = VMCS_SEC_PROC_BASED_CTLS; 3564 } 3565 break; 3566 case VM_CAP_ENABLE_INVPCID: 3567 if (cap_invpcid) { 3568 retval = 0; 3569 pptr = &vmx->cap[vcpu].proc_ctls2; 3570 baseval = *pptr; 3571 flag = PROCBASED2_ENABLE_INVPCID; 3572 reg = VMCS_SEC_PROC_BASED_CTLS; 3573 } 3574 break; 3575 case VM_CAP_BPT_EXIT: 3576 retval = 0; 3577 3578 /* Don't change the bitmap if we are tracing all exceptions. */ 3579 if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { 3580 pptr = &vmx->cap[vcpu].exc_bitmap; 3581 baseval = *pptr; 3582 flag = (1 << IDT_BP); 3583 reg = VMCS_EXCEPTION_BITMAP; 3584 } 3585 break; 3586 default: 3587 break; 3588 } 3589 3590 if (retval) 3591 return (retval); 3592 3593 if (pptr != NULL) { 3594 if (val) { 3595 baseval |= flag; 3596 } else { 3597 baseval &= ~flag; 3598 } 3599 VMPTRLD(vmcs); 3600 error = vmwrite(reg, baseval); 3601 VMCLEAR(vmcs); 3602 3603 if (error) 3604 return (error); 3605 3606 /* 3607 * Update optional stored flags, and record 3608 * setting 3609 */ 3610 *pptr = baseval; 3611 } 3612 3613 if (val) { 3614 vmx->cap[vcpu].set |= (1 << type); 3615 } else { 3616 vmx->cap[vcpu].set &= ~(1 << type); 3617 } 3618 3619 return (0); 3620 } 3621 3622 static struct vmspace * 3623 vmx_vmspace_alloc(vm_offset_t min, vm_offset_t max) 3624 { 3625 return (ept_vmspace_alloc(min, max)); 3626 } 3627 3628 static void 3629 vmx_vmspace_free(struct vmspace *vmspace) 3630 { 3631 ept_vmspace_free(vmspace); 3632 } 3633 3634 struct vlapic_vtx { 3635 struct vlapic vlapic; 3636 struct pir_desc *pir_desc; 3637 struct vmx *vmx; 3638 u_int pending_prio; 3639 }; 3640 3641 #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) 3642 3643 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3644 do { \ 3645 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3646 level ? 
"level" : "edge", vector); \ 3647 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3648 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3649 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3650 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3651 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 3652 } while (0) 3653 3654 /* 3655 * vlapic->ops handlers that utilize the APICv hardware assist described in 3656 * Chapter 29 of the Intel SDM. 3657 */ 3658 static int 3659 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3660 { 3661 struct vlapic_vtx *vlapic_vtx; 3662 struct pir_desc *pir_desc; 3663 uint64_t mask; 3664 int idx, notify = 0; 3665 3666 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3667 pir_desc = vlapic_vtx->pir_desc; 3668 3669 /* 3670 * Keep track of interrupt requests in the PIR descriptor. This is 3671 * because the virtual APIC page pointed to by the VMCS cannot be 3672 * modified if the vcpu is running. 3673 */ 3674 idx = vector / 64; 3675 mask = 1UL << (vector % 64); 3676 atomic_set_long(&pir_desc->pir[idx], mask); 3677 3678 /* 3679 * A notification is required whenever the 'pending' bit makes a 3680 * transition from 0->1. 3681 * 3682 * Even if the 'pending' bit is already asserted, notification about 3683 * the incoming interrupt may still be necessary. For example, if a 3684 * vCPU is HLTed with a high PPR, a low priority interrupt would cause 3685 * the 0->1 'pending' transition with a notification, but the vCPU 3686 * would ignore the interrupt for the time being. The same vCPU would 3687 * need to then be notified if a high-priority interrupt arrived which 3688 * satisfied the PPR. 3689 * 3690 * The priorities of interrupts injected while 'pending' is asserted 3691 * are tracked in a custom bitfield 'pending_prio'. Should the 3692 * to-be-injected interrupt exceed the priorities already present, the 3693 * notification is sent. The priorities recorded in 'pending_prio' are 3694 * cleared whenever the 'pending' bit makes another 0->1 transition. 3695 */ 3696 if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { 3697 notify = 1; 3698 vlapic_vtx->pending_prio = 0; 3699 } else { 3700 const u_int old_prio = vlapic_vtx->pending_prio; 3701 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); 3702 3703 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { 3704 atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); 3705 notify = 1; 3706 } 3707 } 3708 3709 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3710 level, "vmx_set_intr_ready"); 3711 return (notify); 3712 } 3713 3714 static int 3715 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3716 { 3717 struct vlapic_vtx *vlapic_vtx; 3718 struct pir_desc *pir_desc; 3719 struct LAPIC *lapic; 3720 uint64_t pending, pirval; 3721 uint32_t ppr, vpr; 3722 int i; 3723 3724 /* 3725 * This function is only expected to be called from the 'HLT' exit 3726 * handler which does not care about the vector that is pending. 3727 */ 3728 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3729 3730 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3731 pir_desc = vlapic_vtx->pir_desc; 3732 3733 pending = atomic_load_acq_long(&pir_desc->pending); 3734 if (!pending) { 3735 /* 3736 * While a virtual interrupt may have already been 3737 * processed the actual delivery maybe pending the 3738 * interruptibility of the guest. 
Recognize a pending 3739 * interrupt by reevaluating virtual interrupts 3740 * following Section 29.2.1 in the Intel SDM Volume 3. 3741 */ 3742 struct vm_exit *vmexit; 3743 uint8_t rvi, ppr; 3744 3745 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3746 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3747 ("vmx_pending_intr: exitcode not 'HLT'")); 3748 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3749 lapic = vlapic->apic_page; 3750 ppr = lapic->ppr & APIC_TPR_INT; 3751 if (rvi > ppr) { 3752 return (1); 3753 } 3754 3755 return (0); 3756 } 3757 3758 /* 3759 * If there is an interrupt pending then it will be recognized only 3760 * if its priority is greater than the processor priority. 3761 * 3762 * Special case: if the processor priority is zero then any pending 3763 * interrupt will be recognized. 3764 */ 3765 lapic = vlapic->apic_page; 3766 ppr = lapic->ppr & APIC_TPR_INT; 3767 if (ppr == 0) 3768 return (1); 3769 3770 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3771 lapic->ppr); 3772 3773 vpr = 0; 3774 for (i = 3; i >= 0; i--) { 3775 pirval = pir_desc->pir[i]; 3776 if (pirval != 0) { 3777 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3778 break; 3779 } 3780 } 3781 3782 /* 3783 * If the highest-priority pending interrupt falls short of the 3784 * processor priority of this vCPU, ensure that 'pending_prio' does not 3785 * have any stale bits which would preclude a higher-priority interrupt 3786 * from incurring a notification later. 3787 */ 3788 if (vpr <= ppr) { 3789 const u_int prio_bit = VPR_PRIO_BIT(vpr); 3790 const u_int old = vlapic_vtx->pending_prio; 3791 3792 if (old > prio_bit && (old & prio_bit) == 0) { 3793 vlapic_vtx->pending_prio = prio_bit; 3794 } 3795 return (0); 3796 } 3797 return (1); 3798 } 3799 3800 static void 3801 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3802 { 3803 3804 panic("vmx_intr_accepted: not expected to be called"); 3805 } 3806 3807 static void 3808 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3809 { 3810 struct vlapic_vtx *vlapic_vtx; 3811 struct vmx *vmx; 3812 struct vmcs *vmcs; 3813 uint64_t mask, val; 3814 3815 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3816 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3817 ("vmx_set_tmr: vcpu cannot be running")); 3818 3819 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3820 vmx = vlapic_vtx->vmx; 3821 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3822 mask = 1UL << (vector % 64); 3823 3824 VMPTRLD(vmcs); 3825 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3826 if (level) 3827 val |= mask; 3828 else 3829 val &= ~mask; 3830 vmcs_write(VMCS_EOI_EXIT(vector), val); 3831 VMCLEAR(vmcs); 3832 } 3833 3834 static void 3835 vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) 3836 { 3837 struct vmx *vmx; 3838 struct vmcs *vmcs; 3839 uint32_t proc_ctls; 3840 int vcpuid; 3841 3842 vcpuid = vlapic->vcpuid; 3843 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3844 vmcs = &vmx->vmcs[vcpuid]; 3845 3846 proc_ctls = vmx->cap[vcpuid].proc_ctls; 3847 proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; 3848 proc_ctls |= PROCBASED_CR8_LOAD_EXITING; 3849 proc_ctls |= PROCBASED_CR8_STORE_EXITING; 3850 vmx->cap[vcpuid].proc_ctls = proc_ctls; 3851 3852 VMPTRLD(vmcs); 3853 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); 3854 VMCLEAR(vmcs); 3855 } 3856 3857 static void 3858 vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) 3859 { 3860 struct vmx *vmx; 3861 struct vmcs *vmcs; 3862 uint32_t proc_ctls2; 3863 int vcpuid, error __diagused; 3864 3865 vcpuid = vlapic->vcpuid; 3866 vmx = ((struct 
static void
vmx_enable_x2apic_mode_vid(struct vlapic *vlapic)
{
	struct vmx *vmx;
	struct vmcs *vmcs;
	uint32_t proc_ctls2;
	int vcpuid, error __diagused;

	vcpuid = vlapic->vcpuid;
	vmx = ((struct vlapic_vtx *)vlapic)->vmx;
	vmcs = &vmx->vmcs[vcpuid];

	proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
	KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
	    ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));

	proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
	proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
	vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;

	VMPTRLD(vmcs);
	vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
	VMCLEAR(vmcs);

	if (vlapic->vcpuid == 0) {
		/*
		 * The nested page table mappings are shared by all vcpus
		 * so unmap the APIC access page just once.
		 */
		error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
		KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
		    __func__, error));

		/*
		 * The MSR bitmap is shared by all vcpus so modify it only
		 * once in the context of vcpu 0.
		 */
		error = vmx_allow_x2apic_msrs(vmx);
		KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
		    __func__, error));
	}
}

static void
vmx_post_intr(struct vlapic *vlapic, int hostcpu)
{

	ipi_cpu(hostcpu, pirvec);
}

/*
 * Transfer the pending interrupts in the PIR descriptor to the IRR
 * in the virtual APIC page.
 */
static void
vmx_inject_pir(struct vlapic *vlapic)
{
	struct vlapic_vtx *vlapic_vtx;
	struct pir_desc *pir_desc;
	struct LAPIC *lapic;
	uint64_t val, pirval;
	int rvi, pirbase = -1;
	uint16_t intr_status_old, intr_status_new;

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	pir_desc = vlapic_vtx->pir_desc;
	if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
		VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
		    "no posted interrupt pending");
		return;
	}

	pirval = 0;
	pirbase = -1;
	lapic = vlapic->apic_page;

	val = atomic_readandclear_long(&pir_desc->pir[0]);
	if (val != 0) {
		lapic->irr0 |= val;
		lapic->irr1 |= val >> 32;
		pirbase = 0;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[1]);
	if (val != 0) {
		lapic->irr2 |= val;
		lapic->irr3 |= val >> 32;
		pirbase = 64;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[2]);
	if (val != 0) {
		lapic->irr4 |= val;
		lapic->irr5 |= val >> 32;
		pirbase = 128;
		pirval = val;
	}

	val = atomic_readandclear_long(&pir_desc->pir[3]);
	if (val != 0) {
		lapic->irr6 |= val;
		lapic->irr7 |= val >> 32;
		pirbase = 192;
		pirval = val;
	}

	VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");

	/*
	 * Update RVI so the processor can evaluate pending virtual
	 * interrupts on VM-entry.
	 *
	 * It is possible for pirval to be 0 here, even though the
	 * pending bit has been set. The scenario is:
	 * CPU-Y is sending a posted interrupt to CPU-X, which
	 * is running a guest and processing posted interrupts in h/w.
	 * CPU-X will eventually exit and the state seen in s/w is
	 * the pending bit set, but no PIR bits set.
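	 * In that case there is nothing left to inject and the RVI update
	 * below is skipped.  The interleaving looks like this: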
	 *
	 *              CPU-X                      CPU-Y
	 *          (vm running)                (host running)
	 *
	 *      rx posted interrupt
	 *      CLEAR pending bit
	 *                                      SET PIR bit
	 *      READ/CLEAR PIR bits
	 *                                      SET pending bit
	 *      (vm exit)
	 *      pending bit set, PIR 0
	 */
	if (pirval != 0) {
		rvi = pirbase + flsl(pirval) - 1;
		intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
		intr_status_new = (intr_status_old & 0xFF00) | rvi;
		if (intr_status_new > intr_status_old) {
			vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
			VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
			    "guest_intr_status changed from 0x%04x to 0x%04x",
			    intr_status_old, intr_status_new);
		}
	}
}

static struct vlapic *
vmx_vlapic_init(void *arg, int vcpuid)
{
	struct vmx *vmx;
	struct vlapic *vlapic;
	struct vlapic_vtx *vlapic_vtx;

	vmx = arg;

	vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
	vlapic->vm = vmx->vm;
	vlapic->vcpuid = vcpuid;
	vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];

	vlapic_vtx = (struct vlapic_vtx *)vlapic;
	vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
	vlapic_vtx->vmx = vmx;

	if (tpr_shadowing) {
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
	}

	if (virtual_interrupt_delivery) {
		vlapic->ops.set_intr_ready = vmx_set_intr_ready;
		vlapic->ops.pending_intr = vmx_pending_intr;
		vlapic->ops.intr_accepted = vmx_intr_accepted;
		vlapic->ops.set_tmr = vmx_set_tmr;
		vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;
	}

	if (posted_interrupts)
		vlapic->ops.post_intr = vmx_post_intr;

	vlapic_init(vlapic);

	return (vlapic);
}

static void
vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{

	vlapic_cleanup(vlapic);
	free(vlapic, M_VLAPIC);
}

#ifdef BHYVE_SNAPSHOT
static int
vmx_snapshot(void *arg, struct vm_snapshot_meta *meta)
{
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	int i;
	int ret;

	vmx = arg;

	KASSERT(vmx != NULL, ("%s: arg was NULL", __func__));

	for (i = 0; i < VM_MAXCPU; i++) {
		SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i],
		    sizeof(vmx->guest_msrs[i]), meta, ret, done);

		vmxctx = &vmx->ctx[i];
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, ret, done);
	}

done:
	return (ret);
}

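/*
 * Save or restore the VMCS-resident state of a single vcpu.  The vcpu must
 * not be running on another host cpu, since its VMCS could not then be
 * loaded safely here.
 */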
static int
vmx_vmcx_snapshot(void *arg, struct vm_snapshot_meta *meta, int vcpu)
{
	struct vmcs *vmcs;
	struct vmx *vmx;
	int err, run, hostcpu;

	vmx = (struct vmx *)arg;
	err = 0;

	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
	vmcs = &vmx->vmcs[vcpu];

	run = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (run && hostcpu != curcpu) {
		printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu);
		return (EINVAL);
	}

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta);

	/* Guest segments */
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta);

	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta);

	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta);
	err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta);

	/* Guest page tables */
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta);
	err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta);

	/* Other guest state */
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta);
	err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta);

	return (err);
}

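/*
 * Reapply the saved TSC offset to a vcpu's VMCS when a snapshot is restored.
 * The VMCS is loaded and cleared around the update if the vcpu is not
 * currently running.
 */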
static int
vmx_restore_tsc(void *arg, int vcpu, uint64_t offset)
{
	struct vmcs *vmcs;
	struct vmx *vmx = (struct vmx *)arg;
	int error, running, hostcpu;

	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
	vmcs = &vmx->vmcs[vcpu];

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu) {
		printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu);
		return (EINVAL);
	}

	if (!running)
		VMPTRLD(vmcs);

	error = vmx_set_tsc_offset(vmx, vcpu, offset);

	if (!running)
		VMCLEAR(vmcs);
	return (error);
}
#endif

const struct vmm_ops vmm_ops_intel = {
	.modinit	= vmx_modinit,
	.modcleanup	= vmx_modcleanup,
	.modresume	= vmx_modresume,
	.init		= vmx_init,
	.run		= vmx_run,
	.cleanup	= vmx_cleanup,
	.getreg		= vmx_getreg,
	.setreg		= vmx_setreg,
	.getdesc	= vmx_getdesc,
	.setdesc	= vmx_setdesc,
	.getcap		= vmx_getcap,
	.setcap		= vmx_setcap,
	.vmspace_alloc	= vmx_vmspace_alloc,
	.vmspace_free	= vmx_vmspace_free,
	.vlapic_init	= vmx_vlapic_init,
	.vlapic_cleanup	= vmx_vlapic_cleanup,
#ifdef BHYVE_SNAPSHOT
	.snapshot	= vmx_snapshot,
	.vmcx_snapshot	= vmx_vmcx_snapshot,
	.restore_tsc	= vmx_restore_tsc,
#endif
};