1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * Copyright (c) 2018 Joyent, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_bhyve_snapshot.h" 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/smp.h> 36 #include <sys/kernel.h> 37 #include <sys/malloc.h> 38 #include <sys/pcpu.h> 39 #include <sys/proc.h> 40 #include <sys/reg.h> 41 #include <sys/smr.h> 42 #include <sys/sysctl.h> 43 44 #include <vm/vm.h> 45 #include <vm/vm_extern.h> 46 #include <vm/pmap.h> 47 48 #include <machine/psl.h> 49 #include <machine/cpufunc.h> 50 #include <machine/md_var.h> 51 #include <machine/segments.h> 52 #include <machine/smp.h> 53 #include <machine/specialreg.h> 54 #include <machine/vmparam.h> 55 56 #include <machine/vmm.h> 57 #include <machine/vmm_dev.h> 58 #include <machine/vmm_instruction_emul.h> 59 #include <machine/vmm_snapshot.h> 60 61 #include "vmm_lapic.h" 62 #include "vmm_host.h" 63 #include "vmm_ioport.h" 64 #include "vmm_ktr.h" 65 #include "vmm_stat.h" 66 #include "vatpic.h" 67 #include "vlapic.h" 68 #include "vlapic_priv.h" 69 70 #include "ept.h" 71 #include "vmx_cpufunc.h" 72 #include "vmx.h" 73 #include "vmx_msr.h" 74 #include "x86.h" 75 #include "vmx_controls.h" 76 77 #define PINBASED_CTLS_ONE_SETTING \ 78 (PINBASED_EXTINT_EXITING | \ 79 PINBASED_NMI_EXITING | \ 80 PINBASED_VIRTUAL_NMI) 81 #define PINBASED_CTLS_ZERO_SETTING 0 82 83 #define PROCBASED_CTLS_WINDOW_SETTING \ 84 (PROCBASED_INT_WINDOW_EXITING | \ 85 PROCBASED_NMI_WINDOW_EXITING) 86 87 #define PROCBASED_CTLS_ONE_SETTING \ 88 (PROCBASED_SECONDARY_CONTROLS | \ 89 PROCBASED_MWAIT_EXITING | \ 90 PROCBASED_MONITOR_EXITING | \ 91 PROCBASED_IO_EXITING | \ 92 PROCBASED_MSR_BITMAPS | \ 93 PROCBASED_CTLS_WINDOW_SETTING | \ 94 PROCBASED_CR8_LOAD_EXITING | \ 95 PROCBASED_CR8_STORE_EXITING) 96 #define PROCBASED_CTLS_ZERO_SETTING \ 97 (PROCBASED_CR3_LOAD_EXITING | \ 98 PROCBASED_CR3_STORE_EXITING | \ 99 PROCBASED_IO_BITMAPS) 100 101 #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 102 #define PROCBASED_CTLS2_ZERO_SETTING 0 103 104 #define VM_EXIT_CTLS_ONE_SETTING \ 105 (VM_EXIT_SAVE_DEBUG_CONTROLS | \ 106 VM_EXIT_HOST_LMA | \ 107 VM_EXIT_SAVE_EFER | \ 108 VM_EXIT_LOAD_EFER | \ 109 VM_EXIT_ACKNOWLEDGE_INTERRUPT) 110 111 #define VM_EXIT_CTLS_ZERO_SETTING 0 112 113 #define VM_ENTRY_CTLS_ONE_SETTING \ 114 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 115 VM_ENTRY_LOAD_EFER) 116 117 #define VM_ENTRY_CTLS_ZERO_SETTING \ 118 (VM_ENTRY_INTO_SMM | \ 119 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 120 121 #define HANDLED 1 122 #define UNHANDLED 0 123 124 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 125 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); 126 127 bool vmx_have_msr_tsc_aux; 128 129 SYSCTL_DECL(_hw_vmm); 130 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 131 NULL); 132 133 int vmxon_enabled[MAXCPU]; 134 static uint8_t *vmxon_region; 135 136 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 137 static uint32_t exit_ctls, entry_ctls; 138 139 static uint64_t cr0_ones_mask, cr0_zeros_mask; 140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 141 &cr0_ones_mask, 0, NULL); 142 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 143 &cr0_zeros_mask, 0, NULL); 144 145 static uint64_t cr4_ones_mask, cr4_zeros_mask; 146 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 147 &cr4_ones_mask, 0, NULL); 148 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 149 &cr4_zeros_mask, 0, NULL); 150 151 static int vmx_initialized; 152 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 153 &vmx_initialized, 0, "Intel VMX initialized"); 154 155 /* 156 * Optional capabilities 157 */ 158 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, 159 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 160 NULL); 161 162 static int cap_halt_exit; 163 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, 164 "HLT triggers a VM-exit"); 165 166 static int cap_pause_exit; 167 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 168 0, "PAUSE triggers a VM-exit"); 169 170 static int cap_wbinvd_exit; 171 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, wbinvd_exit, CTLFLAG_RD, &cap_wbinvd_exit, 172 0, "WBINVD triggers a VM-exit"); 173 174 static int cap_rdpid; 175 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0, 176 "Guests are allowed to use RDPID"); 177 178 static int cap_rdtscp; 179 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0, 180 "Guests are allowed to use RDTSCP"); 181 182 static int cap_unrestricted_guest; 183 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, 184 &cap_unrestricted_guest, 0, "Unrestricted guests"); 185 186 static int cap_monitor_trap; 187 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, 188 &cap_monitor_trap, 0, "Monitor trap flag"); 189 190 static int cap_invpcid; 191 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 192 0, "Guests are allowed to use INVPCID"); 193 194 static int tpr_shadowing; 195 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, 196 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 197 &tpr_shadowing, 0, "TPR shadowing support"); 198 199 static int virtual_interrupt_delivery; 200 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, 201 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 202 &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); 203 204 static int posted_interrupts; 205 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, 206 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 207 &posted_interrupts, 0, "APICv posted interrupt support"); 208 209 static int pirvec = -1; 210 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, 211 &pirvec, 0, "APICv posted interrupt vector"); 212 213 static struct unrhdr *vpid_unr; 214 static u_int vpid_alloc_failed; 215 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 216 &vpid_alloc_failed, 0, NULL); 217 218 int guest_l1d_flush; 219 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 220 &guest_l1d_flush, 0, NULL); 221 int guest_l1d_flush_sw; 222 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 223 &guest_l1d_flush_sw, 0, NULL); 224 225 static struct msr_entry msr_load_list[1] __aligned(16); 226 227 /* 228 * The definitions of SDT probes for VMX. 229 */ 230 231 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, 232 "struct vmx *", "int", "struct vm_exit *"); 233 234 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, 235 "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); 236 237 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, 238 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 239 240 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, 241 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 242 243 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, 244 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); 245 246 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, 247 "struct vmx *", "int", "struct vm_exit *"); 248 249 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, 250 "struct vmx *", "int", "struct vm_exit *"); 251 252 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, 253 "struct vmx *", "int", "struct vm_exit *"); 254 255 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, 256 "struct vmx *", "int", "struct vm_exit *"); 257 258 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, 259 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 260 261 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, 262 "struct vmx *", "int", "struct vm_exit *"); 263 264 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, 265 "struct vmx *", "int", "struct vm_exit *"); 266 267 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, 268 "struct vmx *", "int", "struct vm_exit *"); 269 270 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, 271 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); 272 273 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, 274 "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); 275 276 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, 277 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 278 279 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, 280 "struct vmx *", "int", "struct vm_exit *"); 281 282 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, 283 "struct vmx *", "int", "struct vm_exit *"); 284 285 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, 286 "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); 287 288 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, 289 "struct vmx *", "int", "struct vm_exit *"); 290 291 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, 292 "struct vmx *", "int", "struct vm_exit *"); 293 294 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, 295 "struct vmx *", "int", "struct vm_exit *"); 296 297 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, 298 "struct vmx *", "int", "struct vm_exit *"); 299 300 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, 301 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 302 303 SDT_PROBE_DEFINE4(vmm, vmx, exit, return, 304 "struct vmx *", "int", "struct vm_exit *", "int"); 305 306 /* 307 * Use the last page below 4GB as the APIC access address. This address is 308 * occupied by the boot firmware so it is guaranteed that it will not conflict 309 * with a page in system memory. 310 */ 311 #define APIC_ACCESS_ADDRESS 0xFFFFF000 312 313 static int vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc); 314 static int vmx_getreg(void *vcpui, int reg, uint64_t *retval); 315 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); 316 static void vmx_inject_pir(struct vlapic *vlapic); 317 #ifdef BHYVE_SNAPSHOT 318 static int vmx_restore_tsc(void *vcpui, uint64_t now); 319 #endif 320 321 static inline bool 322 host_has_rdpid(void) 323 { 324 return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0); 325 } 326 327 static inline bool 328 host_has_rdtscp(void) 329 { 330 return ((amd_feature & AMDID_RDTSCP) != 0); 331 } 332 333 #ifdef KTR 334 static const char * 335 exit_reason_to_str(int reason) 336 { 337 static char reasonbuf[32]; 338 339 switch (reason) { 340 case EXIT_REASON_EXCEPTION: 341 return "exception"; 342 case EXIT_REASON_EXT_INTR: 343 return "extint"; 344 case EXIT_REASON_TRIPLE_FAULT: 345 return "triplefault"; 346 case EXIT_REASON_INIT: 347 return "init"; 348 case EXIT_REASON_SIPI: 349 return "sipi"; 350 case EXIT_REASON_IO_SMI: 351 return "iosmi"; 352 case EXIT_REASON_SMI: 353 return "smi"; 354 case EXIT_REASON_INTR_WINDOW: 355 return "intrwindow"; 356 case EXIT_REASON_NMI_WINDOW: 357 return "nmiwindow"; 358 case EXIT_REASON_TASK_SWITCH: 359 return "taskswitch"; 360 case EXIT_REASON_CPUID: 361 return "cpuid"; 362 case EXIT_REASON_GETSEC: 363 return "getsec"; 364 case EXIT_REASON_HLT: 365 return "hlt"; 366 case EXIT_REASON_INVD: 367 return "invd"; 368 case EXIT_REASON_INVLPG: 369 return "invlpg"; 370 case EXIT_REASON_RDPMC: 371 return "rdpmc"; 372 case EXIT_REASON_RDTSC: 373 return "rdtsc"; 374 case EXIT_REASON_RSM: 375 return "rsm"; 376 case EXIT_REASON_VMCALL: 377 return "vmcall"; 378 case EXIT_REASON_VMCLEAR: 379 return "vmclear"; 380 case EXIT_REASON_VMLAUNCH: 381 return "vmlaunch"; 382 case EXIT_REASON_VMPTRLD: 383 return "vmptrld"; 384 case EXIT_REASON_VMPTRST: 385 return "vmptrst"; 386 case EXIT_REASON_VMREAD: 387 return "vmread"; 388 case EXIT_REASON_VMRESUME: 389 return "vmresume"; 390 case EXIT_REASON_VMWRITE: 391 return "vmwrite"; 392 case EXIT_REASON_VMXOFF: 393 return "vmxoff"; 394 case EXIT_REASON_VMXON: 395 return "vmxon"; 396 case EXIT_REASON_CR_ACCESS: 397 return "craccess"; 398 case EXIT_REASON_DR_ACCESS: 399 return "draccess"; 400 case EXIT_REASON_INOUT: 401 return "inout"; 402 case EXIT_REASON_RDMSR: 403 return "rdmsr"; 404 case EXIT_REASON_WRMSR: 405 return "wrmsr"; 406 case EXIT_REASON_INVAL_VMCS: 407 return "invalvmcs"; 408 case EXIT_REASON_INVAL_MSR: 409 return "invalmsr"; 410 case EXIT_REASON_MWAIT: 411 return "mwait"; 412 case EXIT_REASON_MTF: 413 return "mtf"; 414 case EXIT_REASON_MONITOR: 415 return "monitor"; 416 case EXIT_REASON_PAUSE: 417 return "pause"; 418 case EXIT_REASON_MCE_DURING_ENTRY: 419 return "mce-during-entry"; 420 case EXIT_REASON_TPR: 421 return "tpr"; 422 case EXIT_REASON_APIC_ACCESS: 423 return "apic-access"; 424 case EXIT_REASON_GDTR_IDTR: 425 return "gdtridtr"; 426 case EXIT_REASON_LDTR_TR: 427 return "ldtrtr"; 428 case EXIT_REASON_EPT_FAULT: 429 return "eptfault"; 430 case EXIT_REASON_EPT_MISCONFIG: 431 return "eptmisconfig"; 432 case EXIT_REASON_INVEPT: 433 return "invept"; 434 case EXIT_REASON_RDTSCP: 435 return "rdtscp"; 436 case EXIT_REASON_VMX_PREEMPT: 437 return "vmxpreempt"; 438 case EXIT_REASON_INVVPID: 439 return "invvpid"; 440 case EXIT_REASON_WBINVD: 441 return "wbinvd"; 442 case EXIT_REASON_XSETBV: 443 return "xsetbv"; 444 case EXIT_REASON_APIC_WRITE: 445 return "apic-write"; 446 default: 447 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 448 return (reasonbuf); 449 } 450 } 451 #endif /* KTR */ 452 453 static int 454 vmx_allow_x2apic_msrs(struct vmx *vmx) 455 { 456 int i, error; 457 458 error = 0; 459 460 /* 461 * Allow readonly access to the following x2APIC MSRs from the guest. 462 */ 463 error += guest_msr_ro(vmx, MSR_APIC_ID); 464 error += guest_msr_ro(vmx, MSR_APIC_VERSION); 465 error += guest_msr_ro(vmx, MSR_APIC_LDR); 466 error += guest_msr_ro(vmx, MSR_APIC_SVR); 467 468 for (i = 0; i < 8; i++) 469 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); 470 471 for (i = 0; i < 8; i++) 472 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); 473 474 for (i = 0; i < 8; i++) 475 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); 476 477 error += guest_msr_ro(vmx, MSR_APIC_ESR); 478 error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); 479 error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); 480 error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); 481 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); 482 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); 483 error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); 484 error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); 485 error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); 486 error += guest_msr_ro(vmx, MSR_APIC_ICR); 487 488 /* 489 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. 490 * 491 * These registers get special treatment described in the section 492 * "Virtualizing MSR-Based APIC Accesses". 493 */ 494 error += guest_msr_rw(vmx, MSR_APIC_TPR); 495 error += guest_msr_rw(vmx, MSR_APIC_EOI); 496 error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); 497 498 return (error); 499 } 500 501 u_long 502 vmx_fix_cr0(u_long cr0) 503 { 504 505 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 506 } 507 508 u_long 509 vmx_fix_cr4(u_long cr4) 510 { 511 512 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 513 } 514 515 static void 516 vpid_free(int vpid) 517 { 518 if (vpid < 0 || vpid > 0xffff) 519 panic("vpid_free: invalid vpid %d", vpid); 520 521 /* 522 * VPIDs [0,vm_maxcpu] are special and are not allocated from 523 * the unit number allocator. 524 */ 525 526 if (vpid > vm_maxcpu) 527 free_unr(vpid_unr, vpid); 528 } 529 530 static uint16_t 531 vpid_alloc(int vcpuid) 532 { 533 int x; 534 535 /* 536 * If the "enable vpid" execution control is not enabled then the 537 * VPID is required to be 0 for all vcpus. 538 */ 539 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) 540 return (0); 541 542 /* 543 * Try to allocate a unique VPID for each from the unit number 544 * allocator. 545 */ 546 x = alloc_unr(vpid_unr); 547 548 if (x == -1) { 549 atomic_add_int(&vpid_alloc_failed, 1); 550 551 /* 552 * If the unit number allocator does not have enough unique 553 * VPIDs then we need to allocate from the [1,vm_maxcpu] range. 554 * 555 * These VPIDs are not be unique across VMs but this does not 556 * affect correctness because the combined mappings are also 557 * tagged with the EP4TA which is unique for each VM. 558 * 559 * It is still sub-optimal because the invvpid will invalidate 560 * combined mappings for a particular VPID across all EP4TAs. 561 */ 562 return (vcpuid + 1); 563 } 564 565 return (x); 566 } 567 568 static void 569 vpid_init(void) 570 { 571 /* 572 * VPID 0 is required when the "enable VPID" execution control is 573 * disabled. 574 * 575 * VPIDs [1,vm_maxcpu] are used as the "overflow namespace" when the 576 * unit number allocator does not have sufficient unique VPIDs to 577 * satisfy the allocation. 578 * 579 * The remaining VPIDs are managed by the unit number allocator. 580 */ 581 vpid_unr = new_unrhdr(vm_maxcpu + 1, 0xffff, NULL); 582 } 583 584 static void 585 vmx_disable(void *arg __unused) 586 { 587 struct invvpid_desc invvpid_desc = { 0 }; 588 struct invept_desc invept_desc = { 0 }; 589 590 if (vmxon_enabled[curcpu]) { 591 /* 592 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 593 * 594 * VMXON or VMXOFF are not required to invalidate any TLB 595 * caching structures. This prevents potential retention of 596 * cached information in the TLB between distinct VMX episodes. 597 */ 598 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 599 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 600 vmxoff(); 601 } 602 load_cr4(rcr4() & ~CR4_VMXE); 603 } 604 605 static int 606 vmx_modcleanup(void) 607 { 608 609 if (pirvec >= 0) 610 lapic_ipi_free(pirvec); 611 612 if (vpid_unr != NULL) { 613 delete_unrhdr(vpid_unr); 614 vpid_unr = NULL; 615 } 616 617 if (nmi_flush_l1d_sw == 1) 618 nmi_flush_l1d_sw = 0; 619 620 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 621 622 if (vmxon_region != NULL) 623 kmem_free(vmxon_region, (mp_maxid + 1) * PAGE_SIZE); 624 625 return (0); 626 } 627 628 static void 629 vmx_enable(void *arg __unused) 630 { 631 int error; 632 uint64_t feature_control; 633 634 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 635 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 636 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 637 wrmsr(MSR_IA32_FEATURE_CONTROL, 638 feature_control | IA32_FEATURE_CONTROL_VMX_EN | 639 IA32_FEATURE_CONTROL_LOCK); 640 } 641 642 load_cr4(rcr4() | CR4_VMXE); 643 644 *(uint32_t *)&vmxon_region[curcpu * PAGE_SIZE] = vmx_revision(); 645 error = vmxon(&vmxon_region[curcpu * PAGE_SIZE]); 646 if (error == 0) 647 vmxon_enabled[curcpu] = 1; 648 } 649 650 static void 651 vmx_modresume(void) 652 { 653 654 if (vmxon_enabled[curcpu]) 655 vmxon(&vmxon_region[curcpu * PAGE_SIZE]); 656 } 657 658 static int 659 vmx_modinit(int ipinum) 660 { 661 int error; 662 uint64_t basic, fixed0, fixed1, feature_control; 663 uint32_t tmp, procbased2_vid_bits; 664 665 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 666 if (!(cpu_feature2 & CPUID2_VMX)) { 667 printf("vmx_modinit: processor does not support VMX " 668 "operation\n"); 669 return (ENXIO); 670 } 671 672 /* 673 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 674 * are set (bits 0 and 2 respectively). 675 */ 676 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 677 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && 678 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 679 printf("vmx_modinit: VMX operation disabled by BIOS\n"); 680 return (ENXIO); 681 } 682 683 /* 684 * Verify capabilities MSR_VMX_BASIC: 685 * - bit 54 indicates support for INS/OUTS decoding 686 */ 687 basic = rdmsr(MSR_VMX_BASIC); 688 if ((basic & (1UL << 54)) == 0) { 689 printf("vmx_modinit: processor does not support desired basic " 690 "capabilities\n"); 691 return (EINVAL); 692 } 693 694 /* Check support for primary processor-based VM-execution controls */ 695 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 696 MSR_VMX_TRUE_PROCBASED_CTLS, 697 PROCBASED_CTLS_ONE_SETTING, 698 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 699 if (error) { 700 printf("vmx_modinit: processor does not support desired " 701 "primary processor-based controls\n"); 702 return (error); 703 } 704 705 /* Clear the processor-based ctl bits that are set on demand */ 706 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 707 708 /* Check support for secondary processor-based VM-execution controls */ 709 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 710 MSR_VMX_PROCBASED_CTLS2, 711 PROCBASED_CTLS2_ONE_SETTING, 712 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 713 if (error) { 714 printf("vmx_modinit: processor does not support desired " 715 "secondary processor-based controls\n"); 716 return (error); 717 } 718 719 /* Check support for VPID */ 720 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 721 PROCBASED2_ENABLE_VPID, 0, &tmp); 722 if (error == 0) 723 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 724 725 /* Check support for pin-based VM-execution controls */ 726 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 727 MSR_VMX_TRUE_PINBASED_CTLS, 728 PINBASED_CTLS_ONE_SETTING, 729 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 730 if (error) { 731 printf("vmx_modinit: processor does not support desired " 732 "pin-based controls\n"); 733 return (error); 734 } 735 736 /* Check support for VM-exit controls */ 737 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 738 VM_EXIT_CTLS_ONE_SETTING, 739 VM_EXIT_CTLS_ZERO_SETTING, 740 &exit_ctls); 741 if (error) { 742 printf("vmx_modinit: processor does not support desired " 743 "exit controls\n"); 744 return (error); 745 } 746 747 /* Check support for VM-entry controls */ 748 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, 749 VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, 750 &entry_ctls); 751 if (error) { 752 printf("vmx_modinit: processor does not support desired " 753 "entry controls\n"); 754 return (error); 755 } 756 757 /* 758 * Check support for optional features by testing them 759 * as individual bits 760 */ 761 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 762 MSR_VMX_TRUE_PROCBASED_CTLS, 763 PROCBASED_HLT_EXITING, 0, 764 &tmp) == 0); 765 766 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 767 MSR_VMX_PROCBASED_CTLS, 768 PROCBASED_MTF, 0, 769 &tmp) == 0); 770 771 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 772 MSR_VMX_TRUE_PROCBASED_CTLS, 773 PROCBASED_PAUSE_EXITING, 0, 774 &tmp) == 0); 775 776 cap_wbinvd_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 777 MSR_VMX_PROCBASED_CTLS2, 778 PROCBASED2_WBINVD_EXITING, 779 0, 780 &tmp) == 0); 781 782 /* 783 * Check support for RDPID and/or RDTSCP. 784 * 785 * Support a pass-through-based implementation of these via the 786 * "enable RDTSCP" VM-execution control and the "RDTSC exiting" 787 * VM-execution control. 788 * 789 * The "enable RDTSCP" VM-execution control applies to both RDPID 790 * and RDTSCP (see SDM volume 3, section 25.3, "Changes to 791 * Instruction Behavior in VMX Non-root operation"); this is why 792 * only this VM-execution control needs to be enabled in order to 793 * enable passing through whichever of RDPID and/or RDTSCP are 794 * supported by the host. 795 * 796 * The "RDTSC exiting" VM-execution control applies to both RDTSC 797 * and RDTSCP (again, per SDM volume 3, section 25.3), and is 798 * already set up for RDTSC and RDTSCP pass-through by the current 799 * implementation of RDTSC. 800 * 801 * Although RDPID and RDTSCP are optional capabilities, since there 802 * does not currently seem to be a use case for enabling/disabling 803 * these via libvmmapi, choose not to support this and, instead, 804 * just statically always enable or always disable this support 805 * across all vCPUs on all VMs. (Note that there may be some 806 * complications to providing this functionality, e.g., the MSR 807 * bitmap is currently per-VM rather than per-vCPU while the 808 * capability API wants to be able to control capabilities on a 809 * per-vCPU basis). 810 */ 811 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 812 MSR_VMX_PROCBASED_CTLS2, 813 PROCBASED2_ENABLE_RDTSCP, 0, &tmp); 814 cap_rdpid = error == 0 && host_has_rdpid(); 815 cap_rdtscp = error == 0 && host_has_rdtscp(); 816 if (cap_rdpid || cap_rdtscp) { 817 procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP; 818 vmx_have_msr_tsc_aux = true; 819 } 820 821 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 822 MSR_VMX_PROCBASED_CTLS2, 823 PROCBASED2_UNRESTRICTED_GUEST, 0, 824 &tmp) == 0); 825 826 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 827 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 828 &tmp) == 0); 829 830 /* 831 * Check support for TPR shadow. 832 */ 833 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 834 MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, 835 &tmp); 836 if (error == 0) { 837 tpr_shadowing = 1; 838 #ifndef BURN_BRIDGES 839 TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing", 840 &tpr_shadowing); 841 #endif 842 TUNABLE_INT_FETCH("hw.vmm.vmx.cap.tpr_shadowing", 843 &tpr_shadowing); 844 } 845 846 if (tpr_shadowing) { 847 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 848 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; 849 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; 850 } 851 852 /* 853 * Check support for virtual interrupt delivery. 854 */ 855 procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | 856 PROCBASED2_VIRTUALIZE_X2APIC_MODE | 857 PROCBASED2_APIC_REGISTER_VIRTUALIZATION | 858 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); 859 860 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 861 procbased2_vid_bits, 0, &tmp); 862 if (error == 0 && tpr_shadowing) { 863 virtual_interrupt_delivery = 1; 864 #ifndef BURN_BRIDGES 865 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", 866 &virtual_interrupt_delivery); 867 #endif 868 TUNABLE_INT_FETCH("hw.vmm.vmx.cap.virtual_interrupt_delivery", 869 &virtual_interrupt_delivery); 870 } 871 872 if (virtual_interrupt_delivery) { 873 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 874 procbased_ctls2 |= procbased2_vid_bits; 875 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; 876 877 /* 878 * Check for Posted Interrupts only if Virtual Interrupt 879 * Delivery is enabled. 880 */ 881 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 882 MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, 883 &tmp); 884 if (error == 0) { 885 pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : 886 &IDTVEC(justreturn)); 887 if (pirvec < 0) { 888 if (bootverbose) { 889 printf("vmx_modinit: unable to " 890 "allocate posted interrupt " 891 "vector\n"); 892 } 893 } else { 894 posted_interrupts = 1; 895 #ifndef BURN_BRIDGES 896 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", 897 &posted_interrupts); 898 #endif 899 TUNABLE_INT_FETCH("hw.vmm.vmx.cap.posted_interrupts", 900 &posted_interrupts); 901 } 902 } 903 } 904 905 if (posted_interrupts) 906 pinbased_ctls |= PINBASED_POSTED_INTERRUPT; 907 908 /* Initialize EPT */ 909 error = ept_init(ipinum); 910 if (error) { 911 printf("vmx_modinit: ept initialization failed (%d)\n", error); 912 return (error); 913 } 914 915 guest_l1d_flush = (cpu_ia32_arch_caps & 916 IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; 917 #ifndef BURN_BRIDGES 918 TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); 919 #endif 920 TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush", &guest_l1d_flush); 921 922 /* 923 * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when 924 * available. Otherwise fall back to the software flush 925 * method which loads enough data from the kernel text to 926 * flush existing L1D content, both on VMX entry and on NMI 927 * return. 928 */ 929 if (guest_l1d_flush) { 930 if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { 931 guest_l1d_flush_sw = 1; 932 #ifndef BURN_BRIDGES 933 TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", 934 &guest_l1d_flush_sw); 935 #endif 936 TUNABLE_INT_FETCH("hw.vmm.vmx.l1d_flush_sw", 937 &guest_l1d_flush_sw); 938 } 939 if (guest_l1d_flush_sw) { 940 if (nmi_flush_l1d_sw <= 1) 941 nmi_flush_l1d_sw = 1; 942 } else { 943 msr_load_list[0].index = MSR_IA32_FLUSH_CMD; 944 msr_load_list[0].val = IA32_FLUSH_CMD_L1D; 945 } 946 } 947 948 /* 949 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 950 */ 951 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 952 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 953 cr0_ones_mask = fixed0 & fixed1; 954 cr0_zeros_mask = ~fixed0 & ~fixed1; 955 956 /* 957 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 958 * if unrestricted guest execution is allowed. 959 */ 960 if (cap_unrestricted_guest) 961 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 962 963 /* 964 * Do not allow the guest to set CR0_NW or CR0_CD. 965 */ 966 cr0_zeros_mask |= (CR0_NW | CR0_CD); 967 968 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 969 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 970 cr4_ones_mask = fixed0 & fixed1; 971 cr4_zeros_mask = ~fixed0 & ~fixed1; 972 973 vpid_init(); 974 975 vmx_msr_init(); 976 977 /* enable VMX operation */ 978 vmxon_region = kmem_malloc((mp_maxid + 1) * PAGE_SIZE, 979 M_WAITOK | M_ZERO); 980 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 981 982 vmx_initialized = 1; 983 984 return (0); 985 } 986 987 static void 988 vmx_trigger_hostintr(int vector) 989 { 990 uintptr_t func; 991 struct gate_descriptor *gd; 992 993 gd = &idt[vector]; 994 995 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 996 "invalid vector %d", vector)); 997 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 998 vector)); 999 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 1000 "has invalid type %d", vector, gd->gd_type)); 1001 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 1002 "has invalid dpl %d", vector, gd->gd_dpl)); 1003 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 1004 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 1005 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 1006 "IST %d", vector, gd->gd_ist)); 1007 1008 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 1009 vmx_call_isr(func); 1010 } 1011 1012 static int 1013 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 1014 { 1015 int error, mask_ident, shadow_ident; 1016 uint64_t mask_value; 1017 1018 if (which != 0 && which != 4) 1019 panic("vmx_setup_cr_shadow: unknown cr%d", which); 1020 1021 if (which == 0) { 1022 mask_ident = VMCS_CR0_MASK; 1023 mask_value = cr0_ones_mask | cr0_zeros_mask; 1024 shadow_ident = VMCS_CR0_SHADOW; 1025 } else { 1026 mask_ident = VMCS_CR4_MASK; 1027 mask_value = cr4_ones_mask | cr4_zeros_mask; 1028 shadow_ident = VMCS_CR4_SHADOW; 1029 } 1030 1031 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 1032 if (error) 1033 return (error); 1034 1035 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 1036 if (error) 1037 return (error); 1038 1039 return (0); 1040 } 1041 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 1042 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 1043 1044 static void * 1045 vmx_init(struct vm *vm, pmap_t pmap) 1046 { 1047 int error __diagused; 1048 struct vmx *vmx; 1049 1050 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 1051 vmx->vm = vm; 1052 1053 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop)); 1054 1055 /* 1056 * Clean up EPTP-tagged guest physical and combined mappings 1057 * 1058 * VMX transitions are not required to invalidate any guest physical 1059 * mappings. So, it may be possible for stale guest physical mappings 1060 * to be present in the processor TLBs. 1061 * 1062 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 1063 */ 1064 ept_invalidate_mappings(vmx->eptp); 1065 1066 vmx->msr_bitmap = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX, 1067 M_WAITOK | M_ZERO); 1068 msr_bitmap_initialize(vmx->msr_bitmap); 1069 1070 /* 1071 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 1072 * The guest FSBASE and GSBASE are saved and restored during 1073 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 1074 * always restored from the vmcs host state area on vm-exit. 1075 * 1076 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 1077 * how they are saved/restored so can be directly accessed by the 1078 * guest. 1079 * 1080 * MSR_EFER is saved and restored in the guest VMCS area on a 1081 * VM exit and entry respectively. It is also restored from the 1082 * host VMCS area on a VM exit. 1083 * 1084 * The TSC MSR is exposed read-only. Writes are disallowed as 1085 * that will impact the host TSC. If the guest does a write 1086 * the "use TSC offsetting" execution control is enabled and the 1087 * difference between the host TSC and the guest TSC is written 1088 * into the TSC offset in the VMCS. 1089 * 1090 * Guest TSC_AUX support is enabled if any of guest RDPID and/or 1091 * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM 1092 * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are 1093 * supported). If guest TSC_AUX support is enabled, TSC_AUX is 1094 * exposed read-only so that the VMM can do one fewer MSR read per 1095 * exit than if this register were exposed read-write; the guest 1096 * restore value can be updated during guest writes (expected to be 1097 * rare) instead of during all exits (common). 1098 */ 1099 if (guest_msr_rw(vmx, MSR_GSBASE) || 1100 guest_msr_rw(vmx, MSR_FSBASE) || 1101 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 1102 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 1103 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 1104 guest_msr_rw(vmx, MSR_EFER) || 1105 guest_msr_ro(vmx, MSR_TSC) || 1106 ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX))) 1107 panic("vmx_init: error setting guest msr access"); 1108 1109 if (virtual_interrupt_delivery) { 1110 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 1111 APIC_ACCESS_ADDRESS); 1112 /* XXX this should really return an error to the caller */ 1113 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 1114 } 1115 1116 vmx->pmap = pmap; 1117 return (vmx); 1118 } 1119 1120 static void * 1121 vmx_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid) 1122 { 1123 struct vmx *vmx = vmi; 1124 struct vmcs *vmcs; 1125 struct vmx_vcpu *vcpu; 1126 uint32_t exc_bitmap; 1127 uint16_t vpid; 1128 int error; 1129 1130 vpid = vpid_alloc(vcpuid); 1131 1132 vcpu = malloc(sizeof(*vcpu), M_VMX, M_WAITOK | M_ZERO); 1133 vcpu->vmx = vmx; 1134 vcpu->vcpu = vcpu1; 1135 vcpu->vcpuid = vcpuid; 1136 vcpu->vmcs = malloc_aligned(sizeof(*vmcs), PAGE_SIZE, M_VMX, 1137 M_WAITOK | M_ZERO); 1138 vcpu->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX, 1139 M_WAITOK | M_ZERO); 1140 vcpu->pir_desc = malloc_aligned(sizeof(*vcpu->pir_desc), 64, M_VMX, 1141 M_WAITOK | M_ZERO); 1142 1143 vmcs = vcpu->vmcs; 1144 vmcs->identifier = vmx_revision(); 1145 error = vmclear(vmcs); 1146 if (error != 0) { 1147 panic("vmx_init: vmclear error %d on vcpu %d\n", 1148 error, vcpuid); 1149 } 1150 1151 vmx_msr_guest_init(vmx, vcpu); 1152 1153 error = vmcs_init(vmcs); 1154 KASSERT(error == 0, ("vmcs_init error %d", error)); 1155 1156 VMPTRLD(vmcs); 1157 error = 0; 1158 error += vmwrite(VMCS_HOST_RSP, (u_long)&vcpu->ctx); 1159 error += vmwrite(VMCS_EPTP, vmx->eptp); 1160 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 1161 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 1162 if (vcpu_trap_wbinvd(vcpu->vcpu)) { 1163 KASSERT(cap_wbinvd_exit, ("WBINVD trap not available")); 1164 procbased_ctls2 |= PROCBASED2_WBINVD_EXITING; 1165 } 1166 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 1167 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 1168 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1169 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 1170 error += vmwrite(VMCS_VPID, vpid); 1171 1172 if (guest_l1d_flush && !guest_l1d_flush_sw) { 1173 vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( 1174 (vm_offset_t)&msr_load_list[0])); 1175 vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, 1176 nitems(msr_load_list)); 1177 vmcs_write(VMCS_EXIT_MSR_STORE, 0); 1178 vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); 1179 } 1180 1181 /* exception bitmap */ 1182 if (vcpu_trace_exceptions(vcpu->vcpu)) 1183 exc_bitmap = 0xffffffff; 1184 else 1185 exc_bitmap = 1 << IDT_MC; 1186 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); 1187 1188 vcpu->ctx.guest_dr6 = DBREG_DR6_RESERVED1; 1189 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); 1190 1191 if (tpr_shadowing) { 1192 error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(vcpu->apic_page)); 1193 } 1194 1195 if (virtual_interrupt_delivery) { 1196 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 1197 error += vmwrite(VMCS_EOI_EXIT0, 0); 1198 error += vmwrite(VMCS_EOI_EXIT1, 0); 1199 error += vmwrite(VMCS_EOI_EXIT2, 0); 1200 error += vmwrite(VMCS_EOI_EXIT3, 0); 1201 } 1202 if (posted_interrupts) { 1203 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 1204 error += vmwrite(VMCS_PIR_DESC, vtophys(vcpu->pir_desc)); 1205 } 1206 VMCLEAR(vmcs); 1207 KASSERT(error == 0, ("vmx_init: error customizing the vmcs")); 1208 1209 vcpu->cap.set = 0; 1210 vcpu->cap.set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0; 1211 vcpu->cap.set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0; 1212 vcpu->cap.proc_ctls = procbased_ctls; 1213 vcpu->cap.proc_ctls2 = procbased_ctls2; 1214 vcpu->cap.exc_bitmap = exc_bitmap; 1215 1216 vcpu->state.nextrip = ~0; 1217 vcpu->state.lastcpu = NOCPU; 1218 vcpu->state.vpid = vpid; 1219 1220 /* 1221 * Set up the CR0/4 shadows, and init the read shadow 1222 * to the power-on register value from the Intel Sys Arch. 1223 * CR0 - 0x60000010 1224 * CR4 - 0 1225 */ 1226 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 1227 if (error != 0) 1228 panic("vmx_setup_cr0_shadow %d", error); 1229 1230 error = vmx_setup_cr4_shadow(vmcs, 0); 1231 if (error != 0) 1232 panic("vmx_setup_cr4_shadow %d", error); 1233 1234 vcpu->ctx.pmap = vmx->pmap; 1235 1236 return (vcpu); 1237 } 1238 1239 static int 1240 vmx_handle_cpuid(struct vmx_vcpu *vcpu, struct vmxctx *vmxctx) 1241 { 1242 int handled; 1243 1244 handled = x86_emulate_cpuid(vcpu->vcpu, (uint64_t *)&vmxctx->guest_rax, 1245 (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx, 1246 (uint64_t *)&vmxctx->guest_rdx); 1247 return (handled); 1248 } 1249 1250 static __inline void 1251 vmx_run_trace(struct vmx_vcpu *vcpu) 1252 { 1253 VMX_CTR1(vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1254 } 1255 1256 static __inline void 1257 vmx_exit_trace(struct vmx_vcpu *vcpu, uint64_t rip, uint32_t exit_reason, 1258 int handled) 1259 { 1260 VMX_CTR3(vcpu, "%s %s vmexit at 0x%0lx", 1261 handled ? "handled" : "unhandled", 1262 exit_reason_to_str(exit_reason), rip); 1263 } 1264 1265 static __inline void 1266 vmx_astpending_trace(struct vmx_vcpu *vcpu, uint64_t rip) 1267 { 1268 VMX_CTR1(vcpu, "astpending vmexit at 0x%0lx", rip); 1269 } 1270 1271 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1272 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); 1273 1274 /* 1275 * Invalidate guest mappings identified by its vpid from the TLB. 1276 */ 1277 static __inline void 1278 vmx_invvpid(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap, int running) 1279 { 1280 struct vmxstate *vmxstate; 1281 struct invvpid_desc invvpid_desc; 1282 1283 vmxstate = &vcpu->state; 1284 if (vmxstate->vpid == 0) 1285 return; 1286 1287 if (!running) { 1288 /* 1289 * Set the 'lastcpu' to an invalid host cpu. 1290 * 1291 * This will invalidate TLB entries tagged with the vcpu's 1292 * vpid the next time it runs via vmx_set_pcpu_defaults(). 1293 */ 1294 vmxstate->lastcpu = NOCPU; 1295 return; 1296 } 1297 1298 KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " 1299 "critical section", __func__, vcpu->vcpuid)); 1300 1301 /* 1302 * Invalidate all mappings tagged with 'vpid' 1303 * 1304 * We do this because this vcpu was executing on a different host 1305 * cpu when it last ran. We do not track whether it invalidated 1306 * mappings associated with its 'vpid' during that run. So we must 1307 * assume that the mappings associated with 'vpid' on 'curcpu' are 1308 * stale and invalidate them. 1309 * 1310 * Note that we incur this penalty only when the scheduler chooses to 1311 * move the thread associated with this vcpu between host cpus. 1312 * 1313 * Note also that this will invalidate mappings tagged with 'vpid' 1314 * for "all" EP4TAs. 1315 */ 1316 if (atomic_load_long(&pmap->pm_eptgen) == vmx->eptgen[curcpu]) { 1317 invvpid_desc._res1 = 0; 1318 invvpid_desc._res2 = 0; 1319 invvpid_desc.vpid = vmxstate->vpid; 1320 invvpid_desc.linear_addr = 0; 1321 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1322 vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_DONE, 1); 1323 } else { 1324 /* 1325 * The invvpid can be skipped if an invept is going to 1326 * be performed before entering the guest. The invept 1327 * will invalidate combined mappings tagged with 1328 * 'vmx->eptp' for all vpids. 1329 */ 1330 vmm_stat_incr(vcpu->vcpu, VCPU_INVVPID_SAVED, 1); 1331 } 1332 } 1333 1334 static void 1335 vmx_set_pcpu_defaults(struct vmx *vmx, struct vmx_vcpu *vcpu, pmap_t pmap) 1336 { 1337 struct vmxstate *vmxstate; 1338 1339 vmxstate = &vcpu->state; 1340 if (vmxstate->lastcpu == curcpu) 1341 return; 1342 1343 vmxstate->lastcpu = curcpu; 1344 1345 vmm_stat_incr(vcpu->vcpu, VCPU_MIGRATIONS, 1); 1346 1347 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1348 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1349 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1350 vmx_invvpid(vmx, vcpu, pmap, 1); 1351 } 1352 1353 /* 1354 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1355 */ 1356 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1357 1358 static void __inline 1359 vmx_set_int_window_exiting(struct vmx_vcpu *vcpu) 1360 { 1361 1362 if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1363 vcpu->cap.proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1364 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); 1365 VMX_CTR0(vcpu, "Enabling interrupt window exiting"); 1366 } 1367 } 1368 1369 static void __inline 1370 vmx_clear_int_window_exiting(struct vmx_vcpu *vcpu) 1371 { 1372 1373 KASSERT((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1374 ("intr_window_exiting not set: %#x", vcpu->cap.proc_ctls)); 1375 vcpu->cap.proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1376 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); 1377 VMX_CTR0(vcpu, "Disabling interrupt window exiting"); 1378 } 1379 1380 static void __inline 1381 vmx_set_nmi_window_exiting(struct vmx_vcpu *vcpu) 1382 { 1383 1384 if ((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1385 vcpu->cap.proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1386 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); 1387 VMX_CTR0(vcpu, "Enabling NMI window exiting"); 1388 } 1389 } 1390 1391 static void __inline 1392 vmx_clear_nmi_window_exiting(struct vmx_vcpu *vcpu) 1393 { 1394 1395 KASSERT((vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1396 ("nmi_window_exiting not set %#x", vcpu->cap.proc_ctls)); 1397 vcpu->cap.proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1398 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); 1399 VMX_CTR0(vcpu, "Disabling NMI window exiting"); 1400 } 1401 1402 int 1403 vmx_set_tsc_offset(struct vmx_vcpu *vcpu, uint64_t offset) 1404 { 1405 int error; 1406 1407 if ((vcpu->cap.proc_ctls & PROCBASED_TSC_OFFSET) == 0) { 1408 vcpu->cap.proc_ctls |= PROCBASED_TSC_OFFSET; 1409 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vcpu->cap.proc_ctls); 1410 VMX_CTR0(vcpu, "Enabling TSC offsetting"); 1411 } 1412 1413 error = vmwrite(VMCS_TSC_OFFSET, offset); 1414 #ifdef BHYVE_SNAPSHOT 1415 if (error == 0) 1416 vm_set_tsc_offset(vcpu->vcpu, offset); 1417 #endif 1418 return (error); 1419 } 1420 1421 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ 1422 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1423 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1424 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1425 1426 static void 1427 vmx_inject_nmi(struct vmx_vcpu *vcpu) 1428 { 1429 uint32_t gi __diagused, info; 1430 1431 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1432 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1433 "interruptibility-state %#x", gi)); 1434 1435 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1436 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " 1437 "VM-entry interruption information %#x", info)); 1438 1439 /* 1440 * Inject the virtual NMI. The vector must be the NMI IDT entry 1441 * or the VMCS entry check will fail. 1442 */ 1443 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1444 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1445 1446 VMX_CTR0(vcpu, "Injecting vNMI"); 1447 1448 /* Clear the request */ 1449 vm_nmi_clear(vcpu->vcpu); 1450 } 1451 1452 static void 1453 vmx_inject_interrupts(struct vmx_vcpu *vcpu, struct vlapic *vlapic, 1454 uint64_t guestrip) 1455 { 1456 int vector, need_nmi_exiting, extint_pending; 1457 uint64_t rflags, entryinfo; 1458 uint32_t gi, info; 1459 1460 if (vcpu->cap.set & (1 << VM_CAP_MASK_HWINTR)) { 1461 return; 1462 } 1463 1464 if (vcpu->state.nextrip != guestrip) { 1465 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1466 if (gi & HWINTR_BLOCKING) { 1467 VMX_CTR2(vcpu, "Guest interrupt blocking " 1468 "cleared due to rip change: %#lx/%#lx", 1469 vcpu->state.nextrip, guestrip); 1470 gi &= ~HWINTR_BLOCKING; 1471 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1472 } 1473 } 1474 1475 if (vm_entry_intinfo(vcpu->vcpu, &entryinfo)) { 1476 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " 1477 "intinfo is not valid: %#lx", __func__, entryinfo)); 1478 1479 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1480 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1481 "pending exception: %#lx/%#x", __func__, entryinfo, info)); 1482 1483 info = entryinfo; 1484 vector = info & 0xff; 1485 if (vector == IDT_BP || vector == IDT_OF) { 1486 /* 1487 * VT-x requires #BP and #OF to be injected as software 1488 * exceptions. 1489 */ 1490 info &= ~VMCS_INTR_T_MASK; 1491 info |= VMCS_INTR_T_SWEXCEPTION; 1492 } 1493 1494 if (info & VMCS_INTR_DEL_ERRCODE) 1495 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); 1496 1497 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1498 } 1499 1500 if (vm_nmi_pending(vcpu->vcpu)) { 1501 /* 1502 * If there are no conditions blocking NMI injection then 1503 * inject it directly here otherwise enable "NMI window 1504 * exiting" to inject it as soon as we can. 1505 * 1506 * We also check for STI_BLOCKING because some implementations 1507 * don't allow NMI injection in this case. If we are running 1508 * on a processor that doesn't have this restriction it will 1509 * immediately exit and the NMI will be injected in the 1510 * "NMI window exiting" handler. 1511 */ 1512 need_nmi_exiting = 1; 1513 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1514 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1515 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1516 if ((info & VMCS_INTR_VALID) == 0) { 1517 vmx_inject_nmi(vcpu); 1518 need_nmi_exiting = 0; 1519 } else { 1520 VMX_CTR1(vcpu, "Cannot inject NMI " 1521 "due to VM-entry intr info %#x", info); 1522 } 1523 } else { 1524 VMX_CTR1(vcpu, "Cannot inject NMI due to " 1525 "Guest Interruptibility-state %#x", gi); 1526 } 1527 1528 if (need_nmi_exiting) 1529 vmx_set_nmi_window_exiting(vcpu); 1530 } 1531 1532 extint_pending = vm_extint_pending(vcpu->vcpu); 1533 1534 if (!extint_pending && virtual_interrupt_delivery) { 1535 vmx_inject_pir(vlapic); 1536 return; 1537 } 1538 1539 /* 1540 * If interrupt-window exiting is already in effect then don't bother 1541 * checking for pending interrupts. This is just an optimization and 1542 * not needed for correctness. 1543 */ 1544 if ((vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1545 VMX_CTR0(vcpu, "Skip interrupt injection due to " 1546 "pending int_window_exiting"); 1547 return; 1548 } 1549 1550 if (!extint_pending) { 1551 /* Ask the local apic for a vector to inject */ 1552 if (!vlapic_pending_intr(vlapic, &vector)) 1553 return; 1554 1555 /* 1556 * From the Intel SDM, Volume 3, Section "Maskable 1557 * Hardware Interrupts": 1558 * - maskable interrupt vectors [16,255] can be delivered 1559 * through the local APIC. 1560 */ 1561 KASSERT(vector >= 16 && vector <= 255, 1562 ("invalid vector %d from local APIC", vector)); 1563 } else { 1564 /* Ask the legacy pic for a vector to inject */ 1565 vatpic_pending_intr(vcpu->vmx->vm, &vector); 1566 1567 /* 1568 * From the Intel SDM, Volume 3, Section "Maskable 1569 * Hardware Interrupts": 1570 * - maskable interrupt vectors [0,255] can be delivered 1571 * through the INTR pin. 1572 */ 1573 KASSERT(vector >= 0 && vector <= 255, 1574 ("invalid vector %d from INTR", vector)); 1575 } 1576 1577 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1578 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1579 if ((rflags & PSL_I) == 0) { 1580 VMX_CTR2(vcpu, "Cannot inject vector %d due to " 1581 "rflags %#lx", vector, rflags); 1582 goto cantinject; 1583 } 1584 1585 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1586 if (gi & HWINTR_BLOCKING) { 1587 VMX_CTR2(vcpu, "Cannot inject vector %d due to " 1588 "Guest Interruptibility-state %#x", vector, gi); 1589 goto cantinject; 1590 } 1591 1592 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1593 if (info & VMCS_INTR_VALID) { 1594 /* 1595 * This is expected and could happen for multiple reasons: 1596 * - A vectoring VM-entry was aborted due to astpending 1597 * - A VM-exit happened during event injection. 1598 * - An exception was injected above. 1599 * - An NMI was injected above or after "NMI window exiting" 1600 */ 1601 VMX_CTR2(vcpu, "Cannot inject vector %d due to " 1602 "VM-entry intr info %#x", vector, info); 1603 goto cantinject; 1604 } 1605 1606 /* Inject the interrupt */ 1607 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1608 info |= vector; 1609 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1610 1611 if (!extint_pending) { 1612 /* Update the Local APIC ISR */ 1613 vlapic_intr_accepted(vlapic, vector); 1614 } else { 1615 vm_extint_clear(vcpu->vcpu); 1616 vatpic_intr_accepted(vcpu->vmx->vm, vector); 1617 1618 /* 1619 * After we accepted the current ExtINT the PIC may 1620 * have posted another one. If that is the case, set 1621 * the Interrupt Window Exiting execution control so 1622 * we can inject that one too. 1623 * 1624 * Also, interrupt window exiting allows us to inject any 1625 * pending APIC vector that was preempted by the ExtINT 1626 * as soon as possible. This applies both for the software 1627 * emulated vlapic and the hardware assisted virtual APIC. 1628 */ 1629 vmx_set_int_window_exiting(vcpu); 1630 } 1631 1632 VMX_CTR1(vcpu, "Injecting hwintr at vector %d", vector); 1633 1634 return; 1635 1636 cantinject: 1637 /* 1638 * Set the Interrupt Window Exiting execution control so we can inject 1639 * the interrupt as soon as blocking condition goes away. 1640 */ 1641 vmx_set_int_window_exiting(vcpu); 1642 } 1643 1644 /* 1645 * If the Virtual NMIs execution control is '1' then the logical processor 1646 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1647 * the VMCS. An IRET instruction in VMX non-root operation will remove any 1648 * virtual-NMI blocking. 1649 * 1650 * This unblocking occurs even if the IRET causes a fault. In this case the 1651 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 1652 */ 1653 static void 1654 vmx_restore_nmi_blocking(struct vmx_vcpu *vcpu) 1655 { 1656 uint32_t gi; 1657 1658 VMX_CTR0(vcpu, "Restore Virtual-NMI blocking"); 1659 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1660 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1661 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1662 } 1663 1664 static void 1665 vmx_clear_nmi_blocking(struct vmx_vcpu *vcpu) 1666 { 1667 uint32_t gi; 1668 1669 VMX_CTR0(vcpu, "Clear Virtual-NMI blocking"); 1670 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1671 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1672 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1673 } 1674 1675 static void 1676 vmx_assert_nmi_blocking(struct vmx_vcpu *vcpu) 1677 { 1678 uint32_t gi __diagused; 1679 1680 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1681 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, 1682 ("NMI blocking is not in effect %#x", gi)); 1683 } 1684 1685 static int 1686 vmx_emulate_xsetbv(struct vmx *vmx, struct vmx_vcpu *vcpu, 1687 struct vm_exit *vmexit) 1688 { 1689 struct vmxctx *vmxctx; 1690 uint64_t xcrval; 1691 const struct xsave_limits *limits; 1692 1693 vmxctx = &vcpu->ctx; 1694 limits = vmm_get_xsave_limits(); 1695 1696 /* 1697 * Note that the processor raises a GP# fault on its own if 1698 * xsetbv is executed for CPL != 0, so we do not have to 1699 * emulate that fault here. 1700 */ 1701 1702 /* Only xcr0 is supported. */ 1703 if (vmxctx->guest_rcx != 0) { 1704 vm_inject_gp(vcpu->vcpu); 1705 return (HANDLED); 1706 } 1707 1708 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1709 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1710 vm_inject_ud(vcpu->vcpu); 1711 return (HANDLED); 1712 } 1713 1714 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1715 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1716 vm_inject_gp(vcpu->vcpu); 1717 return (HANDLED); 1718 } 1719 1720 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1721 vm_inject_gp(vcpu->vcpu); 1722 return (HANDLED); 1723 } 1724 1725 /* AVX (YMM_Hi128) requires SSE. */ 1726 if (xcrval & XFEATURE_ENABLED_AVX && 1727 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1728 vm_inject_gp(vcpu->vcpu); 1729 return (HANDLED); 1730 } 1731 1732 /* 1733 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1734 * ZMM_Hi256, and Hi16_ZMM. 1735 */ 1736 if (xcrval & XFEATURE_AVX512 && 1737 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1738 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1739 vm_inject_gp(vcpu->vcpu); 1740 return (HANDLED); 1741 } 1742 1743 /* 1744 * Intel MPX requires both bound register state flags to be 1745 * set. 1746 */ 1747 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1748 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1749 vm_inject_gp(vcpu->vcpu); 1750 return (HANDLED); 1751 } 1752 1753 /* 1754 * This runs "inside" vmrun() with the guest's FPU state, so 1755 * modifying xcr0 directly modifies the guest's xcr0, not the 1756 * host's. 1757 */ 1758 load_xcr(0, xcrval); 1759 return (HANDLED); 1760 } 1761 1762 static uint64_t 1763 vmx_get_guest_reg(struct vmx_vcpu *vcpu, int ident) 1764 { 1765 const struct vmxctx *vmxctx; 1766 1767 vmxctx = &vcpu->ctx; 1768 1769 switch (ident) { 1770 case 0: 1771 return (vmxctx->guest_rax); 1772 case 1: 1773 return (vmxctx->guest_rcx); 1774 case 2: 1775 return (vmxctx->guest_rdx); 1776 case 3: 1777 return (vmxctx->guest_rbx); 1778 case 4: 1779 return (vmcs_read(VMCS_GUEST_RSP)); 1780 case 5: 1781 return (vmxctx->guest_rbp); 1782 case 6: 1783 return (vmxctx->guest_rsi); 1784 case 7: 1785 return (vmxctx->guest_rdi); 1786 case 8: 1787 return (vmxctx->guest_r8); 1788 case 9: 1789 return (vmxctx->guest_r9); 1790 case 10: 1791 return (vmxctx->guest_r10); 1792 case 11: 1793 return (vmxctx->guest_r11); 1794 case 12: 1795 return (vmxctx->guest_r12); 1796 case 13: 1797 return (vmxctx->guest_r13); 1798 case 14: 1799 return (vmxctx->guest_r14); 1800 case 15: 1801 return (vmxctx->guest_r15); 1802 default: 1803 panic("invalid vmx register %d", ident); 1804 } 1805 } 1806 1807 static void 1808 vmx_set_guest_reg(struct vmx_vcpu *vcpu, int ident, uint64_t regval) 1809 { 1810 struct vmxctx *vmxctx; 1811 1812 vmxctx = &vcpu->ctx; 1813 1814 switch (ident) { 1815 case 0: 1816 vmxctx->guest_rax = regval; 1817 break; 1818 case 1: 1819 vmxctx->guest_rcx = regval; 1820 break; 1821 case 2: 1822 vmxctx->guest_rdx = regval; 1823 break; 1824 case 3: 1825 vmxctx->guest_rbx = regval; 1826 break; 1827 case 4: 1828 vmcs_write(VMCS_GUEST_RSP, regval); 1829 break; 1830 case 5: 1831 vmxctx->guest_rbp = regval; 1832 break; 1833 case 6: 1834 vmxctx->guest_rsi = regval; 1835 break; 1836 case 7: 1837 vmxctx->guest_rdi = regval; 1838 break; 1839 case 8: 1840 vmxctx->guest_r8 = regval; 1841 break; 1842 case 9: 1843 vmxctx->guest_r9 = regval; 1844 break; 1845 case 10: 1846 vmxctx->guest_r10 = regval; 1847 break; 1848 case 11: 1849 vmxctx->guest_r11 = regval; 1850 break; 1851 case 12: 1852 vmxctx->guest_r12 = regval; 1853 break; 1854 case 13: 1855 vmxctx->guest_r13 = regval; 1856 break; 1857 case 14: 1858 vmxctx->guest_r14 = regval; 1859 break; 1860 case 15: 1861 vmxctx->guest_r15 = regval; 1862 break; 1863 default: 1864 panic("invalid vmx register %d", ident); 1865 } 1866 } 1867 1868 static int 1869 vmx_emulate_cr0_access(struct vmx_vcpu *vcpu, uint64_t exitqual) 1870 { 1871 uint64_t crval, regval; 1872 1873 /* We only handle mov to %cr0 at this time */ 1874 if ((exitqual & 0xf0) != 0x00) 1875 return (UNHANDLED); 1876 1877 regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); 1878 1879 vmcs_write(VMCS_CR0_SHADOW, regval); 1880 1881 crval = regval | cr0_ones_mask; 1882 crval &= ~cr0_zeros_mask; 1883 vmcs_write(VMCS_GUEST_CR0, crval); 1884 1885 if (regval & CR0_PG) { 1886 uint64_t efer, entry_ctls; 1887 1888 /* 1889 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1890 * the "IA-32e mode guest" bit in VM-entry control must be 1891 * equal. 1892 */ 1893 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1894 if (efer & EFER_LME) { 1895 efer |= EFER_LMA; 1896 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1897 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1898 entry_ctls |= VM_ENTRY_GUEST_LMA; 1899 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1900 } 1901 } 1902 1903 return (HANDLED); 1904 } 1905 1906 static int 1907 vmx_emulate_cr4_access(struct vmx_vcpu *vcpu, uint64_t exitqual) 1908 { 1909 uint64_t crval, regval; 1910 1911 /* We only handle mov to %cr4 at this time */ 1912 if ((exitqual & 0xf0) != 0x00) 1913 return (UNHANDLED); 1914 1915 regval = vmx_get_guest_reg(vcpu, (exitqual >> 8) & 0xf); 1916 1917 vmcs_write(VMCS_CR4_SHADOW, regval); 1918 1919 crval = regval | cr4_ones_mask; 1920 crval &= ~cr4_zeros_mask; 1921 vmcs_write(VMCS_GUEST_CR4, crval); 1922 1923 return (HANDLED); 1924 } 1925 1926 static int 1927 vmx_emulate_cr8_access(struct vmx *vmx, struct vmx_vcpu *vcpu, 1928 uint64_t exitqual) 1929 { 1930 struct vlapic *vlapic; 1931 uint64_t cr8; 1932 int regnum; 1933 1934 /* We only handle mov %cr8 to/from a register at this time. */ 1935 if ((exitqual & 0xe0) != 0x00) { 1936 return (UNHANDLED); 1937 } 1938 1939 vlapic = vm_lapic(vcpu->vcpu); 1940 regnum = (exitqual >> 8) & 0xf; 1941 if (exitqual & 0x10) { 1942 cr8 = vlapic_get_cr8(vlapic); 1943 vmx_set_guest_reg(vcpu, regnum, cr8); 1944 } else { 1945 cr8 = vmx_get_guest_reg(vcpu, regnum); 1946 vlapic_set_cr8(vlapic, cr8); 1947 } 1948 1949 return (HANDLED); 1950 } 1951 1952 /* 1953 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL 1954 */ 1955 static int 1956 vmx_cpl(void) 1957 { 1958 uint32_t ssar; 1959 1960 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); 1961 return ((ssar >> 5) & 0x3); 1962 } 1963 1964 static enum vm_cpu_mode 1965 vmx_cpu_mode(void) 1966 { 1967 uint32_t csar; 1968 1969 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { 1970 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1971 if (csar & 0x2000) 1972 return (CPU_MODE_64BIT); /* CS.L = 1 */ 1973 else 1974 return (CPU_MODE_COMPATIBILITY); 1975 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { 1976 return (CPU_MODE_PROTECTED); 1977 } else { 1978 return (CPU_MODE_REAL); 1979 } 1980 } 1981 1982 static enum vm_paging_mode 1983 vmx_paging_mode(void) 1984 { 1985 uint64_t cr4; 1986 1987 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1988 return (PAGING_MODE_FLAT); 1989 cr4 = vmcs_read(VMCS_GUEST_CR4); 1990 if (!(cr4 & CR4_PAE)) 1991 return (PAGING_MODE_32); 1992 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) { 1993 if (!(cr4 & CR4_LA57)) 1994 return (PAGING_MODE_64); 1995 return (PAGING_MODE_64_LA57); 1996 } else 1997 return (PAGING_MODE_PAE); 1998 } 1999 2000 static uint64_t 2001 inout_str_index(struct vmx_vcpu *vcpu, int in) 2002 { 2003 uint64_t val; 2004 int error __diagused; 2005 enum vm_reg_name reg; 2006 2007 reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 2008 error = vmx_getreg(vcpu, reg, &val); 2009 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 2010 return (val); 2011 } 2012 2013 static uint64_t 2014 inout_str_count(struct vmx_vcpu *vcpu, int rep) 2015 { 2016 uint64_t val; 2017 int error __diagused; 2018 2019 if (rep) { 2020 error = vmx_getreg(vcpu, VM_REG_GUEST_RCX, &val); 2021 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 2022 } else { 2023 val = 1; 2024 } 2025 return (val); 2026 } 2027 2028 static int 2029 inout_str_addrsize(uint32_t inst_info) 2030 { 2031 uint32_t size; 2032 2033 size = (inst_info >> 7) & 0x7; 2034 switch (size) { 2035 case 0: 2036 return (2); /* 16 bit */ 2037 case 1: 2038 return (4); /* 32 bit */ 2039 case 2: 2040 return (8); /* 64 bit */ 2041 default: 2042 panic("%s: invalid size encoding %d", __func__, size); 2043 } 2044 } 2045 2046 static void 2047 inout_str_seginfo(struct vmx_vcpu *vcpu, uint32_t inst_info, int in, 2048 struct vm_inout_str *vis) 2049 { 2050 int error __diagused, s; 2051 2052 if (in) { 2053 vis->seg_name = VM_REG_GUEST_ES; 2054 } else { 2055 s = (inst_info >> 15) & 0x7; 2056 vis->seg_name = vm_segment_name(s); 2057 } 2058 2059 error = vmx_getdesc(vcpu, vis->seg_name, &vis->seg_desc); 2060 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 2061 } 2062 2063 static void 2064 vmx_paging_info(struct vm_guest_paging *paging) 2065 { 2066 paging->cr3 = vmcs_guest_cr3(); 2067 paging->cpl = vmx_cpl(); 2068 paging->cpu_mode = vmx_cpu_mode(); 2069 paging->paging_mode = vmx_paging_mode(); 2070 } 2071 2072 static void 2073 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 2074 { 2075 struct vm_guest_paging *paging; 2076 uint32_t csar; 2077 2078 paging = &vmexit->u.inst_emul.paging; 2079 2080 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 2081 vmexit->inst_length = 0; 2082 vmexit->u.inst_emul.gpa = gpa; 2083 vmexit->u.inst_emul.gla = gla; 2084 vmx_paging_info(paging); 2085 switch (paging->cpu_mode) { 2086 case CPU_MODE_REAL: 2087 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2088 vmexit->u.inst_emul.cs_d = 0; 2089 break; 2090 case CPU_MODE_PROTECTED: 2091 case CPU_MODE_COMPATIBILITY: 2092 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 2093 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 2094 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 2095 break; 2096 default: 2097 vmexit->u.inst_emul.cs_base = 0; 2098 vmexit->u.inst_emul.cs_d = 0; 2099 break; 2100 } 2101 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 2102 } 2103 2104 static int 2105 ept_fault_type(uint64_t ept_qual) 2106 { 2107 int fault_type; 2108 2109 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 2110 fault_type = VM_PROT_WRITE; 2111 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 2112 fault_type = VM_PROT_EXECUTE; 2113 else 2114 fault_type= VM_PROT_READ; 2115 2116 return (fault_type); 2117 } 2118 2119 static bool 2120 ept_emulation_fault(uint64_t ept_qual) 2121 { 2122 int read, write; 2123 2124 /* EPT fault on an instruction fetch doesn't make sense here */ 2125 if (ept_qual & EPT_VIOLATION_INST_FETCH) 2126 return (false); 2127 2128 /* EPT fault must be a read fault or a write fault */ 2129 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 2130 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 2131 if ((read | write) == 0) 2132 return (false); 2133 2134 /* 2135 * The EPT violation must have been caused by accessing a 2136 * guest-physical address that is a translation of a guest-linear 2137 * address. 2138 */ 2139 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 2140 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 2141 return (false); 2142 } 2143 2144 return (true); 2145 } 2146 2147 static __inline int 2148 apic_access_virtualization(struct vmx_vcpu *vcpu) 2149 { 2150 uint32_t proc_ctls2; 2151 2152 proc_ctls2 = vcpu->cap.proc_ctls2; 2153 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 2154 } 2155 2156 static __inline int 2157 x2apic_virtualization(struct vmx_vcpu *vcpu) 2158 { 2159 uint32_t proc_ctls2; 2160 2161 proc_ctls2 = vcpu->cap.proc_ctls2; 2162 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 2163 } 2164 2165 static int 2166 vmx_handle_apic_write(struct vmx_vcpu *vcpu, struct vlapic *vlapic, 2167 uint64_t qual) 2168 { 2169 int error, handled, offset; 2170 uint32_t *apic_regs, vector; 2171 bool retu; 2172 2173 handled = HANDLED; 2174 offset = APIC_WRITE_OFFSET(qual); 2175 2176 if (!apic_access_virtualization(vcpu)) { 2177 /* 2178 * In general there should not be any APIC write VM-exits 2179 * unless APIC-access virtualization is enabled. 2180 * 2181 * However self-IPI virtualization can legitimately trigger 2182 * an APIC-write VM-exit so treat it specially. 2183 */ 2184 if (x2apic_virtualization(vcpu) && 2185 offset == APIC_OFFSET_SELF_IPI) { 2186 apic_regs = (uint32_t *)(vlapic->apic_page); 2187 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2188 vlapic_self_ipi_handler(vlapic, vector); 2189 return (HANDLED); 2190 } else 2191 return (UNHANDLED); 2192 } 2193 2194 switch (offset) { 2195 case APIC_OFFSET_ID: 2196 vlapic_id_write_handler(vlapic); 2197 break; 2198 case APIC_OFFSET_LDR: 2199 vlapic_ldr_write_handler(vlapic); 2200 break; 2201 case APIC_OFFSET_DFR: 2202 vlapic_dfr_write_handler(vlapic); 2203 break; 2204 case APIC_OFFSET_SVR: 2205 vlapic_svr_write_handler(vlapic); 2206 break; 2207 case APIC_OFFSET_ESR: 2208 vlapic_esr_write_handler(vlapic); 2209 break; 2210 case APIC_OFFSET_ICR_LOW: 2211 retu = false; 2212 error = vlapic_icrlo_write_handler(vlapic, &retu); 2213 if (error != 0 || retu) 2214 handled = UNHANDLED; 2215 break; 2216 case APIC_OFFSET_CMCI_LVT: 2217 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2218 vlapic_lvt_write_handler(vlapic, offset); 2219 break; 2220 case APIC_OFFSET_TIMER_ICR: 2221 vlapic_icrtmr_write_handler(vlapic); 2222 break; 2223 case APIC_OFFSET_TIMER_DCR: 2224 vlapic_dcr_write_handler(vlapic); 2225 break; 2226 default: 2227 handled = UNHANDLED; 2228 break; 2229 } 2230 return (handled); 2231 } 2232 2233 static bool 2234 apic_access_fault(struct vmx_vcpu *vcpu, uint64_t gpa) 2235 { 2236 2237 if (apic_access_virtualization(vcpu) && 2238 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2239 return (true); 2240 else 2241 return (false); 2242 } 2243 2244 static int 2245 vmx_handle_apic_access(struct vmx_vcpu *vcpu, struct vm_exit *vmexit) 2246 { 2247 uint64_t qual; 2248 int access_type, offset, allowed; 2249 2250 if (!apic_access_virtualization(vcpu)) 2251 return (UNHANDLED); 2252 2253 qual = vmexit->u.vmx.exit_qualification; 2254 access_type = APIC_ACCESS_TYPE(qual); 2255 offset = APIC_ACCESS_OFFSET(qual); 2256 2257 allowed = 0; 2258 if (access_type == 0) { 2259 /* 2260 * Read data access to the following registers is expected. 2261 */ 2262 switch (offset) { 2263 case APIC_OFFSET_APR: 2264 case APIC_OFFSET_PPR: 2265 case APIC_OFFSET_RRR: 2266 case APIC_OFFSET_CMCI_LVT: 2267 case APIC_OFFSET_TIMER_CCR: 2268 allowed = 1; 2269 break; 2270 default: 2271 break; 2272 } 2273 } else if (access_type == 1) { 2274 /* 2275 * Write data access to the following registers is expected. 2276 */ 2277 switch (offset) { 2278 case APIC_OFFSET_VER: 2279 case APIC_OFFSET_APR: 2280 case APIC_OFFSET_PPR: 2281 case APIC_OFFSET_RRR: 2282 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2283 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2284 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2285 case APIC_OFFSET_CMCI_LVT: 2286 case APIC_OFFSET_TIMER_CCR: 2287 allowed = 1; 2288 break; 2289 default: 2290 break; 2291 } 2292 } 2293 2294 if (allowed) { 2295 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2296 VIE_INVALID_GLA); 2297 } 2298 2299 /* 2300 * Regardless of whether the APIC-access is allowed this handler 2301 * always returns UNHANDLED: 2302 * - if the access is allowed then it is handled by emulating the 2303 * instruction that caused the VM-exit (outside the critical section) 2304 * - if the access is not allowed then it will be converted to an 2305 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2306 */ 2307 return (UNHANDLED); 2308 } 2309 2310 static enum task_switch_reason 2311 vmx_task_switch_reason(uint64_t qual) 2312 { 2313 int reason; 2314 2315 reason = (qual >> 30) & 0x3; 2316 switch (reason) { 2317 case 0: 2318 return (TSR_CALL); 2319 case 1: 2320 return (TSR_IRET); 2321 case 2: 2322 return (TSR_JMP); 2323 case 3: 2324 return (TSR_IDT_GATE); 2325 default: 2326 panic("%s: invalid reason %d", __func__, reason); 2327 } 2328 } 2329 2330 static int 2331 emulate_wrmsr(struct vmx_vcpu *vcpu, u_int num, uint64_t val, bool *retu) 2332 { 2333 int error; 2334 2335 if (lapic_msr(num)) 2336 error = lapic_wrmsr(vcpu->vcpu, num, val, retu); 2337 else 2338 error = vmx_wrmsr(vcpu, num, val, retu); 2339 2340 return (error); 2341 } 2342 2343 static int 2344 emulate_rdmsr(struct vmx_vcpu *vcpu, u_int num, bool *retu) 2345 { 2346 struct vmxctx *vmxctx; 2347 uint64_t result; 2348 uint32_t eax, edx; 2349 int error; 2350 2351 if (lapic_msr(num)) 2352 error = lapic_rdmsr(vcpu->vcpu, num, &result, retu); 2353 else 2354 error = vmx_rdmsr(vcpu, num, &result, retu); 2355 2356 if (error == 0) { 2357 eax = result; 2358 vmxctx = &vcpu->ctx; 2359 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2360 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2361 2362 edx = result >> 32; 2363 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2364 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2365 } 2366 2367 return (error); 2368 } 2369 2370 static int 2371 vmx_exit_process(struct vmx *vmx, struct vmx_vcpu *vcpu, struct vm_exit *vmexit) 2372 { 2373 int error, errcode, errcode_valid, handled, in; 2374 struct vmxctx *vmxctx; 2375 struct vlapic *vlapic; 2376 struct vm_inout_str *vis; 2377 struct vm_task_switch *ts; 2378 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2379 uint32_t intr_type, intr_vec, reason; 2380 uint64_t exitintinfo, qual, gpa; 2381 #ifdef KDTRACE_HOOKS 2382 int vcpuid; 2383 #endif 2384 bool retu; 2385 2386 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2387 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2388 2389 handled = UNHANDLED; 2390 vmxctx = &vcpu->ctx; 2391 #ifdef KDTRACE_HOOKS 2392 vcpuid = vcpu->vcpuid; 2393 #endif 2394 2395 qual = vmexit->u.vmx.exit_qualification; 2396 reason = vmexit->u.vmx.exit_reason; 2397 vmexit->exitcode = VM_EXITCODE_BOGUS; 2398 2399 vmm_stat_incr(vcpu->vcpu, VMEXIT_COUNT, 1); 2400 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpuid, vmexit); 2401 2402 /* 2403 * VM-entry failures during or after loading guest state. 2404 * 2405 * These VM-exits are uncommon but must be handled specially 2406 * as most VM-exit fields are not populated as usual. 2407 */ 2408 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2409 VMX_CTR0(vcpu, "Handling MCE during VM-entry"); 2410 __asm __volatile("int $18"); 2411 return (1); 2412 } 2413 2414 /* 2415 * VM exits that can be triggered during event delivery need to 2416 * be handled specially by re-injecting the event if the IDT 2417 * vectoring information field's valid bit is set. 2418 * 2419 * See "Information for VM Exits During Event Delivery" in Intel SDM 2420 * for details. 2421 */ 2422 idtvec_info = vmcs_idt_vectoring_info(); 2423 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2424 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2425 exitintinfo = idtvec_info; 2426 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2427 idtvec_err = vmcs_idt_vectoring_err(); 2428 exitintinfo |= (uint64_t)idtvec_err << 32; 2429 } 2430 error = vm_exit_intinfo(vcpu->vcpu, exitintinfo); 2431 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2432 __func__, error)); 2433 2434 /* 2435 * If 'virtual NMIs' are being used and the VM-exit 2436 * happened while injecting an NMI during the previous 2437 * VM-entry, then clear "blocking by NMI" in the 2438 * Guest Interruptibility-State so the NMI can be 2439 * reinjected on the subsequent VM-entry. 2440 * 2441 * However, if the NMI was being delivered through a task 2442 * gate, then the new task must start execution with NMIs 2443 * blocked so don't clear NMI blocking in this case. 2444 */ 2445 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2446 if (intr_type == VMCS_INTR_T_NMI) { 2447 if (reason != EXIT_REASON_TASK_SWITCH) 2448 vmx_clear_nmi_blocking(vcpu); 2449 else 2450 vmx_assert_nmi_blocking(vcpu); 2451 } 2452 2453 /* 2454 * Update VM-entry instruction length if the event being 2455 * delivered was a software interrupt or software exception. 2456 */ 2457 if (intr_type == VMCS_INTR_T_SWINTR || 2458 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2459 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2460 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2461 } 2462 } 2463 2464 switch (reason) { 2465 case EXIT_REASON_TASK_SWITCH: 2466 ts = &vmexit->u.task_switch; 2467 ts->tsssel = qual & 0xffff; 2468 ts->reason = vmx_task_switch_reason(qual); 2469 ts->ext = 0; 2470 ts->errcode_valid = 0; 2471 vmx_paging_info(&ts->paging); 2472 /* 2473 * If the task switch was due to a CALL, JMP, IRET, software 2474 * interrupt (INT n) or software exception (INT3, INTO), 2475 * then the saved %rip references the instruction that caused 2476 * the task switch. The instruction length field in the VMCS 2477 * is valid in this case. 2478 * 2479 * In all other cases (e.g., NMI, hardware exception) the 2480 * saved %rip is one that would have been saved in the old TSS 2481 * had the task switch completed normally so the instruction 2482 * length field is not needed in this case and is explicitly 2483 * set to 0. 2484 */ 2485 if (ts->reason == TSR_IDT_GATE) { 2486 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2487 ("invalid idtvec_info %#x for IDT task switch", 2488 idtvec_info)); 2489 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2490 if (intr_type != VMCS_INTR_T_SWINTR && 2491 intr_type != VMCS_INTR_T_SWEXCEPTION && 2492 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2493 /* Task switch triggered by external event */ 2494 ts->ext = 1; 2495 vmexit->inst_length = 0; 2496 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2497 ts->errcode_valid = 1; 2498 ts->errcode = vmcs_idt_vectoring_err(); 2499 } 2500 } 2501 } 2502 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2503 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpuid, vmexit, ts); 2504 VMX_CTR4(vcpu, "task switch reason %d, tss 0x%04x, " 2505 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2506 ts->ext ? "external" : "internal", 2507 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2508 break; 2509 case EXIT_REASON_CR_ACCESS: 2510 vmm_stat_incr(vcpu->vcpu, VMEXIT_CR_ACCESS, 1); 2511 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpuid, vmexit, qual); 2512 switch (qual & 0xf) { 2513 case 0: 2514 handled = vmx_emulate_cr0_access(vcpu, qual); 2515 break; 2516 case 4: 2517 handled = vmx_emulate_cr4_access(vcpu, qual); 2518 break; 2519 case 8: 2520 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2521 break; 2522 } 2523 break; 2524 case EXIT_REASON_RDMSR: 2525 vmm_stat_incr(vcpu->vcpu, VMEXIT_RDMSR, 1); 2526 retu = false; 2527 ecx = vmxctx->guest_rcx; 2528 VMX_CTR1(vcpu, "rdmsr 0x%08x", ecx); 2529 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpuid, vmexit, ecx); 2530 error = emulate_rdmsr(vcpu, ecx, &retu); 2531 if (error) { 2532 vmexit->exitcode = VM_EXITCODE_RDMSR; 2533 vmexit->u.msr.code = ecx; 2534 } else if (!retu) { 2535 handled = HANDLED; 2536 } else { 2537 /* Return to userspace with a valid exitcode */ 2538 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2539 ("emulate_rdmsr retu with bogus exitcode")); 2540 } 2541 break; 2542 case EXIT_REASON_WRMSR: 2543 vmm_stat_incr(vcpu->vcpu, VMEXIT_WRMSR, 1); 2544 retu = false; 2545 eax = vmxctx->guest_rax; 2546 ecx = vmxctx->guest_rcx; 2547 edx = vmxctx->guest_rdx; 2548 VMX_CTR2(vcpu, "wrmsr 0x%08x value 0x%016lx", 2549 ecx, (uint64_t)edx << 32 | eax); 2550 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpuid, ecx, 2551 (uint64_t)edx << 32 | eax); 2552 error = emulate_wrmsr(vcpu, ecx, (uint64_t)edx << 32 | eax, 2553 &retu); 2554 if (error) { 2555 vmexit->exitcode = VM_EXITCODE_WRMSR; 2556 vmexit->u.msr.code = ecx; 2557 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2558 } else if (!retu) { 2559 handled = HANDLED; 2560 } else { 2561 /* Return to userspace with a valid exitcode */ 2562 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2563 ("emulate_wrmsr retu with bogus exitcode")); 2564 } 2565 break; 2566 case EXIT_REASON_HLT: 2567 vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1); 2568 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpuid, vmexit); 2569 vmexit->exitcode = VM_EXITCODE_HLT; 2570 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2571 if (virtual_interrupt_delivery) 2572 vmexit->u.hlt.intr_status = 2573 vmcs_read(VMCS_GUEST_INTR_STATUS); 2574 else 2575 vmexit->u.hlt.intr_status = 0; 2576 break; 2577 case EXIT_REASON_MTF: 2578 vmm_stat_incr(vcpu->vcpu, VMEXIT_MTRAP, 1); 2579 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpuid, vmexit); 2580 vmexit->exitcode = VM_EXITCODE_MTRAP; 2581 vmexit->inst_length = 0; 2582 break; 2583 case EXIT_REASON_PAUSE: 2584 vmm_stat_incr(vcpu->vcpu, VMEXIT_PAUSE, 1); 2585 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpuid, vmexit); 2586 vmexit->exitcode = VM_EXITCODE_PAUSE; 2587 break; 2588 case EXIT_REASON_INTR_WINDOW: 2589 vmm_stat_incr(vcpu->vcpu, VMEXIT_INTR_WINDOW, 1); 2590 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpuid, vmexit); 2591 vmx_clear_int_window_exiting(vcpu); 2592 return (1); 2593 case EXIT_REASON_EXT_INTR: 2594 /* 2595 * External interrupts serve only to cause VM exits and allow 2596 * the host interrupt handler to run. 2597 * 2598 * If this external interrupt triggers a virtual interrupt 2599 * to a VM, then that state will be recorded by the 2600 * host interrupt handler in the VM's softc. We will inject 2601 * this virtual interrupt during the subsequent VM enter. 2602 */ 2603 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2604 SDT_PROBE4(vmm, vmx, exit, interrupt, 2605 vmx, vcpuid, vmexit, intr_info); 2606 2607 /* 2608 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2609 * This appears to be a bug in VMware Fusion? 2610 */ 2611 if (!(intr_info & VMCS_INTR_VALID)) 2612 return (1); 2613 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2614 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2615 ("VM exit interruption info invalid: %#x", intr_info)); 2616 vmx_trigger_hostintr(intr_info & 0xff); 2617 2618 /* 2619 * This is special. We want to treat this as an 'handled' 2620 * VM-exit but not increment the instruction pointer. 2621 */ 2622 vmm_stat_incr(vcpu->vcpu, VMEXIT_EXTINT, 1); 2623 return (1); 2624 case EXIT_REASON_NMI_WINDOW: 2625 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpuid, vmexit); 2626 /* Exit to allow the pending virtual NMI to be injected */ 2627 if (vm_nmi_pending(vcpu->vcpu)) 2628 vmx_inject_nmi(vcpu); 2629 vmx_clear_nmi_window_exiting(vcpu); 2630 vmm_stat_incr(vcpu->vcpu, VMEXIT_NMI_WINDOW, 1); 2631 return (1); 2632 case EXIT_REASON_INOUT: 2633 vmm_stat_incr(vcpu->vcpu, VMEXIT_INOUT, 1); 2634 vmexit->exitcode = VM_EXITCODE_INOUT; 2635 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2636 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2637 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2638 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 2639 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2640 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2641 if (vmexit->u.inout.string) { 2642 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2643 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2644 vis = &vmexit->u.inout_str; 2645 vmx_paging_info(&vis->paging); 2646 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2647 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2648 vis->index = inout_str_index(vcpu, in); 2649 vis->count = inout_str_count(vcpu, vis->inout.rep); 2650 vis->addrsize = inout_str_addrsize(inst_info); 2651 inout_str_seginfo(vcpu, inst_info, in, vis); 2652 } 2653 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpuid, vmexit); 2654 break; 2655 case EXIT_REASON_CPUID: 2656 vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1); 2657 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpuid, vmexit); 2658 handled = vmx_handle_cpuid(vcpu, vmxctx); 2659 break; 2660 case EXIT_REASON_EXCEPTION: 2661 vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1); 2662 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2663 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2664 ("VM exit interruption info invalid: %#x", intr_info)); 2665 2666 intr_vec = intr_info & 0xff; 2667 intr_type = intr_info & VMCS_INTR_T_MASK; 2668 2669 /* 2670 * If Virtual NMIs control is 1 and the VM-exit is due to a 2671 * fault encountered during the execution of IRET then we must 2672 * restore the state of "virtual-NMI blocking" before resuming 2673 * the guest. 2674 * 2675 * See "Resuming Guest Software after Handling an Exception". 2676 * See "Information for VM Exits Due to Vectored Events". 2677 */ 2678 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2679 (intr_vec != IDT_DF) && 2680 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2681 vmx_restore_nmi_blocking(vcpu); 2682 2683 /* 2684 * The NMI has already been handled in vmx_exit_handle_nmi(). 2685 */ 2686 if (intr_type == VMCS_INTR_T_NMI) 2687 return (1); 2688 2689 /* 2690 * Call the machine check handler by hand. Also don't reflect 2691 * the machine check back into the guest. 2692 */ 2693 if (intr_vec == IDT_MC) { 2694 VMX_CTR0(vcpu, "Vectoring to MCE handler"); 2695 __asm __volatile("int $18"); 2696 return (1); 2697 } 2698 2699 /* 2700 * If the hypervisor has requested user exits for 2701 * debug exceptions, bounce them out to userland. 2702 */ 2703 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && 2704 (vcpu->cap.set & (1 << VM_CAP_BPT_EXIT))) { 2705 vmexit->exitcode = VM_EXITCODE_BPT; 2706 vmexit->u.bpt.inst_length = vmexit->inst_length; 2707 vmexit->inst_length = 0; 2708 break; 2709 } 2710 2711 if (intr_vec == IDT_PF) { 2712 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2713 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2714 __func__, error)); 2715 } 2716 2717 /* 2718 * Software exceptions exhibit trap-like behavior. This in 2719 * turn requires populating the VM-entry instruction length 2720 * so that the %rip in the trap frame is past the INT3/INTO 2721 * instruction. 2722 */ 2723 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2724 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2725 2726 /* Reflect all other exceptions back into the guest */ 2727 errcode_valid = errcode = 0; 2728 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2729 errcode_valid = 1; 2730 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2731 } 2732 VMX_CTR2(vcpu, "Reflecting exception %d/%#x into " 2733 "the guest", intr_vec, errcode); 2734 SDT_PROBE5(vmm, vmx, exit, exception, 2735 vmx, vcpuid, vmexit, intr_vec, errcode); 2736 error = vm_inject_exception(vcpu->vcpu, intr_vec, 2737 errcode_valid, errcode, 0); 2738 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2739 __func__, error)); 2740 return (1); 2741 2742 case EXIT_REASON_EPT_FAULT: 2743 /* 2744 * If 'gpa' lies within the address space allocated to 2745 * memory then this must be a nested page fault otherwise 2746 * this must be an instruction that accesses MMIO space. 2747 */ 2748 gpa = vmcs_gpa(); 2749 if (vm_mem_allocated(vcpu->vcpu, gpa) || 2750 apic_access_fault(vcpu, gpa)) { 2751 vmexit->exitcode = VM_EXITCODE_PAGING; 2752 vmexit->inst_length = 0; 2753 vmexit->u.paging.gpa = gpa; 2754 vmexit->u.paging.fault_type = ept_fault_type(qual); 2755 vmm_stat_incr(vcpu->vcpu, VMEXIT_NESTED_FAULT, 1); 2756 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2757 vmx, vcpuid, vmexit, gpa, qual); 2758 } else if (ept_emulation_fault(qual)) { 2759 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2760 vmm_stat_incr(vcpu->vcpu, VMEXIT_INST_EMUL, 1); 2761 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2762 vmx, vcpuid, vmexit, gpa); 2763 } 2764 /* 2765 * If Virtual NMIs control is 1 and the VM-exit is due to an 2766 * EPT fault during the execution of IRET then we must restore 2767 * the state of "virtual-NMI blocking" before resuming. 2768 * 2769 * See description of "NMI unblocking due to IRET" in 2770 * "Exit Qualification for EPT Violations". 2771 */ 2772 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2773 (qual & EXIT_QUAL_NMIUDTI) != 0) 2774 vmx_restore_nmi_blocking(vcpu); 2775 break; 2776 case EXIT_REASON_VIRTUALIZED_EOI: 2777 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2778 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2779 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpuid, vmexit); 2780 vmexit->inst_length = 0; /* trap-like */ 2781 break; 2782 case EXIT_REASON_APIC_ACCESS: 2783 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpuid, vmexit); 2784 handled = vmx_handle_apic_access(vcpu, vmexit); 2785 break; 2786 case EXIT_REASON_APIC_WRITE: 2787 /* 2788 * APIC-write VM exit is trap-like so the %rip is already 2789 * pointing to the next instruction. 2790 */ 2791 vmexit->inst_length = 0; 2792 vlapic = vm_lapic(vcpu->vcpu); 2793 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2794 vmx, vcpuid, vmexit, vlapic); 2795 handled = vmx_handle_apic_write(vcpu, vlapic, qual); 2796 break; 2797 case EXIT_REASON_XSETBV: 2798 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpuid, vmexit); 2799 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2800 break; 2801 case EXIT_REASON_MONITOR: 2802 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpuid, vmexit); 2803 vmexit->exitcode = VM_EXITCODE_MONITOR; 2804 break; 2805 case EXIT_REASON_MWAIT: 2806 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpuid, vmexit); 2807 vmexit->exitcode = VM_EXITCODE_MWAIT; 2808 break; 2809 case EXIT_REASON_TPR: 2810 vlapic = vm_lapic(vcpu->vcpu); 2811 vlapic_sync_tpr(vlapic); 2812 vmexit->inst_length = 0; 2813 handled = HANDLED; 2814 break; 2815 case EXIT_REASON_VMCALL: 2816 case EXIT_REASON_VMCLEAR: 2817 case EXIT_REASON_VMLAUNCH: 2818 case EXIT_REASON_VMPTRLD: 2819 case EXIT_REASON_VMPTRST: 2820 case EXIT_REASON_VMREAD: 2821 case EXIT_REASON_VMRESUME: 2822 case EXIT_REASON_VMWRITE: 2823 case EXIT_REASON_VMXOFF: 2824 case EXIT_REASON_VMXON: 2825 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpuid, vmexit); 2826 vmexit->exitcode = VM_EXITCODE_VMINSN; 2827 break; 2828 case EXIT_REASON_INVD: 2829 case EXIT_REASON_WBINVD: 2830 /* ignore exit */ 2831 handled = HANDLED; 2832 break; 2833 default: 2834 SDT_PROBE4(vmm, vmx, exit, unknown, 2835 vmx, vcpuid, vmexit, reason); 2836 vmm_stat_incr(vcpu->vcpu, VMEXIT_UNKNOWN, 1); 2837 break; 2838 } 2839 2840 if (handled) { 2841 /* 2842 * It is possible that control is returned to userland 2843 * even though we were able to handle the VM exit in the 2844 * kernel. 2845 * 2846 * In such a case we want to make sure that the userland 2847 * restarts guest execution at the instruction *after* 2848 * the one we just processed. Therefore we update the 2849 * guest rip in the VMCS and in 'vmexit'. 2850 */ 2851 vmexit->rip += vmexit->inst_length; 2852 vmexit->inst_length = 0; 2853 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2854 } else { 2855 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2856 /* 2857 * If this VM exit was not claimed by anybody then 2858 * treat it as a generic VMX exit. 2859 */ 2860 vmexit->exitcode = VM_EXITCODE_VMX; 2861 vmexit->u.vmx.status = VM_SUCCESS; 2862 vmexit->u.vmx.inst_type = 0; 2863 vmexit->u.vmx.inst_error = 0; 2864 } else { 2865 /* 2866 * The exitcode and collateral have been populated. 2867 * The VM exit will be processed further in userland. 2868 */ 2869 } 2870 } 2871 2872 SDT_PROBE4(vmm, vmx, exit, return, 2873 vmx, vcpuid, vmexit, handled); 2874 return (handled); 2875 } 2876 2877 static __inline void 2878 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2879 { 2880 2881 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2882 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2883 vmxctx->inst_fail_status)); 2884 2885 vmexit->inst_length = 0; 2886 vmexit->exitcode = VM_EXITCODE_VMX; 2887 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2888 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2889 vmexit->u.vmx.exit_reason = ~0; 2890 vmexit->u.vmx.exit_qualification = ~0; 2891 2892 switch (rc) { 2893 case VMX_VMRESUME_ERROR: 2894 case VMX_VMLAUNCH_ERROR: 2895 vmexit->u.vmx.inst_type = rc; 2896 break; 2897 default: 2898 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2899 } 2900 } 2901 2902 /* 2903 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2904 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2905 * sufficient to simply vector to the NMI handler via a software interrupt. 2906 * However, this must be done before maskable interrupts are enabled 2907 * otherwise the "iret" issued by an interrupt handler will incorrectly 2908 * clear NMI blocking. 2909 */ 2910 static __inline void 2911 vmx_exit_handle_nmi(struct vmx_vcpu *vcpu, struct vm_exit *vmexit) 2912 { 2913 uint32_t intr_info; 2914 2915 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2916 2917 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2918 return; 2919 2920 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2921 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2922 ("VM exit interruption info invalid: %#x", intr_info)); 2923 2924 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2925 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2926 "to NMI has invalid vector: %#x", intr_info)); 2927 VMX_CTR0(vcpu, "Vectoring to NMI handler"); 2928 __asm __volatile("int $2"); 2929 } 2930 } 2931 2932 static __inline void 2933 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2934 { 2935 register_t rflags; 2936 2937 /* Save host control debug registers. */ 2938 vmxctx->host_dr7 = rdr7(); 2939 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2940 2941 /* 2942 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2943 * exceptions in the host based on the guest DRx values. The 2944 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2945 */ 2946 load_dr7(0); 2947 wrmsr(MSR_DEBUGCTLMSR, 0); 2948 2949 /* 2950 * Disable single stepping the kernel to avoid corrupting the 2951 * guest DR6. A debugger might still be able to corrupt the 2952 * guest DR6 by setting a breakpoint after this point and then 2953 * single stepping. 2954 */ 2955 rflags = read_rflags(); 2956 vmxctx->host_tf = rflags & PSL_T; 2957 write_rflags(rflags & ~PSL_T); 2958 2959 /* Save host debug registers. */ 2960 vmxctx->host_dr0 = rdr0(); 2961 vmxctx->host_dr1 = rdr1(); 2962 vmxctx->host_dr2 = rdr2(); 2963 vmxctx->host_dr3 = rdr3(); 2964 vmxctx->host_dr6 = rdr6(); 2965 2966 /* Restore guest debug registers. */ 2967 load_dr0(vmxctx->guest_dr0); 2968 load_dr1(vmxctx->guest_dr1); 2969 load_dr2(vmxctx->guest_dr2); 2970 load_dr3(vmxctx->guest_dr3); 2971 load_dr6(vmxctx->guest_dr6); 2972 } 2973 2974 static __inline void 2975 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2976 { 2977 2978 /* Save guest debug registers. */ 2979 vmxctx->guest_dr0 = rdr0(); 2980 vmxctx->guest_dr1 = rdr1(); 2981 vmxctx->guest_dr2 = rdr2(); 2982 vmxctx->guest_dr3 = rdr3(); 2983 vmxctx->guest_dr6 = rdr6(); 2984 2985 /* 2986 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2987 * PSL_T last. 2988 */ 2989 load_dr0(vmxctx->host_dr0); 2990 load_dr1(vmxctx->host_dr1); 2991 load_dr2(vmxctx->host_dr2); 2992 load_dr3(vmxctx->host_dr3); 2993 load_dr6(vmxctx->host_dr6); 2994 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2995 load_dr7(vmxctx->host_dr7); 2996 write_rflags(read_rflags() | vmxctx->host_tf); 2997 } 2998 2999 static __inline void 3000 vmx_pmap_activate(struct vmx *vmx, pmap_t pmap) 3001 { 3002 long eptgen; 3003 int cpu; 3004 3005 cpu = curcpu; 3006 3007 CPU_SET_ATOMIC(cpu, &pmap->pm_active); 3008 smr_enter(pmap->pm_eptsmr); 3009 eptgen = atomic_load_long(&pmap->pm_eptgen); 3010 if (eptgen != vmx->eptgen[cpu]) { 3011 vmx->eptgen[cpu] = eptgen; 3012 invept(INVEPT_TYPE_SINGLE_CONTEXT, 3013 (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 }); 3014 } 3015 } 3016 3017 static __inline void 3018 vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap) 3019 { 3020 smr_exit(pmap->pm_eptsmr); 3021 CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); 3022 } 3023 3024 static int 3025 vmx_run(void *vcpui, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) 3026 { 3027 int rc, handled, launched; 3028 struct vmx *vmx; 3029 struct vmx_vcpu *vcpu; 3030 struct vmxctx *vmxctx; 3031 struct vmcs *vmcs; 3032 struct vm_exit *vmexit; 3033 struct vlapic *vlapic; 3034 uint32_t exit_reason; 3035 struct region_descriptor gdtr, idtr; 3036 uint16_t ldt_sel; 3037 3038 vcpu = vcpui; 3039 vmx = vcpu->vmx; 3040 vmcs = vcpu->vmcs; 3041 vmxctx = &vcpu->ctx; 3042 vlapic = vm_lapic(vcpu->vcpu); 3043 vmexit = vm_exitinfo(vcpu->vcpu); 3044 launched = 0; 3045 3046 KASSERT(vmxctx->pmap == pmap, 3047 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 3048 3049 vmx_msr_guest_enter(vcpu); 3050 3051 VMPTRLD(vmcs); 3052 3053 /* 3054 * XXX 3055 * We do this every time because we may setup the virtual machine 3056 * from a different process than the one that actually runs it. 3057 * 3058 * If the life of a virtual machine was spent entirely in the context 3059 * of a single process we could do this once in vmx_init(). 3060 */ 3061 vmcs_write(VMCS_HOST_CR3, rcr3()); 3062 3063 vmcs_write(VMCS_GUEST_RIP, rip); 3064 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 3065 do { 3066 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 3067 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 3068 3069 handled = UNHANDLED; 3070 /* 3071 * Interrupts are disabled from this point on until the 3072 * guest starts executing. This is done for the following 3073 * reasons: 3074 * 3075 * If an AST is asserted on this thread after the check below, 3076 * then the IPI_AST notification will not be lost, because it 3077 * will cause a VM exit due to external interrupt as soon as 3078 * the guest state is loaded. 3079 * 3080 * A posted interrupt after 'vmx_inject_interrupts()' will 3081 * not be "lost" because it will be held pending in the host 3082 * APIC because interrupts are disabled. The pending interrupt 3083 * will be recognized as soon as the guest state is loaded. 3084 * 3085 * The same reasoning applies to the IPI generated by 3086 * pmap_invalidate_ept(). 3087 */ 3088 disable_intr(); 3089 vmx_inject_interrupts(vcpu, vlapic, rip); 3090 3091 /* 3092 * Check for vcpu suspension after injecting events because 3093 * vmx_inject_interrupts() can suspend the vcpu due to a 3094 * triple fault. 3095 */ 3096 if (vcpu_suspended(evinfo)) { 3097 enable_intr(); 3098 vm_exit_suspended(vcpu->vcpu, rip); 3099 break; 3100 } 3101 3102 if (vcpu_rendezvous_pending(vcpu->vcpu, evinfo)) { 3103 enable_intr(); 3104 vm_exit_rendezvous(vcpu->vcpu, rip); 3105 break; 3106 } 3107 3108 if (vcpu_reqidle(evinfo)) { 3109 enable_intr(); 3110 vm_exit_reqidle(vcpu->vcpu, rip); 3111 break; 3112 } 3113 3114 if (vcpu_should_yield(vcpu->vcpu)) { 3115 enable_intr(); 3116 vm_exit_astpending(vcpu->vcpu, rip); 3117 vmx_astpending_trace(vcpu, rip); 3118 handled = HANDLED; 3119 break; 3120 } 3121 3122 if (vcpu_debugged(vcpu->vcpu)) { 3123 enable_intr(); 3124 vm_exit_debug(vcpu->vcpu, rip); 3125 break; 3126 } 3127 3128 /* 3129 * If TPR Shadowing is enabled, the TPR Threshold 3130 * must be updated right before entering the guest. 3131 */ 3132 if (tpr_shadowing && !virtual_interrupt_delivery) { 3133 if ((vcpu->cap.proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) { 3134 vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); 3135 } 3136 } 3137 3138 /* 3139 * VM exits restore the base address but not the 3140 * limits of GDTR and IDTR. The VMCS only stores the 3141 * base address, so VM exits set the limits to 0xffff. 3142 * Save and restore the full GDTR and IDTR to restore 3143 * the limits. 3144 * 3145 * The VMCS does not save the LDTR at all, and VM 3146 * exits clear LDTR as if a NULL selector were loaded. 3147 * The userspace hypervisor probably doesn't use a 3148 * LDT, but save and restore it to be safe. 3149 */ 3150 sgdt(&gdtr); 3151 sidt(&idtr); 3152 ldt_sel = sldt(); 3153 3154 /* 3155 * The TSC_AUX MSR must be saved/restored while interrupts 3156 * are disabled so that it is not possible for the guest 3157 * TSC_AUX MSR value to be overwritten by the resume 3158 * portion of the IPI_SUSPEND codepath. This is why the 3159 * transition of this MSR is handled separately from those 3160 * handled by vmx_msr_guest_{enter,exit}(), which are ok to 3161 * be transitioned with preemption disabled but interrupts 3162 * enabled. 3163 * 3164 * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be 3165 * anywhere in this loop so long as they happen with 3166 * interrupts disabled. This location is chosen for 3167 * simplicity. 3168 */ 3169 vmx_msr_guest_enter_tsc_aux(vmx, vcpu); 3170 3171 vmx_dr_enter_guest(vmxctx); 3172 3173 /* 3174 * Mark the EPT as active on this host CPU and invalidate 3175 * EPTP-tagged TLB entries if required. 3176 */ 3177 vmx_pmap_activate(vmx, pmap); 3178 3179 vmx_run_trace(vcpu); 3180 rc = vmx_enter_guest(vmxctx, vmx, launched); 3181 3182 vmx_pmap_deactivate(vmx, pmap); 3183 vmx_dr_leave_guest(vmxctx); 3184 vmx_msr_guest_exit_tsc_aux(vmx, vcpu); 3185 3186 bare_lgdt(&gdtr); 3187 lidt(&idtr); 3188 lldt(ldt_sel); 3189 3190 /* Collect some information for VM exit processing */ 3191 vmexit->rip = rip = vmcs_guest_rip(); 3192 vmexit->inst_length = vmexit_instruction_length(); 3193 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 3194 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 3195 3196 /* Update 'nextrip' */ 3197 vcpu->state.nextrip = rip; 3198 3199 if (rc == VMX_GUEST_VMEXIT) { 3200 vmx_exit_handle_nmi(vcpu, vmexit); 3201 enable_intr(); 3202 handled = vmx_exit_process(vmx, vcpu, vmexit); 3203 } else { 3204 enable_intr(); 3205 vmx_exit_inst_error(vmxctx, rc, vmexit); 3206 } 3207 launched = 1; 3208 vmx_exit_trace(vcpu, rip, exit_reason, handled); 3209 rip = vmexit->rip; 3210 } while (handled); 3211 3212 /* 3213 * If a VM exit has been handled then the exitcode must be BOGUS 3214 * If a VM exit is not handled then the exitcode must not be BOGUS 3215 */ 3216 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 3217 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 3218 panic("Mismatch between handled (%d) and exitcode (%d)", 3219 handled, vmexit->exitcode); 3220 } 3221 3222 VMX_CTR1(vcpu, "returning from vmx_run: exitcode %d", 3223 vmexit->exitcode); 3224 3225 VMCLEAR(vmcs); 3226 vmx_msr_guest_exit(vcpu); 3227 3228 return (0); 3229 } 3230 3231 static void 3232 vmx_vcpu_cleanup(void *vcpui) 3233 { 3234 struct vmx_vcpu *vcpu = vcpui; 3235 3236 vpid_free(vcpu->state.vpid); 3237 free(vcpu->pir_desc, M_VMX); 3238 free(vcpu->apic_page, M_VMX); 3239 free(vcpu->vmcs, M_VMX); 3240 free(vcpu, M_VMX); 3241 } 3242 3243 static void 3244 vmx_cleanup(void *vmi) 3245 { 3246 struct vmx *vmx = vmi; 3247 3248 if (virtual_interrupt_delivery) 3249 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3250 3251 free(vmx->msr_bitmap, M_VMX); 3252 free(vmx, M_VMX); 3253 3254 return; 3255 } 3256 3257 static register_t * 3258 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 3259 { 3260 3261 switch (reg) { 3262 case VM_REG_GUEST_RAX: 3263 return (&vmxctx->guest_rax); 3264 case VM_REG_GUEST_RBX: 3265 return (&vmxctx->guest_rbx); 3266 case VM_REG_GUEST_RCX: 3267 return (&vmxctx->guest_rcx); 3268 case VM_REG_GUEST_RDX: 3269 return (&vmxctx->guest_rdx); 3270 case VM_REG_GUEST_RSI: 3271 return (&vmxctx->guest_rsi); 3272 case VM_REG_GUEST_RDI: 3273 return (&vmxctx->guest_rdi); 3274 case VM_REG_GUEST_RBP: 3275 return (&vmxctx->guest_rbp); 3276 case VM_REG_GUEST_R8: 3277 return (&vmxctx->guest_r8); 3278 case VM_REG_GUEST_R9: 3279 return (&vmxctx->guest_r9); 3280 case VM_REG_GUEST_R10: 3281 return (&vmxctx->guest_r10); 3282 case VM_REG_GUEST_R11: 3283 return (&vmxctx->guest_r11); 3284 case VM_REG_GUEST_R12: 3285 return (&vmxctx->guest_r12); 3286 case VM_REG_GUEST_R13: 3287 return (&vmxctx->guest_r13); 3288 case VM_REG_GUEST_R14: 3289 return (&vmxctx->guest_r14); 3290 case VM_REG_GUEST_R15: 3291 return (&vmxctx->guest_r15); 3292 case VM_REG_GUEST_CR2: 3293 return (&vmxctx->guest_cr2); 3294 case VM_REG_GUEST_DR0: 3295 return (&vmxctx->guest_dr0); 3296 case VM_REG_GUEST_DR1: 3297 return (&vmxctx->guest_dr1); 3298 case VM_REG_GUEST_DR2: 3299 return (&vmxctx->guest_dr2); 3300 case VM_REG_GUEST_DR3: 3301 return (&vmxctx->guest_dr3); 3302 case VM_REG_GUEST_DR6: 3303 return (&vmxctx->guest_dr6); 3304 default: 3305 break; 3306 } 3307 return (NULL); 3308 } 3309 3310 static int 3311 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 3312 { 3313 register_t *regp; 3314 3315 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3316 *retval = *regp; 3317 return (0); 3318 } else 3319 return (EINVAL); 3320 } 3321 3322 static int 3323 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3324 { 3325 register_t *regp; 3326 3327 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3328 *regp = val; 3329 return (0); 3330 } else 3331 return (EINVAL); 3332 } 3333 3334 static int 3335 vmx_get_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t *retval) 3336 { 3337 uint64_t gi; 3338 int error; 3339 3340 error = vmcs_getreg(vcpu->vmcs, running, 3341 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3342 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3343 return (error); 3344 } 3345 3346 static int 3347 vmx_modify_intr_shadow(struct vmx_vcpu *vcpu, int running, uint64_t val) 3348 { 3349 struct vmcs *vmcs; 3350 uint64_t gi; 3351 int error, ident; 3352 3353 /* 3354 * Forcing the vcpu into an interrupt shadow is not supported. 3355 */ 3356 if (val) { 3357 error = EINVAL; 3358 goto done; 3359 } 3360 3361 vmcs = vcpu->vmcs; 3362 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3363 error = vmcs_getreg(vmcs, running, ident, &gi); 3364 if (error == 0) { 3365 gi &= ~HWINTR_BLOCKING; 3366 error = vmcs_setreg(vmcs, running, ident, gi); 3367 } 3368 done: 3369 VMX_CTR2(vcpu, "Setting intr_shadow to %#lx %s", val, 3370 error ? "failed" : "succeeded"); 3371 return (error); 3372 } 3373 3374 static int 3375 vmx_shadow_reg(int reg) 3376 { 3377 int shreg; 3378 3379 shreg = -1; 3380 3381 switch (reg) { 3382 case VM_REG_GUEST_CR0: 3383 shreg = VMCS_CR0_SHADOW; 3384 break; 3385 case VM_REG_GUEST_CR4: 3386 shreg = VMCS_CR4_SHADOW; 3387 break; 3388 default: 3389 break; 3390 } 3391 3392 return (shreg); 3393 } 3394 3395 static int 3396 vmx_getreg(void *vcpui, int reg, uint64_t *retval) 3397 { 3398 int running, hostcpu; 3399 struct vmx_vcpu *vcpu = vcpui; 3400 struct vmx *vmx = vcpu->vmx; 3401 3402 running = vcpu_is_running(vcpu->vcpu, &hostcpu); 3403 if (running && hostcpu != curcpu) 3404 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), 3405 vcpu->vcpuid); 3406 3407 if (reg == VM_REG_GUEST_INTR_SHADOW) 3408 return (vmx_get_intr_shadow(vcpu, running, retval)); 3409 3410 if (vmxctx_getreg(&vcpu->ctx, reg, retval) == 0) 3411 return (0); 3412 3413 return (vmcs_getreg(vcpu->vmcs, running, reg, retval)); 3414 } 3415 3416 static int 3417 vmx_setreg(void *vcpui, int reg, uint64_t val) 3418 { 3419 int error, hostcpu, running, shadow; 3420 uint64_t ctls; 3421 pmap_t pmap; 3422 struct vmx_vcpu *vcpu = vcpui; 3423 struct vmx *vmx = vcpu->vmx; 3424 3425 running = vcpu_is_running(vcpu->vcpu, &hostcpu); 3426 if (running && hostcpu != curcpu) 3427 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), 3428 vcpu->vcpuid); 3429 3430 if (reg == VM_REG_GUEST_INTR_SHADOW) 3431 return (vmx_modify_intr_shadow(vcpu, running, val)); 3432 3433 if (vmxctx_setreg(&vcpu->ctx, reg, val) == 0) 3434 return (0); 3435 3436 /* Do not permit user write access to VMCS fields by offset. */ 3437 if (reg < 0) 3438 return (EINVAL); 3439 3440 error = vmcs_setreg(vcpu->vmcs, running, reg, val); 3441 3442 if (error == 0) { 3443 /* 3444 * If the "load EFER" VM-entry control is 1 then the 3445 * value of EFER.LMA must be identical to "IA-32e mode guest" 3446 * bit in the VM-entry control. 3447 */ 3448 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3449 (reg == VM_REG_GUEST_EFER)) { 3450 vmcs_getreg(vcpu->vmcs, running, 3451 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3452 if (val & EFER_LMA) 3453 ctls |= VM_ENTRY_GUEST_LMA; 3454 else 3455 ctls &= ~VM_ENTRY_GUEST_LMA; 3456 vmcs_setreg(vcpu->vmcs, running, 3457 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3458 } 3459 3460 shadow = vmx_shadow_reg(reg); 3461 if (shadow > 0) { 3462 /* 3463 * Store the unmodified value in the shadow 3464 */ 3465 error = vmcs_setreg(vcpu->vmcs, running, 3466 VMCS_IDENT(shadow), val); 3467 } 3468 3469 if (reg == VM_REG_GUEST_CR3) { 3470 /* 3471 * Invalidate the guest vcpu's TLB mappings to emulate 3472 * the behavior of updating %cr3. 3473 * 3474 * XXX the processor retains global mappings when %cr3 3475 * is updated but vmx_invvpid() does not. 3476 */ 3477 pmap = vcpu->ctx.pmap; 3478 vmx_invvpid(vmx, vcpu, pmap, running); 3479 } 3480 } 3481 3482 return (error); 3483 } 3484 3485 static int 3486 vmx_getdesc(void *vcpui, int reg, struct seg_desc *desc) 3487 { 3488 int hostcpu, running; 3489 struct vmx_vcpu *vcpu = vcpui; 3490 struct vmx *vmx = vcpu->vmx; 3491 3492 running = vcpu_is_running(vcpu->vcpu, &hostcpu); 3493 if (running && hostcpu != curcpu) 3494 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), 3495 vcpu->vcpuid); 3496 3497 return (vmcs_getdesc(vcpu->vmcs, running, reg, desc)); 3498 } 3499 3500 static int 3501 vmx_setdesc(void *vcpui, int reg, struct seg_desc *desc) 3502 { 3503 int hostcpu, running; 3504 struct vmx_vcpu *vcpu = vcpui; 3505 struct vmx *vmx = vcpu->vmx; 3506 3507 running = vcpu_is_running(vcpu->vcpu, &hostcpu); 3508 if (running && hostcpu != curcpu) 3509 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), 3510 vcpu->vcpuid); 3511 3512 return (vmcs_setdesc(vcpu->vmcs, running, reg, desc)); 3513 } 3514 3515 static int 3516 vmx_getcap(void *vcpui, int type, int *retval) 3517 { 3518 struct vmx_vcpu *vcpu = vcpui; 3519 int vcap; 3520 int ret; 3521 3522 ret = ENOENT; 3523 3524 vcap = vcpu->cap.set; 3525 3526 switch (type) { 3527 case VM_CAP_HALT_EXIT: 3528 if (cap_halt_exit) 3529 ret = 0; 3530 break; 3531 case VM_CAP_PAUSE_EXIT: 3532 if (cap_pause_exit) 3533 ret = 0; 3534 break; 3535 case VM_CAP_MTRAP_EXIT: 3536 if (cap_monitor_trap) 3537 ret = 0; 3538 break; 3539 case VM_CAP_RDPID: 3540 if (cap_rdpid) 3541 ret = 0; 3542 break; 3543 case VM_CAP_RDTSCP: 3544 if (cap_rdtscp) 3545 ret = 0; 3546 break; 3547 case VM_CAP_UNRESTRICTED_GUEST: 3548 if (cap_unrestricted_guest) 3549 ret = 0; 3550 break; 3551 case VM_CAP_ENABLE_INVPCID: 3552 if (cap_invpcid) 3553 ret = 0; 3554 break; 3555 case VM_CAP_BPT_EXIT: 3556 case VM_CAP_IPI_EXIT: 3557 ret = 0; 3558 break; 3559 default: 3560 break; 3561 } 3562 3563 if (ret == 0) 3564 *retval = (vcap & (1 << type)) ? 1 : 0; 3565 3566 return (ret); 3567 } 3568 3569 static int 3570 vmx_setcap(void *vcpui, int type, int val) 3571 { 3572 struct vmx_vcpu *vcpu = vcpui; 3573 struct vmcs *vmcs = vcpu->vmcs; 3574 struct vlapic *vlapic; 3575 uint32_t baseval; 3576 uint32_t *pptr; 3577 int error; 3578 int flag; 3579 int reg; 3580 int retval; 3581 3582 retval = ENOENT; 3583 pptr = NULL; 3584 3585 switch (type) { 3586 case VM_CAP_HALT_EXIT: 3587 if (cap_halt_exit) { 3588 retval = 0; 3589 pptr = &vcpu->cap.proc_ctls; 3590 baseval = *pptr; 3591 flag = PROCBASED_HLT_EXITING; 3592 reg = VMCS_PRI_PROC_BASED_CTLS; 3593 } 3594 break; 3595 case VM_CAP_MTRAP_EXIT: 3596 if (cap_monitor_trap) { 3597 retval = 0; 3598 pptr = &vcpu->cap.proc_ctls; 3599 baseval = *pptr; 3600 flag = PROCBASED_MTF; 3601 reg = VMCS_PRI_PROC_BASED_CTLS; 3602 } 3603 break; 3604 case VM_CAP_PAUSE_EXIT: 3605 if (cap_pause_exit) { 3606 retval = 0; 3607 pptr = &vcpu->cap.proc_ctls; 3608 baseval = *pptr; 3609 flag = PROCBASED_PAUSE_EXITING; 3610 reg = VMCS_PRI_PROC_BASED_CTLS; 3611 } 3612 break; 3613 case VM_CAP_RDPID: 3614 case VM_CAP_RDTSCP: 3615 if (cap_rdpid || cap_rdtscp) 3616 /* 3617 * Choose not to support enabling/disabling 3618 * RDPID/RDTSCP via libvmmapi since, as per the 3619 * discussion in vmx_modinit(), RDPID/RDTSCP are 3620 * either always enabled or always disabled. 3621 */ 3622 error = EOPNOTSUPP; 3623 break; 3624 case VM_CAP_UNRESTRICTED_GUEST: 3625 if (cap_unrestricted_guest) { 3626 retval = 0; 3627 pptr = &vcpu->cap.proc_ctls2; 3628 baseval = *pptr; 3629 flag = PROCBASED2_UNRESTRICTED_GUEST; 3630 reg = VMCS_SEC_PROC_BASED_CTLS; 3631 } 3632 break; 3633 case VM_CAP_ENABLE_INVPCID: 3634 if (cap_invpcid) { 3635 retval = 0; 3636 pptr = &vcpu->cap.proc_ctls2; 3637 baseval = *pptr; 3638 flag = PROCBASED2_ENABLE_INVPCID; 3639 reg = VMCS_SEC_PROC_BASED_CTLS; 3640 } 3641 break; 3642 case VM_CAP_BPT_EXIT: 3643 retval = 0; 3644 3645 /* Don't change the bitmap if we are tracing all exceptions. */ 3646 if (vcpu->cap.exc_bitmap != 0xffffffff) { 3647 pptr = &vcpu->cap.exc_bitmap; 3648 baseval = *pptr; 3649 flag = (1 << IDT_BP); 3650 reg = VMCS_EXCEPTION_BITMAP; 3651 } 3652 break; 3653 case VM_CAP_IPI_EXIT: 3654 retval = 0; 3655 3656 vlapic = vm_lapic(vcpu->vcpu); 3657 vlapic->ipi_exit = val; 3658 break; 3659 case VM_CAP_MASK_HWINTR: 3660 retval = 0; 3661 break; 3662 default: 3663 break; 3664 } 3665 3666 if (retval) 3667 return (retval); 3668 3669 if (pptr != NULL) { 3670 if (val) { 3671 baseval |= flag; 3672 } else { 3673 baseval &= ~flag; 3674 } 3675 VMPTRLD(vmcs); 3676 error = vmwrite(reg, baseval); 3677 VMCLEAR(vmcs); 3678 3679 if (error) 3680 return (error); 3681 3682 /* 3683 * Update optional stored flags, and record 3684 * setting 3685 */ 3686 *pptr = baseval; 3687 } 3688 3689 if (val) { 3690 vcpu->cap.set |= (1 << type); 3691 } else { 3692 vcpu->cap.set &= ~(1 << type); 3693 } 3694 3695 return (0); 3696 } 3697 3698 static struct vmspace * 3699 vmx_vmspace_alloc(vm_offset_t min, vm_offset_t max) 3700 { 3701 return (ept_vmspace_alloc(min, max)); 3702 } 3703 3704 static void 3705 vmx_vmspace_free(struct vmspace *vmspace) 3706 { 3707 ept_vmspace_free(vmspace); 3708 } 3709 3710 struct vlapic_vtx { 3711 struct vlapic vlapic; 3712 struct pir_desc *pir_desc; 3713 struct vmx_vcpu *vcpu; 3714 u_int pending_prio; 3715 }; 3716 3717 #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) 3718 3719 #define VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level, msg) \ 3720 do { \ 3721 VLAPIC_CTR2(vlapic, msg " assert %s-triggered vector %d", \ 3722 level ? "level" : "edge", vector); \ 3723 VLAPIC_CTR1(vlapic, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3724 VLAPIC_CTR1(vlapic, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3725 VLAPIC_CTR1(vlapic, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3726 VLAPIC_CTR1(vlapic, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3727 VLAPIC_CTR1(vlapic, msg " notify: %s", notify ? "yes" : "no"); \ 3728 } while (0) 3729 3730 /* 3731 * vlapic->ops handlers that utilize the APICv hardware assist described in 3732 * Chapter 29 of the Intel SDM. 3733 */ 3734 static int 3735 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3736 { 3737 struct vlapic_vtx *vlapic_vtx; 3738 struct pir_desc *pir_desc; 3739 uint64_t mask; 3740 int idx, notify = 0; 3741 3742 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3743 pir_desc = vlapic_vtx->pir_desc; 3744 3745 /* 3746 * Keep track of interrupt requests in the PIR descriptor. This is 3747 * because the virtual APIC page pointed to by the VMCS cannot be 3748 * modified if the vcpu is running. 3749 */ 3750 idx = vector / 64; 3751 mask = 1UL << (vector % 64); 3752 atomic_set_long(&pir_desc->pir[idx], mask); 3753 3754 /* 3755 * A notification is required whenever the 'pending' bit makes a 3756 * transition from 0->1. 3757 * 3758 * Even if the 'pending' bit is already asserted, notification about 3759 * the incoming interrupt may still be necessary. For example, if a 3760 * vCPU is HLTed with a high PPR, a low priority interrupt would cause 3761 * the 0->1 'pending' transition with a notification, but the vCPU 3762 * would ignore the interrupt for the time being. The same vCPU would 3763 * need to then be notified if a high-priority interrupt arrived which 3764 * satisfied the PPR. 3765 * 3766 * The priorities of interrupts injected while 'pending' is asserted 3767 * are tracked in a custom bitfield 'pending_prio'. Should the 3768 * to-be-injected interrupt exceed the priorities already present, the 3769 * notification is sent. The priorities recorded in 'pending_prio' are 3770 * cleared whenever the 'pending' bit makes another 0->1 transition. 3771 */ 3772 if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { 3773 notify = 1; 3774 vlapic_vtx->pending_prio = 0; 3775 } else { 3776 const u_int old_prio = vlapic_vtx->pending_prio; 3777 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); 3778 3779 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { 3780 atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); 3781 notify = 1; 3782 } 3783 } 3784 3785 VMX_CTR_PIR(vlapic, pir_desc, notify, vector, level, 3786 "vmx_set_intr_ready"); 3787 return (notify); 3788 } 3789 3790 static int 3791 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3792 { 3793 struct vlapic_vtx *vlapic_vtx; 3794 struct pir_desc *pir_desc; 3795 struct LAPIC *lapic; 3796 uint64_t pending, pirval; 3797 uint8_t ppr, vpr, rvi; 3798 struct vm_exit *vmexit; 3799 int i; 3800 3801 /* 3802 * This function is only expected to be called from the 'HLT' exit 3803 * handler which does not care about the vector that is pending. 3804 */ 3805 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3806 3807 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3808 pir_desc = vlapic_vtx->pir_desc; 3809 lapic = vlapic->apic_page; 3810 3811 /* 3812 * While a virtual interrupt may have already been 3813 * processed the actual delivery maybe pending the 3814 * interruptibility of the guest. Recognize a pending 3815 * interrupt by reevaluating virtual interrupts 3816 * following Section 30.2.1 in the Intel SDM Volume 3. 3817 */ 3818 vmexit = vm_exitinfo(vlapic->vcpu); 3819 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3820 ("vmx_pending_intr: exitcode not 'HLT'")); 3821 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3822 ppr = lapic->ppr & APIC_TPR_INT; 3823 if (rvi > ppr) 3824 return (1); 3825 3826 pending = atomic_load_acq_long(&pir_desc->pending); 3827 if (!pending) 3828 return (0); 3829 3830 /* 3831 * If there is an interrupt pending then it will be recognized only 3832 * if its priority is greater than the processor priority. 3833 * 3834 * Special case: if the processor priority is zero then any pending 3835 * interrupt will be recognized. 3836 */ 3837 if (ppr == 0) 3838 return (1); 3839 3840 VLAPIC_CTR1(vlapic, "HLT with non-zero PPR %d", lapic->ppr); 3841 3842 vpr = 0; 3843 for (i = 3; i >= 0; i--) { 3844 pirval = pir_desc->pir[i]; 3845 if (pirval != 0) { 3846 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3847 break; 3848 } 3849 } 3850 3851 /* 3852 * If the highest-priority pending interrupt falls short of the 3853 * processor priority of this vCPU, ensure that 'pending_prio' does not 3854 * have any stale bits which would preclude a higher-priority interrupt 3855 * from incurring a notification later. 3856 */ 3857 if (vpr <= ppr) { 3858 const u_int prio_bit = VPR_PRIO_BIT(vpr); 3859 const u_int old = vlapic_vtx->pending_prio; 3860 3861 if (old > prio_bit && (old & prio_bit) == 0) { 3862 vlapic_vtx->pending_prio = prio_bit; 3863 } 3864 return (0); 3865 } 3866 return (1); 3867 } 3868 3869 static void 3870 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3871 { 3872 3873 panic("vmx_intr_accepted: not expected to be called"); 3874 } 3875 3876 static void 3877 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3878 { 3879 struct vlapic_vtx *vlapic_vtx; 3880 struct vmcs *vmcs; 3881 uint64_t mask, val; 3882 3883 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3884 KASSERT(!vcpu_is_running(vlapic->vcpu, NULL), 3885 ("vmx_set_tmr: vcpu cannot be running")); 3886 3887 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3888 vmcs = vlapic_vtx->vcpu->vmcs; 3889 mask = 1UL << (vector % 64); 3890 3891 VMPTRLD(vmcs); 3892 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3893 if (level) 3894 val |= mask; 3895 else 3896 val &= ~mask; 3897 vmcs_write(VMCS_EOI_EXIT(vector), val); 3898 VMCLEAR(vmcs); 3899 } 3900 3901 static void 3902 vmx_enable_x2apic_mode_ts(struct vlapic *vlapic) 3903 { 3904 struct vlapic_vtx *vlapic_vtx; 3905 struct vmx_vcpu *vcpu; 3906 struct vmcs *vmcs; 3907 uint32_t proc_ctls; 3908 3909 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3910 vcpu = vlapic_vtx->vcpu; 3911 vmcs = vcpu->vmcs; 3912 3913 proc_ctls = vcpu->cap.proc_ctls; 3914 proc_ctls &= ~PROCBASED_USE_TPR_SHADOW; 3915 proc_ctls |= PROCBASED_CR8_LOAD_EXITING; 3916 proc_ctls |= PROCBASED_CR8_STORE_EXITING; 3917 vcpu->cap.proc_ctls = proc_ctls; 3918 3919 VMPTRLD(vmcs); 3920 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls); 3921 VMCLEAR(vmcs); 3922 } 3923 3924 static void 3925 vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) 3926 { 3927 struct vlapic_vtx *vlapic_vtx; 3928 struct vmx *vmx; 3929 struct vmx_vcpu *vcpu; 3930 struct vmcs *vmcs; 3931 uint32_t proc_ctls2; 3932 int error __diagused; 3933 3934 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3935 vcpu = vlapic_vtx->vcpu; 3936 vmx = vcpu->vmx; 3937 vmcs = vcpu->vmcs; 3938 3939 proc_ctls2 = vcpu->cap.proc_ctls2; 3940 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3941 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3942 3943 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3944 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3945 vcpu->cap.proc_ctls2 = proc_ctls2; 3946 3947 VMPTRLD(vmcs); 3948 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3949 VMCLEAR(vmcs); 3950 3951 if (vlapic->vcpuid == 0) { 3952 /* 3953 * The nested page table mappings are shared by all vcpus 3954 * so unmap the APIC access page just once. 3955 */ 3956 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3957 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3958 __func__, error)); 3959 3960 /* 3961 * The MSR bitmap is shared by all vcpus so modify it only 3962 * once in the context of vcpu 0. 3963 */ 3964 error = vmx_allow_x2apic_msrs(vmx); 3965 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3966 __func__, error)); 3967 } 3968 } 3969 3970 static void 3971 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3972 { 3973 3974 ipi_cpu(hostcpu, pirvec); 3975 } 3976 3977 /* 3978 * Transfer the pending interrupts in the PIR descriptor to the IRR 3979 * in the virtual APIC page. 3980 */ 3981 static void 3982 vmx_inject_pir(struct vlapic *vlapic) 3983 { 3984 struct vlapic_vtx *vlapic_vtx; 3985 struct pir_desc *pir_desc; 3986 struct LAPIC *lapic; 3987 uint64_t val, pirval; 3988 int rvi, pirbase = -1; 3989 uint16_t intr_status_old, intr_status_new; 3990 3991 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3992 pir_desc = vlapic_vtx->pir_desc; 3993 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3994 VLAPIC_CTR0(vlapic, "vmx_inject_pir: " 3995 "no posted interrupt pending"); 3996 return; 3997 } 3998 3999 pirval = 0; 4000 pirbase = -1; 4001 lapic = vlapic->apic_page; 4002 4003 val = atomic_readandclear_long(&pir_desc->pir[0]); 4004 if (val != 0) { 4005 lapic->irr0 |= val; 4006 lapic->irr1 |= val >> 32; 4007 pirbase = 0; 4008 pirval = val; 4009 } 4010 4011 val = atomic_readandclear_long(&pir_desc->pir[1]); 4012 if (val != 0) { 4013 lapic->irr2 |= val; 4014 lapic->irr3 |= val >> 32; 4015 pirbase = 64; 4016 pirval = val; 4017 } 4018 4019 val = atomic_readandclear_long(&pir_desc->pir[2]); 4020 if (val != 0) { 4021 lapic->irr4 |= val; 4022 lapic->irr5 |= val >> 32; 4023 pirbase = 128; 4024 pirval = val; 4025 } 4026 4027 val = atomic_readandclear_long(&pir_desc->pir[3]); 4028 if (val != 0) { 4029 lapic->irr6 |= val; 4030 lapic->irr7 |= val >> 32; 4031 pirbase = 192; 4032 pirval = val; 4033 } 4034 4035 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 4036 4037 /* 4038 * Update RVI so the processor can evaluate pending virtual 4039 * interrupts on VM-entry. 4040 * 4041 * It is possible for pirval to be 0 here, even though the 4042 * pending bit has been set. The scenario is: 4043 * CPU-Y is sending a posted interrupt to CPU-X, which 4044 * is running a guest and processing posted interrupts in h/w. 4045 * CPU-X will eventually exit and the state seen in s/w is 4046 * the pending bit set, but no PIR bits set. 4047 * 4048 * CPU-X CPU-Y 4049 * (vm running) (host running) 4050 * rx posted interrupt 4051 * CLEAR pending bit 4052 * SET PIR bit 4053 * READ/CLEAR PIR bits 4054 * SET pending bit 4055 * (vm exit) 4056 * pending bit set, PIR 0 4057 */ 4058 if (pirval != 0) { 4059 rvi = pirbase + flsl(pirval) - 1; 4060 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 4061 intr_status_new = (intr_status_old & 0xFF00) | rvi; 4062 if (intr_status_new > intr_status_old) { 4063 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 4064 VLAPIC_CTR2(vlapic, "vmx_inject_pir: " 4065 "guest_intr_status changed from 0x%04x to 0x%04x", 4066 intr_status_old, intr_status_new); 4067 } 4068 } 4069 } 4070 4071 static struct vlapic * 4072 vmx_vlapic_init(void *vcpui) 4073 { 4074 struct vmx *vmx; 4075 struct vmx_vcpu *vcpu; 4076 struct vlapic *vlapic; 4077 struct vlapic_vtx *vlapic_vtx; 4078 4079 vcpu = vcpui; 4080 vmx = vcpu->vmx; 4081 4082 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 4083 vlapic->vm = vmx->vm; 4084 vlapic->vcpu = vcpu->vcpu; 4085 vlapic->vcpuid = vcpu->vcpuid; 4086 vlapic->apic_page = (struct LAPIC *)vcpu->apic_page; 4087 4088 vlapic_vtx = (struct vlapic_vtx *)vlapic; 4089 vlapic_vtx->pir_desc = vcpu->pir_desc; 4090 vlapic_vtx->vcpu = vcpu; 4091 4092 if (tpr_shadowing) { 4093 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; 4094 } 4095 4096 if (virtual_interrupt_delivery) { 4097 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 4098 vlapic->ops.pending_intr = vmx_pending_intr; 4099 vlapic->ops.intr_accepted = vmx_intr_accepted; 4100 vlapic->ops.set_tmr = vmx_set_tmr; 4101 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; 4102 } 4103 4104 if (posted_interrupts) 4105 vlapic->ops.post_intr = vmx_post_intr; 4106 4107 vlapic_init(vlapic); 4108 4109 return (vlapic); 4110 } 4111 4112 static void 4113 vmx_vlapic_cleanup(struct vlapic *vlapic) 4114 { 4115 4116 vlapic_cleanup(vlapic); 4117 free(vlapic, M_VLAPIC); 4118 } 4119 4120 #ifdef BHYVE_SNAPSHOT 4121 static int 4122 vmx_vcpu_snapshot(void *vcpui, struct vm_snapshot_meta *meta) 4123 { 4124 struct vmcs *vmcs; 4125 struct vmx *vmx; 4126 struct vmx_vcpu *vcpu; 4127 struct vmxctx *vmxctx; 4128 int err, run, hostcpu; 4129 4130 err = 0; 4131 vcpu = vcpui; 4132 vmx = vcpu->vmx; 4133 vmcs = vcpu->vmcs; 4134 4135 run = vcpu_is_running(vcpu->vcpu, &hostcpu); 4136 if (run && hostcpu != curcpu) { 4137 printf("%s: %s%d is running", __func__, vm_name(vmx->vm), 4138 vcpu->vcpuid); 4139 return (EINVAL); 4140 } 4141 4142 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta); 4143 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta); 4144 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta); 4145 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta); 4146 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta); 4147 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta); 4148 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta); 4149 4150 /* Guest segments */ 4151 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta); 4152 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta); 4153 4154 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta); 4155 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta); 4156 4157 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta); 4158 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta); 4159 4160 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta); 4161 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta); 4162 4163 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta); 4164 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta); 4165 4166 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta); 4167 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta); 4168 4169 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta); 4170 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta); 4171 4172 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta); 4173 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta); 4174 4175 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta); 4176 4177 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta); 4178 err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta); 4179 4180 /* Guest page tables */ 4181 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta); 4182 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta); 4183 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta); 4184 err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta); 4185 4186 /* Other guest state */ 4187 err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta); 4188 err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta); 4189 err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta); 4190 err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta); 4191 err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta); 4192 err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta); 4193 err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta); 4194 if (err != 0) 4195 goto done; 4196 4197 SNAPSHOT_BUF_OR_LEAVE(vcpu->guest_msrs, 4198 sizeof(vcpu->guest_msrs), meta, err, done); 4199 4200 SNAPSHOT_BUF_OR_LEAVE(vcpu->pir_desc, 4201 sizeof(*vcpu->pir_desc), meta, err, done); 4202 4203 vmxctx = &vcpu->ctx; 4204 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, err, done); 4205 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, err, done); 4206 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, err, done); 4207 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, err, done); 4208 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, err, done); 4209 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, err, done); 4210 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, err, done); 4211 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, err, done); 4212 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, err, done); 4213 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, err, done); 4214 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, err, done); 4215 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, err, done); 4216 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, err, done); 4217 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, err, done); 4218 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, err, done); 4219 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, err, done); 4220 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, err, done); 4221 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, err, done); 4222 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, err, done); 4223 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, err, done); 4224 SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, err, done); 4225 4226 done: 4227 return (err); 4228 } 4229 4230 static int 4231 vmx_restore_tsc(void *vcpui, uint64_t offset) 4232 { 4233 struct vmx_vcpu *vcpu = vcpui; 4234 struct vmcs *vmcs; 4235 struct vmx *vmx; 4236 int error, running, hostcpu; 4237 4238 vmx = vcpu->vmx; 4239 vmcs = vcpu->vmcs; 4240 4241 running = vcpu_is_running(vcpu->vcpu, &hostcpu); 4242 if (running && hostcpu != curcpu) { 4243 printf("%s: %s%d is running", __func__, vm_name(vmx->vm), 4244 vcpu->vcpuid); 4245 return (EINVAL); 4246 } 4247 4248 if (!running) 4249 VMPTRLD(vmcs); 4250 4251 error = vmx_set_tsc_offset(vcpu, offset); 4252 4253 if (!running) 4254 VMCLEAR(vmcs); 4255 return (error); 4256 } 4257 #endif 4258 4259 const struct vmm_ops vmm_ops_intel = { 4260 .modinit = vmx_modinit, 4261 .modcleanup = vmx_modcleanup, 4262 .modresume = vmx_modresume, 4263 .init = vmx_init, 4264 .run = vmx_run, 4265 .cleanup = vmx_cleanup, 4266 .vcpu_init = vmx_vcpu_init, 4267 .vcpu_cleanup = vmx_vcpu_cleanup, 4268 .getreg = vmx_getreg, 4269 .setreg = vmx_setreg, 4270 .getdesc = vmx_getdesc, 4271 .setdesc = vmx_setdesc, 4272 .getcap = vmx_getcap, 4273 .setcap = vmx_setcap, 4274 .vmspace_alloc = vmx_vmspace_alloc, 4275 .vmspace_free = vmx_vmspace_free, 4276 .vlapic_init = vmx_vlapic_init, 4277 .vlapic_cleanup = vmx_vlapic_cleanup, 4278 #ifdef BHYVE_SNAPSHOT 4279 .vcpu_snapshot = vmx_vcpu_snapshot, 4280 .restore_tsc = vmx_restore_tsc, 4281 #endif 4282 }; 4283