1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/smp.h> 37 #include <sys/kernel.h> 38 #include <sys/malloc.h> 39 #include <sys/pcpu.h> 40 #include <sys/proc.h> 41 #include <sys/sysctl.h> 42 43 #include <vm/vm.h> 44 #include <vm/pmap.h> 45 46 #include <machine/psl.h> 47 #include <machine/cpufunc.h> 48 #include <machine/md_var.h> 49 #include <machine/reg.h> 50 #include <machine/segments.h> 51 #include <machine/smp.h> 52 #include <machine/specialreg.h> 53 #include <machine/vmparam.h> 54 55 #include <machine/vmm.h> 56 #include <machine/vmm_dev.h> 57 #include <machine/vmm_instruction_emul.h> 58 #include "vmm_lapic.h" 59 #include "vmm_host.h" 60 #include "vmm_ioport.h" 61 #include "vmm_ktr.h" 62 #include "vmm_stat.h" 63 #include "vatpic.h" 64 #include "vlapic.h" 65 #include "vlapic_priv.h" 66 67 #include "ept.h" 68 #include "vmx_cpufunc.h" 69 #include "vmx.h" 70 #include "vmx_msr.h" 71 #include "x86.h" 72 #include "vmx_controls.h" 73 74 #define PINBASED_CTLS_ONE_SETTING \ 75 (PINBASED_EXTINT_EXITING | \ 76 PINBASED_NMI_EXITING | \ 77 PINBASED_VIRTUAL_NMI) 78 #define PINBASED_CTLS_ZERO_SETTING 0 79 80 #define PROCBASED_CTLS_WINDOW_SETTING \ 81 (PROCBASED_INT_WINDOW_EXITING | \ 82 PROCBASED_NMI_WINDOW_EXITING) 83 84 #define PROCBASED_CTLS_ONE_SETTING \ 85 (PROCBASED_SECONDARY_CONTROLS | \ 86 PROCBASED_MWAIT_EXITING | \ 87 PROCBASED_MONITOR_EXITING | \ 88 PROCBASED_IO_EXITING | \ 89 PROCBASED_MSR_BITMAPS | \ 90 PROCBASED_CTLS_WINDOW_SETTING | \ 91 PROCBASED_CR8_LOAD_EXITING | \ 92 PROCBASED_CR8_STORE_EXITING) 93 #define PROCBASED_CTLS_ZERO_SETTING \ 94 (PROCBASED_CR3_LOAD_EXITING | \ 95 PROCBASED_CR3_STORE_EXITING | \ 96 PROCBASED_IO_BITMAPS) 97 98 #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 99 #define PROCBASED_CTLS2_ZERO_SETTING 0 100 101 #define VM_EXIT_CTLS_ONE_SETTING \ 102 (VM_EXIT_SAVE_DEBUG_CONTROLS | \ 103 VM_EXIT_HOST_LMA | \ 104 VM_EXIT_SAVE_EFER | \ 105 VM_EXIT_LOAD_EFER | \ 106 VM_EXIT_ACKNOWLEDGE_INTERRUPT) 107 108 #define VM_EXIT_CTLS_ZERO_SETTING 0 109 110 #define 
VM_ENTRY_CTLS_ONE_SETTING \ 111 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 112 VM_ENTRY_LOAD_EFER) 113 114 #define VM_ENTRY_CTLS_ZERO_SETTING \ 115 (VM_ENTRY_INTO_SMM | \ 116 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 117 118 #define HANDLED 1 119 #define UNHANDLED 0 120 121 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 122 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); 123 124 SYSCTL_DECL(_hw_vmm); 125 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 126 127 int vmxon_enabled[MAXCPU]; 128 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 129 130 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 131 static uint32_t exit_ctls, entry_ctls; 132 133 static uint64_t cr0_ones_mask, cr0_zeros_mask; 134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 135 &cr0_ones_mask, 0, NULL); 136 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 137 &cr0_zeros_mask, 0, NULL); 138 139 static uint64_t cr4_ones_mask, cr4_zeros_mask; 140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 141 &cr4_ones_mask, 0, NULL); 142 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 143 &cr4_zeros_mask, 0, NULL); 144 145 static int vmx_initialized; 146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 147 &vmx_initialized, 0, "Intel VMX initialized"); 148 149 /* 150 * Optional capabilities 151 */ 152 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); 153 154 static int cap_halt_exit; 155 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, 156 "HLT triggers a VM-exit"); 157 158 static int cap_pause_exit; 159 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 160 0, "PAUSE triggers a VM-exit"); 161 162 static int cap_unrestricted_guest; 163 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, 164 &cap_unrestricted_guest, 0, "Unrestricted guests"); 165 166 static int cap_monitor_trap; 167 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, 168 &cap_monitor_trap, 0, "Monitor trap flag"); 169 170 static int cap_invpcid; 171 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 172 0, "Guests are allowed to use INVPCID"); 173 174 static int virtual_interrupt_delivery; 175 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, 176 &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); 177 178 static int posted_interrupts; 179 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, 180 &posted_interrupts, 0, "APICv posted interrupt support"); 181 182 static int pirvec = -1; 183 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, 184 &pirvec, 0, "APICv posted interrupt vector"); 185 186 static struct unrhdr *vpid_unr; 187 static u_int vpid_alloc_failed; 188 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 189 &vpid_alloc_failed, 0, NULL); 190 191 /* 192 * The definitions of SDT probes for VMX. 
193 */ 194 195 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, 196 "struct vmx *", "int", "struct vm_exit *"); 197 198 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, 199 "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); 200 201 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, 202 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 203 204 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, 205 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 206 207 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, 208 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); 209 210 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, 211 "struct vmx *", "int", "struct vm_exit *"); 212 213 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, 214 "struct vmx *", "int", "struct vm_exit *"); 215 216 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, 217 "struct vmx *", "int", "struct vm_exit *"); 218 219 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, 220 "struct vmx *", "int", "struct vm_exit *"); 221 222 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, 223 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 224 225 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, 226 "struct vmx *", "int", "struct vm_exit *"); 227 228 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, 229 "struct vmx *", "int", "struct vm_exit *"); 230 231 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, 232 "struct vmx *", "int", "struct vm_exit *"); 233 234 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, 235 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); 236 237 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, 238 "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); 239 240 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, 241 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 242 243 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, 244 "struct vmx *", "int", "struct vm_exit *"); 245 246 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, 247 "struct vmx *", "int", "struct vm_exit *"); 248 249 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, 250 "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); 251 252 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, 253 "struct vmx *", "int", "struct vm_exit *"); 254 255 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, 256 "struct vmx *", "int", "struct vm_exit *"); 257 258 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, 259 "struct vmx *", "int", "struct vm_exit *"); 260 261 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, 262 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 263 264 SDT_PROBE_DEFINE4(vmm, vmx, exit, return, 265 "struct vmx *", "int", "struct vm_exit *", "int"); 266 267 /* 268 * Use the last page below 4GB as the APIC access address. This address is 269 * occupied by the boot firmware so it is guaranteed that it will not conflict 270 * with a page in system memory. 
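 *
 * The same constant is used twice in vmx_vminit() below, roughly:
 *
 *	vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS);
 *	vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 *
 * so a guest access to the local APIC page is reported as an
 * APIC-access VM-exit instead of an ordinary EPT fault.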
271 */ 272 #define APIC_ACCESS_ADDRESS 0xFFFFF000 273 274 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); 275 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); 276 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); 277 static void vmx_inject_pir(struct vlapic *vlapic); 278 279 #ifdef KTR 280 static const char * 281 exit_reason_to_str(int reason) 282 { 283 static char reasonbuf[32]; 284 285 switch (reason) { 286 case EXIT_REASON_EXCEPTION: 287 return "exception"; 288 case EXIT_REASON_EXT_INTR: 289 return "extint"; 290 case EXIT_REASON_TRIPLE_FAULT: 291 return "triplefault"; 292 case EXIT_REASON_INIT: 293 return "init"; 294 case EXIT_REASON_SIPI: 295 return "sipi"; 296 case EXIT_REASON_IO_SMI: 297 return "iosmi"; 298 case EXIT_REASON_SMI: 299 return "smi"; 300 case EXIT_REASON_INTR_WINDOW: 301 return "intrwindow"; 302 case EXIT_REASON_NMI_WINDOW: 303 return "nmiwindow"; 304 case EXIT_REASON_TASK_SWITCH: 305 return "taskswitch"; 306 case EXIT_REASON_CPUID: 307 return "cpuid"; 308 case EXIT_REASON_GETSEC: 309 return "getsec"; 310 case EXIT_REASON_HLT: 311 return "hlt"; 312 case EXIT_REASON_INVD: 313 return "invd"; 314 case EXIT_REASON_INVLPG: 315 return "invlpg"; 316 case EXIT_REASON_RDPMC: 317 return "rdpmc"; 318 case EXIT_REASON_RDTSC: 319 return "rdtsc"; 320 case EXIT_REASON_RSM: 321 return "rsm"; 322 case EXIT_REASON_VMCALL: 323 return "vmcall"; 324 case EXIT_REASON_VMCLEAR: 325 return "vmclear"; 326 case EXIT_REASON_VMLAUNCH: 327 return "vmlaunch"; 328 case EXIT_REASON_VMPTRLD: 329 return "vmptrld"; 330 case EXIT_REASON_VMPTRST: 331 return "vmptrst"; 332 case EXIT_REASON_VMREAD: 333 return "vmread"; 334 case EXIT_REASON_VMRESUME: 335 return "vmresume"; 336 case EXIT_REASON_VMWRITE: 337 return "vmwrite"; 338 case EXIT_REASON_VMXOFF: 339 return "vmxoff"; 340 case EXIT_REASON_VMXON: 341 return "vmxon"; 342 case EXIT_REASON_CR_ACCESS: 343 return "craccess"; 344 case EXIT_REASON_DR_ACCESS: 345 return "draccess"; 346 case EXIT_REASON_INOUT: 347 return "inout"; 348 case EXIT_REASON_RDMSR: 349 return "rdmsr"; 350 case EXIT_REASON_WRMSR: 351 return "wrmsr"; 352 case EXIT_REASON_INVAL_VMCS: 353 return "invalvmcs"; 354 case EXIT_REASON_INVAL_MSR: 355 return "invalmsr"; 356 case EXIT_REASON_MWAIT: 357 return "mwait"; 358 case EXIT_REASON_MTF: 359 return "mtf"; 360 case EXIT_REASON_MONITOR: 361 return "monitor"; 362 case EXIT_REASON_PAUSE: 363 return "pause"; 364 case EXIT_REASON_MCE_DURING_ENTRY: 365 return "mce-during-entry"; 366 case EXIT_REASON_TPR: 367 return "tpr"; 368 case EXIT_REASON_APIC_ACCESS: 369 return "apic-access"; 370 case EXIT_REASON_GDTR_IDTR: 371 return "gdtridtr"; 372 case EXIT_REASON_LDTR_TR: 373 return "ldtrtr"; 374 case EXIT_REASON_EPT_FAULT: 375 return "eptfault"; 376 case EXIT_REASON_EPT_MISCONFIG: 377 return "eptmisconfig"; 378 case EXIT_REASON_INVEPT: 379 return "invept"; 380 case EXIT_REASON_RDTSCP: 381 return "rdtscp"; 382 case EXIT_REASON_VMX_PREEMPT: 383 return "vmxpreempt"; 384 case EXIT_REASON_INVVPID: 385 return "invvpid"; 386 case EXIT_REASON_WBINVD: 387 return "wbinvd"; 388 case EXIT_REASON_XSETBV: 389 return "xsetbv"; 390 case EXIT_REASON_APIC_WRITE: 391 return "apic-write"; 392 default: 393 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 394 return (reasonbuf); 395 } 396 } 397 #endif /* KTR */ 398 399 static int 400 vmx_allow_x2apic_msrs(struct vmx *vmx) 401 { 402 int i, error; 403 404 error = 0; 405 406 /* 407 * Allow readonly access to the following x2APIC MSRs from the guest. 
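 *
 * "Readonly" refers to the MSR bitmap programming: guest RDMSR of these
 * registers completes without a VM-exit while guest WRMSR still traps.
 * Each guest_msr_ro()/guest_msr_rw() call returns 0 on success and the
 * accumulated 'error' is handed back to the caller.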
408 */ 409 error += guest_msr_ro(vmx, MSR_APIC_ID); 410 error += guest_msr_ro(vmx, MSR_APIC_VERSION); 411 error += guest_msr_ro(vmx, MSR_APIC_LDR); 412 error += guest_msr_ro(vmx, MSR_APIC_SVR); 413 414 for (i = 0; i < 8; i++) 415 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); 416 417 for (i = 0; i < 8; i++) 418 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); 419 420 for (i = 0; i < 8; i++) 421 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); 422 423 error += guest_msr_ro(vmx, MSR_APIC_ESR); 424 error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); 425 error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); 426 error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); 427 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); 428 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); 429 error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); 430 error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); 431 error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); 432 error += guest_msr_ro(vmx, MSR_APIC_ICR); 433 434 /* 435 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. 436 * 437 * These registers get special treatment described in the section 438 * "Virtualizing MSR-Based APIC Accesses". 439 */ 440 error += guest_msr_rw(vmx, MSR_APIC_TPR); 441 error += guest_msr_rw(vmx, MSR_APIC_EOI); 442 error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); 443 444 return (error); 445 } 446 447 u_long 448 vmx_fix_cr0(u_long cr0) 449 { 450 451 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 452 } 453 454 u_long 455 vmx_fix_cr4(u_long cr4) 456 { 457 458 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 459 } 460 461 static void 462 vpid_free(int vpid) 463 { 464 if (vpid < 0 || vpid > 0xffff) 465 panic("vpid_free: invalid vpid %d", vpid); 466 467 /* 468 * VPIDs [0,VM_MAXCPU] are special and are not allocated from 469 * the unit number allocator. 470 */ 471 472 if (vpid > VM_MAXCPU) 473 free_unr(vpid_unr, vpid); 474 } 475 476 static void 477 vpid_alloc(uint16_t *vpid, int num) 478 { 479 int i, x; 480 481 if (num <= 0 || num > VM_MAXCPU) 482 panic("invalid number of vpids requested: %d", num); 483 484 /* 485 * If the "enable vpid" execution control is not enabled then the 486 * VPID is required to be 0 for all vcpus. 487 */ 488 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { 489 for (i = 0; i < num; i++) 490 vpid[i] = 0; 491 return; 492 } 493 494 /* 495 * Allocate a unique VPID for each vcpu from the unit number allocator. 496 */ 497 for (i = 0; i < num; i++) { 498 x = alloc_unr(vpid_unr); 499 if (x == -1) 500 break; 501 else 502 vpid[i] = x; 503 } 504 505 if (i < num) { 506 atomic_add_int(&vpid_alloc_failed, 1); 507 508 /* 509 * If the unit number allocator does not have enough unique 510 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. 511 * 512 * These VPIDs are not unique across VMs but this does not 513 * affect correctness because the combined mappings are also 514 * tagged with the EP4TA which is unique for each VM. 515 * 516 * It is still sub-optimal because the invvpid will invalidate 517 * combined mappings for a particular VPID across all EP4TAs. 518 */ 519 while (i-- > 0) 520 vpid_free(vpid[i]); 521 522 for (i = 0; i < num; i++) 523 vpid[i] = i + 1; 524 } 525 } 526 527 static void 528 vpid_init(void) 529 { 530 /* 531 * VPID 0 is required when the "enable VPID" execution control is 532 * disabled. 533 * 534 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the 535 * unit number allocator does not have sufficient unique VPIDs to 536 * satisfy the allocation.
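 * In the overflow case vpid_alloc() above simply assigns
 * vpid[i] = i + 1, i.e. vcpu0 gets VPID 1, vcpu1 gets VPID 2, and so
 * on. VMs that overflow at the same time can therefore share VPIDs,
 * which is tolerable only because mappings are additionally tagged
 * with the per-VM EP4TA.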
537 * 538 * The remaining VPIDs are managed by the unit number allocator. 539 */ 540 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 541 } 542 543 static void 544 vmx_disable(void *arg __unused) 545 { 546 struct invvpid_desc invvpid_desc = { 0 }; 547 struct invept_desc invept_desc = { 0 }; 548 549 if (vmxon_enabled[curcpu]) { 550 /* 551 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 552 * 553 * VMXON or VMXOFF are not required to invalidate any TLB 554 * caching structures. This prevents potential retention of 555 * cached information in the TLB between distinct VMX episodes. 556 */ 557 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 558 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 559 vmxoff(); 560 } 561 load_cr4(rcr4() & ~CR4_VMXE); 562 } 563 564 static int 565 vmx_cleanup(void) 566 { 567 568 if (pirvec >= 0) 569 lapic_ipi_free(pirvec); 570 571 if (vpid_unr != NULL) { 572 delete_unrhdr(vpid_unr); 573 vpid_unr = NULL; 574 } 575 576 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 577 578 return (0); 579 } 580 581 static void 582 vmx_enable(void *arg __unused) 583 { 584 int error; 585 uint64_t feature_control; 586 587 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 588 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 589 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 590 wrmsr(MSR_IA32_FEATURE_CONTROL, 591 feature_control | IA32_FEATURE_CONTROL_VMX_EN | 592 IA32_FEATURE_CONTROL_LOCK); 593 } 594 595 load_cr4(rcr4() | CR4_VMXE); 596 597 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 598 error = vmxon(vmxon_region[curcpu]); 599 if (error == 0) 600 vmxon_enabled[curcpu] = 1; 601 } 602 603 static void 604 vmx_restore(void) 605 { 606 607 if (vmxon_enabled[curcpu]) 608 vmxon(vmxon_region[curcpu]); 609 } 610 611 static int 612 vmx_init(int ipinum) 613 { 614 int error, use_tpr_shadow; 615 uint64_t basic, fixed0, fixed1, feature_control; 616 uint32_t tmp, procbased2_vid_bits; 617 618 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 619 if (!(cpu_feature2 & CPUID2_VMX)) { 620 printf("vmx_init: processor does not support VMX operation\n"); 621 return (ENXIO); 622 } 623 624 /* 625 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 626 * are set (bits 0 and 2 respectively). 
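 *
 * If the lock bit is still clear the MSR remains writable, and
 * vmx_enable() will set both bits on each CPU later; the only fatal
 * combination rejected below is "locked by the firmware with the
 * VMXON enable bit clear".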
627 */ 628 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 629 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && 630 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 631 printf("vmx_init: VMX operation disabled by BIOS\n"); 632 return (ENXIO); 633 } 634 635 /* 636 * Verify capabilities MSR_VMX_BASIC: 637 * - bit 54 indicates support for INS/OUTS decoding 638 */ 639 basic = rdmsr(MSR_VMX_BASIC); 640 if ((basic & (1UL << 54)) == 0) { 641 printf("vmx_init: processor does not support desired basic " 642 "capabilities\n"); 643 return (EINVAL); 644 } 645 646 /* Check support for primary processor-based VM-execution controls */ 647 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 648 MSR_VMX_TRUE_PROCBASED_CTLS, 649 PROCBASED_CTLS_ONE_SETTING, 650 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 651 if (error) { 652 printf("vmx_init: processor does not support desired primary " 653 "processor-based controls\n"); 654 return (error); 655 } 656 657 /* Clear the processor-based ctl bits that are set on demand */ 658 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 659 660 /* Check support for secondary processor-based VM-execution controls */ 661 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 662 MSR_VMX_PROCBASED_CTLS2, 663 PROCBASED_CTLS2_ONE_SETTING, 664 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 665 if (error) { 666 printf("vmx_init: processor does not support desired secondary " 667 "processor-based controls\n"); 668 return (error); 669 } 670 671 /* Check support for VPID */ 672 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 673 PROCBASED2_ENABLE_VPID, 0, &tmp); 674 if (error == 0) 675 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 676 677 /* Check support for pin-based VM-execution controls */ 678 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 679 MSR_VMX_TRUE_PINBASED_CTLS, 680 PINBASED_CTLS_ONE_SETTING, 681 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 682 if (error) { 683 printf("vmx_init: processor does not support desired " 684 "pin-based controls\n"); 685 return (error); 686 } 687 688 /* Check support for VM-exit controls */ 689 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 690 VM_EXIT_CTLS_ONE_SETTING, 691 VM_EXIT_CTLS_ZERO_SETTING, 692 &exit_ctls); 693 if (error) { 694 printf("vmx_init: processor does not support desired " 695 "exit controls\n"); 696 return (error); 697 } 698 699 /* Check support for VM-entry controls */ 700 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, 701 VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, 702 &entry_ctls); 703 if (error) { 704 printf("vmx_init: processor does not support desired " 705 "entry controls\n"); 706 return (error); 707 } 708 709 /* 710 * Check support for optional features by testing them 711 * as individual bits 712 */ 713 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 714 MSR_VMX_TRUE_PROCBASED_CTLS, 715 PROCBASED_HLT_EXITING, 0, 716 &tmp) == 0); 717 718 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 719 MSR_VMX_PROCBASED_CTLS, 720 PROCBASED_MTF, 0, 721 &tmp) == 0); 722 723 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 724 MSR_VMX_TRUE_PROCBASED_CTLS, 725 PROCBASED_PAUSE_EXITING, 0, 726 &tmp) == 0); 727 728 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 729 MSR_VMX_PROCBASED_CTLS2, 730 PROCBASED2_UNRESTRICTED_GUEST, 0, 731 &tmp) == 0); 732 733 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 734 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 735 &tmp) == 0); 736 737 /* 738 * Check support 
for virtual interrupt delivery. 739 */ 740 procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | 741 PROCBASED2_VIRTUALIZE_X2APIC_MODE | 742 PROCBASED2_APIC_REGISTER_VIRTUALIZATION | 743 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); 744 745 use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 746 MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, 747 &tmp) == 0); 748 749 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 750 procbased2_vid_bits, 0, &tmp); 751 if (error == 0 && use_tpr_shadow) { 752 virtual_interrupt_delivery = 1; 753 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", 754 &virtual_interrupt_delivery); 755 } 756 757 if (virtual_interrupt_delivery) { 758 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 759 procbased_ctls2 |= procbased2_vid_bits; 760 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; 761 762 /* 763 * No need to emulate accesses to %CR8 if virtual 764 * interrupt delivery is enabled. 765 */ 766 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; 767 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; 768 769 /* 770 * Check for Posted Interrupts only if Virtual Interrupt 771 * Delivery is enabled. 772 */ 773 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 774 MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, 775 &tmp); 776 if (error == 0) { 777 pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : 778 &IDTVEC(justreturn)); 779 if (pirvec < 0) { 780 if (bootverbose) { 781 printf("vmx_init: unable to allocate " 782 "posted interrupt vector\n"); 783 } 784 } else { 785 posted_interrupts = 1; 786 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", 787 &posted_interrupts); 788 } 789 } 790 } 791 792 if (posted_interrupts) 793 pinbased_ctls |= PINBASED_POSTED_INTERRUPT; 794 795 /* Initialize EPT */ 796 error = ept_init(ipinum); 797 if (error) { 798 printf("vmx_init: ept initialization failed (%d)\n", error); 799 return (error); 800 } 801 802 /* 803 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 804 */ 805 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 806 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 807 cr0_ones_mask = fixed0 & fixed1; 808 cr0_zeros_mask = ~fixed0 & ~fixed1; 809 810 /* 811 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 812 * if unrestricted guest execution is allowed. 813 */ 814 if (cap_unrestricted_guest) 815 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 816 817 /* 818 * Do not allow the guest to set CR0_NW or CR0_CD. 
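 *
 * The ones/zeros masks built here are consumed by vmx_fix_cr0() and
 * vmx_fix_cr4() above, and by vmx_setup_cr_shadow() below, which loads
 * (cr0_ones_mask | cr0_zeros_mask) into the VMCS CR0 guest/host mask so
 * that guest attempts to change any of these bits trigger a CR-access
 * VM-exit.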
819 */ 820 cr0_zeros_mask |= (CR0_NW | CR0_CD); 821 822 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 823 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 824 cr4_ones_mask = fixed0 & fixed1; 825 cr4_zeros_mask = ~fixed0 & ~fixed1; 826 827 vpid_init(); 828 829 vmx_msr_init(); 830 831 /* enable VMX operation */ 832 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 833 834 vmx_initialized = 1; 835 836 return (0); 837 } 838 839 static void 840 vmx_trigger_hostintr(int vector) 841 { 842 uintptr_t func; 843 struct gate_descriptor *gd; 844 845 gd = &idt[vector]; 846 847 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 848 "invalid vector %d", vector)); 849 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 850 vector)); 851 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 852 "has invalid type %d", vector, gd->gd_type)); 853 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 854 "has invalid dpl %d", vector, gd->gd_dpl)); 855 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 856 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 857 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 858 "IST %d", vector, gd->gd_ist)); 859 860 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 861 vmx_call_isr(func); 862 } 863 864 static int 865 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 866 { 867 int error, mask_ident, shadow_ident; 868 uint64_t mask_value; 869 870 if (which != 0 && which != 4) 871 panic("vmx_setup_cr_shadow: unknown cr%d", which); 872 873 if (which == 0) { 874 mask_ident = VMCS_CR0_MASK; 875 mask_value = cr0_ones_mask | cr0_zeros_mask; 876 shadow_ident = VMCS_CR0_SHADOW; 877 } else { 878 mask_ident = VMCS_CR4_MASK; 879 mask_value = cr4_ones_mask | cr4_zeros_mask; 880 shadow_ident = VMCS_CR4_SHADOW; 881 } 882 883 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 884 if (error) 885 return (error); 886 887 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 888 if (error) 889 return (error); 890 891 return (0); 892 } 893 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 894 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 895 896 static void * 897 vmx_vminit(struct vm *vm, pmap_t pmap) 898 { 899 uint16_t vpid[VM_MAXCPU]; 900 int i, error; 901 struct vmx *vmx; 902 struct vmcs *vmcs; 903 uint32_t exc_bitmap; 904 905 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 906 if ((uintptr_t)vmx & PAGE_MASK) { 907 panic("malloc of struct vmx not aligned on %d byte boundary", 908 PAGE_SIZE); 909 } 910 vmx->vm = vm; 911 912 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 913 914 /* 915 * Clean up EPTP-tagged guest physical and combined mappings 916 * 917 * VMX transitions are not required to invalidate any guest physical 918 * mappings. So, it may be possible for stale guest physical mappings 919 * to be present in the processor TLBs. 920 * 921 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 922 */ 923 ept_invalidate_mappings(vmx->eptp); 924 925 msr_bitmap_initialize(vmx->msr_bitmap); 926 927 /* 928 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 929 * The guest FSBASE and GSBASE are saved and restored during 930 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 931 * always restored from the vmcs host state area on vm-exit. 
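 *
 * (The MSR bitmap control itself is part of PROCBASED_CTLS_ONE_SETTING,
 * so it is always available here; if any of the bitmap updates below
 * fails the code simply panics.)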
932 * 933 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 934 * how they are saved/restored so can be directly accessed by the 935 * guest. 936 * 937 * MSR_EFER is saved and restored in the guest VMCS area on a 938 * VM exit and entry respectively. It is also restored from the 939 * host VMCS area on a VM exit. 940 * 941 * The TSC MSR is exposed read-only. Writes are disallowed as 942 * that will impact the host TSC. If the guest does a write 943 * the "use TSC offsetting" execution control is enabled and the 944 * difference between the host TSC and the guest TSC is written 945 * into the TSC offset in the VMCS. 946 */ 947 if (guest_msr_rw(vmx, MSR_GSBASE) || 948 guest_msr_rw(vmx, MSR_FSBASE) || 949 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 950 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 951 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 952 guest_msr_rw(vmx, MSR_EFER) || 953 guest_msr_ro(vmx, MSR_TSC)) 954 panic("vmx_vminit: error setting guest msr access"); 955 956 vpid_alloc(vpid, VM_MAXCPU); 957 958 if (virtual_interrupt_delivery) { 959 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 960 APIC_ACCESS_ADDRESS); 961 /* XXX this should really return an error to the caller */ 962 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 963 } 964 965 for (i = 0; i < VM_MAXCPU; i++) { 966 vmcs = &vmx->vmcs[i]; 967 vmcs->identifier = vmx_revision(); 968 error = vmclear(vmcs); 969 if (error != 0) { 970 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 971 error, i); 972 } 973 974 vmx_msr_guest_init(vmx, i); 975 976 error = vmcs_init(vmcs); 977 KASSERT(error == 0, ("vmcs_init error %d", error)); 978 979 VMPTRLD(vmcs); 980 error = 0; 981 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 982 error += vmwrite(VMCS_EPTP, vmx->eptp); 983 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 984 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 985 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 986 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 987 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 988 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 989 error += vmwrite(VMCS_VPID, vpid[i]); 990 991 /* exception bitmap */ 992 if (vcpu_trace_exceptions(vm, i)) 993 exc_bitmap = 0xffffffff; 994 else 995 exc_bitmap = 1 << IDT_MC; 996 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); 997 998 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; 999 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); 1000 1001 if (virtual_interrupt_delivery) { 1002 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 1003 error += vmwrite(VMCS_VIRTUAL_APIC, 1004 vtophys(&vmx->apic_page[i])); 1005 error += vmwrite(VMCS_EOI_EXIT0, 0); 1006 error += vmwrite(VMCS_EOI_EXIT1, 0); 1007 error += vmwrite(VMCS_EOI_EXIT2, 0); 1008 error += vmwrite(VMCS_EOI_EXIT3, 0); 1009 } 1010 if (posted_interrupts) { 1011 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 1012 error += vmwrite(VMCS_PIR_DESC, 1013 vtophys(&vmx->pir_desc[i])); 1014 } 1015 VMCLEAR(vmcs); 1016 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 1017 1018 vmx->cap[i].set = 0; 1019 vmx->cap[i].proc_ctls = procbased_ctls; 1020 vmx->cap[i].proc_ctls2 = procbased_ctls2; 1021 1022 vmx->state[i].nextrip = ~0; 1023 vmx->state[i].lastcpu = NOCPU; 1024 vmx->state[i].vpid = vpid[i]; 1025 1026 /* 1027 * Set up the CR0/4 shadows, and init the read shadow 1028 * to the power-on register value from the Intel Sys Arch. 
1029 * CR0 - 0x60000010 1030 * CR4 - 0 1031 */ 1032 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 1033 if (error != 0) 1034 panic("vmx_setup_cr0_shadow %d", error); 1035 1036 error = vmx_setup_cr4_shadow(vmcs, 0); 1037 if (error != 0) 1038 panic("vmx_setup_cr4_shadow %d", error); 1039 1040 vmx->ctx[i].pmap = pmap; 1041 } 1042 1043 return (vmx); 1044 } 1045 1046 static int 1047 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 1048 { 1049 int handled, func; 1050 1051 func = vmxctx->guest_rax; 1052 1053 handled = x86_emulate_cpuid(vm, vcpu, 1054 (uint32_t*)(&vmxctx->guest_rax), 1055 (uint32_t*)(&vmxctx->guest_rbx), 1056 (uint32_t*)(&vmxctx->guest_rcx), 1057 (uint32_t*)(&vmxctx->guest_rdx)); 1058 return (handled); 1059 } 1060 1061 static __inline void 1062 vmx_run_trace(struct vmx *vmx, int vcpu) 1063 { 1064 #ifdef KTR 1065 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1066 #endif 1067 } 1068 1069 static __inline void 1070 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 1071 int handled) 1072 { 1073 #ifdef KTR 1074 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 1075 handled ? "handled" : "unhandled", 1076 exit_reason_to_str(exit_reason), rip); 1077 #endif 1078 } 1079 1080 static __inline void 1081 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 1082 { 1083 #ifdef KTR 1084 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 1085 #endif 1086 } 1087 1088 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1089 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); 1090 1091 /* 1092 * Invalidate guest mappings identified by its vpid from the TLB. 1093 */ 1094 static __inline void 1095 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) 1096 { 1097 struct vmxstate *vmxstate; 1098 struct invvpid_desc invvpid_desc; 1099 1100 vmxstate = &vmx->state[vcpu]; 1101 if (vmxstate->vpid == 0) 1102 return; 1103 1104 if (!running) { 1105 /* 1106 * Set the 'lastcpu' to an invalid host cpu. 1107 * 1108 * This will invalidate TLB entries tagged with the vcpu's 1109 * vpid the next time it runs via vmx_set_pcpu_defaults(). 1110 */ 1111 vmxstate->lastcpu = NOCPU; 1112 return; 1113 } 1114 1115 KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " 1116 "critical section", __func__, vcpu)); 1117 1118 /* 1119 * Invalidate all mappings tagged with 'vpid' 1120 * 1121 * We do this because this vcpu was executing on a different host 1122 * cpu when it last ran. We do not track whether it invalidated 1123 * mappings associated with its 'vpid' during that run. So we must 1124 * assume that the mappings associated with 'vpid' on 'curcpu' are 1125 * stale and invalidate them. 1126 * 1127 * Note that we incur this penalty only when the scheduler chooses to 1128 * move the thread associated with this vcpu between host cpus. 1129 * 1130 * Note also that this will invalidate mappings tagged with 'vpid' 1131 * for "all" EP4TAs. 1132 */ 1133 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { 1134 invvpid_desc._res1 = 0; 1135 invvpid_desc._res2 = 0; 1136 invvpid_desc.vpid = vmxstate->vpid; 1137 invvpid_desc.linear_addr = 0; 1138 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1139 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); 1140 } else { 1141 /* 1142 * The invvpid can be skipped if an invept is going to 1143 * be performed before entering the guest. The invept 1144 * will invalidate combined mappings tagged with 1145 * 'vmx->eptp' for all vpids. 
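 *
 * That is the case when 'pmap->pm_eptgen' no longer matches the copy
 * cached in 'vmx->eptgen[curcpu]': the EPT mappings have been
 * invalidated since this cpu last entered the guest, so the pending
 * invept makes the per-VPID flush redundant.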
1146 */ 1147 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1148 } 1149 } 1150 1151 static void 1152 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) 1153 { 1154 struct vmxstate *vmxstate; 1155 1156 vmxstate = &vmx->state[vcpu]; 1157 if (vmxstate->lastcpu == curcpu) 1158 return; 1159 1160 vmxstate->lastcpu = curcpu; 1161 1162 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 1163 1164 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1165 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1166 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1167 vmx_invvpid(vmx, vcpu, pmap, 1); 1168 } 1169 1170 /* 1171 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1172 */ 1173 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1174 1175 static void __inline 1176 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1177 { 1178 1179 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1180 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1181 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1182 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1183 } 1184 } 1185 1186 static void __inline 1187 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1188 { 1189 1190 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1191 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1192 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1193 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1194 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1195 } 1196 1197 static void __inline 1198 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1199 { 1200 1201 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1202 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1203 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1204 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1205 } 1206 } 1207 1208 static void __inline 1209 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1210 { 1211 1212 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1213 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1214 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1215 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1216 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1217 } 1218 1219 int 1220 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) 1221 { 1222 int error; 1223 1224 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { 1225 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; 1226 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1227 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); 1228 } 1229 1230 error = vmwrite(VMCS_TSC_OFFSET, offset); 1231 1232 return (error); 1233 } 1234 1235 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ 1236 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1237 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1238 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1239 1240 static void 1241 vmx_inject_nmi(struct vmx *vmx, int vcpu) 1242 { 1243 uint32_t gi, info; 1244 1245 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1246 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1247 "interruptibility-state %#x", gi)); 1248 1249 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1250 KASSERT((info & VMCS_INTR_VALID) == 0, 
("vmx_inject_nmi: invalid " 1251 "VM-entry interruption information %#x", info)); 1252 1253 /* 1254 * Inject the virtual NMI. The vector must be the NMI IDT entry 1255 * or the VMCS entry check will fail. 1256 */ 1257 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1258 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1259 1260 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1261 1262 /* Clear the request */ 1263 vm_nmi_clear(vmx->vm, vcpu); 1264 } 1265 1266 static void 1267 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, 1268 uint64_t guestrip) 1269 { 1270 int vector, need_nmi_exiting, extint_pending; 1271 uint64_t rflags, entryinfo; 1272 uint32_t gi, info; 1273 1274 if (vmx->state[vcpu].nextrip != guestrip) { 1275 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1276 if (gi & HWINTR_BLOCKING) { 1277 VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " 1278 "cleared due to rip change: %#lx/%#lx", 1279 vmx->state[vcpu].nextrip, guestrip); 1280 gi &= ~HWINTR_BLOCKING; 1281 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1282 } 1283 } 1284 1285 if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { 1286 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " 1287 "intinfo is not valid: %#lx", __func__, entryinfo)); 1288 1289 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1290 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1291 "pending exception: %#lx/%#x", __func__, entryinfo, info)); 1292 1293 info = entryinfo; 1294 vector = info & 0xff; 1295 if (vector == IDT_BP || vector == IDT_OF) { 1296 /* 1297 * VT-x requires #BP and #OF to be injected as software 1298 * exceptions. 1299 */ 1300 info &= ~VMCS_INTR_T_MASK; 1301 info |= VMCS_INTR_T_SWEXCEPTION; 1302 } 1303 1304 if (info & VMCS_INTR_DEL_ERRCODE) 1305 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); 1306 1307 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1308 } 1309 1310 if (vm_nmi_pending(vmx->vm, vcpu)) { 1311 /* 1312 * If there are no conditions blocking NMI injection then 1313 * inject it directly here otherwise enable "NMI window 1314 * exiting" to inject it as soon as we can. 1315 * 1316 * We also check for STI_BLOCKING because some implementations 1317 * don't allow NMI injection in this case. If we are running 1318 * on a processor that doesn't have this restriction it will 1319 * immediately exit and the NMI will be injected in the 1320 * "NMI window exiting" handler. 1321 */ 1322 need_nmi_exiting = 1; 1323 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1324 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1325 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1326 if ((info & VMCS_INTR_VALID) == 0) { 1327 vmx_inject_nmi(vmx, vcpu); 1328 need_nmi_exiting = 0; 1329 } else { 1330 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1331 "due to VM-entry intr info %#x", info); 1332 } 1333 } else { 1334 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1335 "Guest Interruptibility-state %#x", gi); 1336 } 1337 1338 if (need_nmi_exiting) 1339 vmx_set_nmi_window_exiting(vmx, vcpu); 1340 } 1341 1342 extint_pending = vm_extint_pending(vmx->vm, vcpu); 1343 1344 if (!extint_pending && virtual_interrupt_delivery) { 1345 vmx_inject_pir(vlapic); 1346 return; 1347 } 1348 1349 /* 1350 * If interrupt-window exiting is already in effect then don't bother 1351 * checking for pending interrupts. This is just an optimization and 1352 * not needed for correctness. 
1353 */ 1354 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1355 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1356 "pending int_window_exiting"); 1357 return; 1358 } 1359 1360 if (!extint_pending) { 1361 /* Ask the local apic for a vector to inject */ 1362 if (!vlapic_pending_intr(vlapic, &vector)) 1363 return; 1364 1365 /* 1366 * From the Intel SDM, Volume 3, Section "Maskable 1367 * Hardware Interrupts": 1368 * - maskable interrupt vectors [16,255] can be delivered 1369 * through the local APIC. 1370 */ 1371 KASSERT(vector >= 16 && vector <= 255, 1372 ("invalid vector %d from local APIC", vector)); 1373 } else { 1374 /* Ask the legacy pic for a vector to inject */ 1375 vatpic_pending_intr(vmx->vm, &vector); 1376 1377 /* 1378 * From the Intel SDM, Volume 3, Section "Maskable 1379 * Hardware Interrupts": 1380 * - maskable interrupt vectors [0,255] can be delivered 1381 * through the INTR pin. 1382 */ 1383 KASSERT(vector >= 0 && vector <= 255, 1384 ("invalid vector %d from INTR", vector)); 1385 } 1386 1387 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1388 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1389 if ((rflags & PSL_I) == 0) { 1390 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1391 "rflags %#lx", vector, rflags); 1392 goto cantinject; 1393 } 1394 1395 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1396 if (gi & HWINTR_BLOCKING) { 1397 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1398 "Guest Interruptibility-state %#x", vector, gi); 1399 goto cantinject; 1400 } 1401 1402 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1403 if (info & VMCS_INTR_VALID) { 1404 /* 1405 * This is expected and could happen for multiple reasons: 1406 * - A vectoring VM-entry was aborted due to astpending 1407 * - A VM-exit happened during event injection. 1408 * - An exception was injected above. 1409 * - An NMI was injected above or after "NMI window exiting" 1410 */ 1411 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1412 "VM-entry intr info %#x", vector, info); 1413 goto cantinject; 1414 } 1415 1416 /* Inject the interrupt */ 1417 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1418 info |= vector; 1419 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1420 1421 if (!extint_pending) { 1422 /* Update the Local APIC ISR */ 1423 vlapic_intr_accepted(vlapic, vector); 1424 } else { 1425 vm_extint_clear(vmx->vm, vcpu); 1426 vatpic_intr_accepted(vmx->vm, vector); 1427 1428 /* 1429 * After we accepted the current ExtINT the PIC may 1430 * have posted another one. If that is the case, set 1431 * the Interrupt Window Exiting execution control so 1432 * we can inject that one too. 1433 * 1434 * Also, interrupt window exiting allows us to inject any 1435 * pending APIC vector that was preempted by the ExtINT 1436 * as soon as possible. This applies both for the software 1437 * emulated vlapic and the hardware assisted virtual APIC. 1438 */ 1439 vmx_set_int_window_exiting(vmx, vcpu); 1440 } 1441 1442 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1443 1444 return; 1445 1446 cantinject: 1447 /* 1448 * Set the Interrupt Window Exiting execution control so we can inject 1449 * the interrupt as soon as blocking condition goes away. 1450 */ 1451 vmx_set_int_window_exiting(vmx, vcpu); 1452 } 1453 1454 /* 1455 * If the Virtual NMIs execution control is '1' then the logical processor 1456 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1457 * the VMCS. 
An IRET instruction in VMX non-root operation will remove any 1458 * virtual-NMI blocking. 1459 * 1460 * This unblocking occurs even if the IRET causes a fault. In this case the 1461 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 1462 */ 1463 static void 1464 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) 1465 { 1466 uint32_t gi; 1467 1468 VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); 1469 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1470 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1471 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1472 } 1473 1474 static void 1475 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) 1476 { 1477 uint32_t gi; 1478 1479 VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); 1480 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1481 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1482 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1483 } 1484 1485 static void 1486 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) 1487 { 1488 uint32_t gi; 1489 1490 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1491 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, 1492 ("NMI blocking is not in effect %#x", gi)); 1493 } 1494 1495 static int 1496 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1497 { 1498 struct vmxctx *vmxctx; 1499 uint64_t xcrval; 1500 const struct xsave_limits *limits; 1501 1502 vmxctx = &vmx->ctx[vcpu]; 1503 limits = vmm_get_xsave_limits(); 1504 1505 /* 1506 * Note that the processor raises a GP# fault on its own if 1507 * xsetbv is executed for CPL != 0, so we do not have to 1508 * emulate that fault here. 1509 */ 1510 1511 /* Only xcr0 is supported. */ 1512 if (vmxctx->guest_rcx != 0) { 1513 vm_inject_gp(vmx->vm, vcpu); 1514 return (HANDLED); 1515 } 1516 1517 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1518 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1519 vm_inject_ud(vmx->vm, vcpu); 1520 return (HANDLED); 1521 } 1522 1523 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1524 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1525 vm_inject_gp(vmx->vm, vcpu); 1526 return (HANDLED); 1527 } 1528 1529 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1530 vm_inject_gp(vmx->vm, vcpu); 1531 return (HANDLED); 1532 } 1533 1534 /* AVX (YMM_Hi128) requires SSE. */ 1535 if (xcrval & XFEATURE_ENABLED_AVX && 1536 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1537 vm_inject_gp(vmx->vm, vcpu); 1538 return (HANDLED); 1539 } 1540 1541 /* 1542 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1543 * ZMM_Hi256, and Hi16_ZMM. 1544 */ 1545 if (xcrval & XFEATURE_AVX512 && 1546 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1547 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1548 vm_inject_gp(vmx->vm, vcpu); 1549 return (HANDLED); 1550 } 1551 1552 /* 1553 * Intel MPX requires both bound register state flags to be 1554 * set. 1555 */ 1556 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1557 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1558 vm_inject_gp(vmx->vm, vcpu); 1559 return (HANDLED); 1560 } 1561 1562 /* 1563 * This runs "inside" vmrun() with the guest's FPU state, so 1564 * modifying xcr0 directly modifies the guest's xcr0, not the 1565 * host's. 
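 *
 * As a concrete example of the checks above: XCR0 = 0x6 (SSE and AVX
 * without the x87 bit) fails the XFEATURE_ENABLED_X87 test, and
 * XCR0 = 0x5 (x87 and AVX without SSE) fails the AVX-requires-SSE
 * test; both attempts result in a #GP being injected into the guest.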
1566 */ 1567 load_xcr(0, xcrval); 1568 return (HANDLED); 1569 } 1570 1571 static uint64_t 1572 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) 1573 { 1574 const struct vmxctx *vmxctx; 1575 1576 vmxctx = &vmx->ctx[vcpu]; 1577 1578 switch (ident) { 1579 case 0: 1580 return (vmxctx->guest_rax); 1581 case 1: 1582 return (vmxctx->guest_rcx); 1583 case 2: 1584 return (vmxctx->guest_rdx); 1585 case 3: 1586 return (vmxctx->guest_rbx); 1587 case 4: 1588 return (vmcs_read(VMCS_GUEST_RSP)); 1589 case 5: 1590 return (vmxctx->guest_rbp); 1591 case 6: 1592 return (vmxctx->guest_rsi); 1593 case 7: 1594 return (vmxctx->guest_rdi); 1595 case 8: 1596 return (vmxctx->guest_r8); 1597 case 9: 1598 return (vmxctx->guest_r9); 1599 case 10: 1600 return (vmxctx->guest_r10); 1601 case 11: 1602 return (vmxctx->guest_r11); 1603 case 12: 1604 return (vmxctx->guest_r12); 1605 case 13: 1606 return (vmxctx->guest_r13); 1607 case 14: 1608 return (vmxctx->guest_r14); 1609 case 15: 1610 return (vmxctx->guest_r15); 1611 default: 1612 panic("invalid vmx register %d", ident); 1613 } 1614 } 1615 1616 static void 1617 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) 1618 { 1619 struct vmxctx *vmxctx; 1620 1621 vmxctx = &vmx->ctx[vcpu]; 1622 1623 switch (ident) { 1624 case 0: 1625 vmxctx->guest_rax = regval; 1626 break; 1627 case 1: 1628 vmxctx->guest_rcx = regval; 1629 break; 1630 case 2: 1631 vmxctx->guest_rdx = regval; 1632 break; 1633 case 3: 1634 vmxctx->guest_rbx = regval; 1635 break; 1636 case 4: 1637 vmcs_write(VMCS_GUEST_RSP, regval); 1638 break; 1639 case 5: 1640 vmxctx->guest_rbp = regval; 1641 break; 1642 case 6: 1643 vmxctx->guest_rsi = regval; 1644 break; 1645 case 7: 1646 vmxctx->guest_rdi = regval; 1647 break; 1648 case 8: 1649 vmxctx->guest_r8 = regval; 1650 break; 1651 case 9: 1652 vmxctx->guest_r9 = regval; 1653 break; 1654 case 10: 1655 vmxctx->guest_r10 = regval; 1656 break; 1657 case 11: 1658 vmxctx->guest_r11 = regval; 1659 break; 1660 case 12: 1661 vmxctx->guest_r12 = regval; 1662 break; 1663 case 13: 1664 vmxctx->guest_r13 = regval; 1665 break; 1666 case 14: 1667 vmxctx->guest_r14 = regval; 1668 break; 1669 case 15: 1670 vmxctx->guest_r15 = regval; 1671 break; 1672 default: 1673 panic("invalid vmx register %d", ident); 1674 } 1675 } 1676 1677 static int 1678 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1679 { 1680 uint64_t crval, regval; 1681 1682 /* We only handle mov to %cr0 at this time */ 1683 if ((exitqual & 0xf0) != 0x00) 1684 return (UNHANDLED); 1685 1686 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1687 1688 vmcs_write(VMCS_CR0_SHADOW, regval); 1689 1690 crval = regval | cr0_ones_mask; 1691 crval &= ~cr0_zeros_mask; 1692 vmcs_write(VMCS_GUEST_CR0, crval); 1693 1694 if (regval & CR0_PG) { 1695 uint64_t efer, entry_ctls; 1696 1697 /* 1698 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1699 * the "IA-32e mode guest" bit in VM-entry control must be 1700 * equal. 
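 *
 * VM-entry would otherwise fail its consistency checks, so when the
 * guest enables paging with EFER.LME already set the code below sets
 * EFER.LMA in the guest EFER and the corresponding VM_ENTRY_GUEST_LMA
 * ("IA-32e mode guest") bit in the VM-entry controls on its behalf.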
1701 */ 1702 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1703 if (efer & EFER_LME) { 1704 efer |= EFER_LMA; 1705 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1706 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1707 entry_ctls |= VM_ENTRY_GUEST_LMA; 1708 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1709 } 1710 } 1711 1712 return (HANDLED); 1713 } 1714 1715 static int 1716 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1717 { 1718 uint64_t crval, regval; 1719 1720 /* We only handle mov to %cr4 at this time */ 1721 if ((exitqual & 0xf0) != 0x00) 1722 return (UNHANDLED); 1723 1724 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1725 1726 vmcs_write(VMCS_CR4_SHADOW, regval); 1727 1728 crval = regval | cr4_ones_mask; 1729 crval &= ~cr4_zeros_mask; 1730 vmcs_write(VMCS_GUEST_CR4, crval); 1731 1732 return (HANDLED); 1733 } 1734 1735 static int 1736 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1737 { 1738 struct vlapic *vlapic; 1739 uint64_t cr8; 1740 int regnum; 1741 1742 /* We only handle mov %cr8 to/from a register at this time. */ 1743 if ((exitqual & 0xe0) != 0x00) { 1744 return (UNHANDLED); 1745 } 1746 1747 vlapic = vm_lapic(vmx->vm, vcpu); 1748 regnum = (exitqual >> 8) & 0xf; 1749 if (exitqual & 0x10) { 1750 cr8 = vlapic_get_cr8(vlapic); 1751 vmx_set_guest_reg(vmx, vcpu, regnum, cr8); 1752 } else { 1753 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); 1754 vlapic_set_cr8(vlapic, cr8); 1755 } 1756 1757 return (HANDLED); 1758 } 1759 1760 /* 1761 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL 1762 */ 1763 static int 1764 vmx_cpl(void) 1765 { 1766 uint32_t ssar; 1767 1768 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); 1769 return ((ssar >> 5) & 0x3); 1770 } 1771 1772 static enum vm_cpu_mode 1773 vmx_cpu_mode(void) 1774 { 1775 uint32_t csar; 1776 1777 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { 1778 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1779 if (csar & 0x2000) 1780 return (CPU_MODE_64BIT); /* CS.L = 1 */ 1781 else 1782 return (CPU_MODE_COMPATIBILITY); 1783 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { 1784 return (CPU_MODE_PROTECTED); 1785 } else { 1786 return (CPU_MODE_REAL); 1787 } 1788 } 1789 1790 static enum vm_paging_mode 1791 vmx_paging_mode(void) 1792 { 1793 1794 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1795 return (PAGING_MODE_FLAT); 1796 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) 1797 return (PAGING_MODE_32); 1798 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) 1799 return (PAGING_MODE_64); 1800 else 1801 return (PAGING_MODE_PAE); 1802 } 1803 1804 static uint64_t 1805 inout_str_index(struct vmx *vmx, int vcpuid, int in) 1806 { 1807 uint64_t val; 1808 int error; 1809 enum vm_reg_name reg; 1810 1811 reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 1812 error = vmx_getreg(vmx, vcpuid, reg, &val); 1813 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 1814 return (val); 1815 } 1816 1817 static uint64_t 1818 inout_str_count(struct vmx *vmx, int vcpuid, int rep) 1819 { 1820 uint64_t val; 1821 int error; 1822 1823 if (rep) { 1824 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); 1825 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 1826 } else { 1827 val = 1; 1828 } 1829 return (val); 1830 } 1831 1832 static int 1833 inout_str_addrsize(uint32_t inst_info) 1834 { 1835 uint32_t size; 1836 1837 size = (inst_info >> 7) & 0x7; 1838 switch (size) { 1839 case 0: 1840 return (2); /* 16 bit */ 1841 case 1: 1842 return (4); /* 32 bit */ 1843 case 2: 1844 return (8); /* 64 bit */ 1845 default: 1846 panic("%s: invalid size encoding %d", __func__, size); 1847 } 1848 } 1849 1850 static void 1851 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, 1852 struct vm_inout_str *vis) 1853 { 1854 int error, s; 1855 1856 if (in) { 1857 vis->seg_name = VM_REG_GUEST_ES; 1858 } else { 1859 s = (inst_info >> 15) & 0x7; 1860 vis->seg_name = vm_segment_name(s); 1861 } 1862 1863 error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); 1864 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 1865 } 1866 1867 static void 1868 vmx_paging_info(struct vm_guest_paging *paging) 1869 { 1870 paging->cr3 = vmcs_guest_cr3(); 1871 paging->cpl = vmx_cpl(); 1872 paging->cpu_mode = vmx_cpu_mode(); 1873 paging->paging_mode = vmx_paging_mode(); 1874 } 1875 1876 static void 1877 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 1878 { 1879 struct vm_guest_paging *paging; 1880 uint32_t csar; 1881 1882 paging = &vmexit->u.inst_emul.paging; 1883 1884 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1885 vmexit->inst_length = 0; 1886 vmexit->u.inst_emul.gpa = gpa; 1887 vmexit->u.inst_emul.gla = gla; 1888 vmx_paging_info(paging); 1889 switch (paging->cpu_mode) { 1890 case CPU_MODE_REAL: 1891 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1892 vmexit->u.inst_emul.cs_d = 0; 1893 break; 1894 case CPU_MODE_PROTECTED: 1895 case CPU_MODE_COMPATIBILITY: 1896 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1897 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1898 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 1899 break; 1900 default: 1901 vmexit->u.inst_emul.cs_base = 0; 1902 vmexit->u.inst_emul.cs_d = 0; 1903 break; 1904 } 1905 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 1906 } 1907 1908 static int 1909 ept_fault_type(uint64_t ept_qual) 1910 { 1911 int fault_type; 1912 1913 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1914 fault_type = VM_PROT_WRITE; 1915 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1916 fault_type = VM_PROT_EXECUTE; 1917 else 1918 fault_type= VM_PROT_READ; 1919 1920 return (fault_type); 1921 } 1922 1923 static boolean_t 1924 ept_emulation_fault(uint64_t ept_qual) 1925 { 1926 int read, write; 1927 1928 /* EPT fault on an instruction fetch doesn't make sense here */ 1929 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1930 return (FALSE); 1931 1932 /* EPT fault must be a read fault or a write fault */ 1933 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1934 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1935 if ((read | write) == 0) 1936 return (FALSE); 1937 1938 /* 1939 * The EPT violation must have been caused by accessing a 1940 * guest-physical address that is a translation of a guest-linear 1941 * address. 
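 *
 * Violations where either qualification bit checked below is clear
 * (for example an access made by the guest's own page-table walk) are
 * not candidates for instruction emulation and are handled by the
 * caller as ordinary nested page faults.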
1942 */ 1943 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1944 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1945 return (FALSE); 1946 } 1947 1948 return (TRUE); 1949 } 1950 1951 static __inline int 1952 apic_access_virtualization(struct vmx *vmx, int vcpuid) 1953 { 1954 uint32_t proc_ctls2; 1955 1956 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1957 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 1958 } 1959 1960 static __inline int 1961 x2apic_virtualization(struct vmx *vmx, int vcpuid) 1962 { 1963 uint32_t proc_ctls2; 1964 1965 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1966 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 1967 } 1968 1969 static int 1970 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 1971 uint64_t qual) 1972 { 1973 int error, handled, offset; 1974 uint32_t *apic_regs, vector; 1975 bool retu; 1976 1977 handled = HANDLED; 1978 offset = APIC_WRITE_OFFSET(qual); 1979 1980 if (!apic_access_virtualization(vmx, vcpuid)) { 1981 /* 1982 * In general there should not be any APIC write VM-exits 1983 * unless APIC-access virtualization is enabled. 1984 * 1985 * However self-IPI virtualization can legitimately trigger 1986 * an APIC-write VM-exit so treat it specially. 1987 */ 1988 if (x2apic_virtualization(vmx, vcpuid) && 1989 offset == APIC_OFFSET_SELF_IPI) { 1990 apic_regs = (uint32_t *)(vlapic->apic_page); 1991 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 1992 vlapic_self_ipi_handler(vlapic, vector); 1993 return (HANDLED); 1994 } else 1995 return (UNHANDLED); 1996 } 1997 1998 switch (offset) { 1999 case APIC_OFFSET_ID: 2000 vlapic_id_write_handler(vlapic); 2001 break; 2002 case APIC_OFFSET_LDR: 2003 vlapic_ldr_write_handler(vlapic); 2004 break; 2005 case APIC_OFFSET_DFR: 2006 vlapic_dfr_write_handler(vlapic); 2007 break; 2008 case APIC_OFFSET_SVR: 2009 vlapic_svr_write_handler(vlapic); 2010 break; 2011 case APIC_OFFSET_ESR: 2012 vlapic_esr_write_handler(vlapic); 2013 break; 2014 case APIC_OFFSET_ICR_LOW: 2015 retu = false; 2016 error = vlapic_icrlo_write_handler(vlapic, &retu); 2017 if (error != 0 || retu) 2018 handled = UNHANDLED; 2019 break; 2020 case APIC_OFFSET_CMCI_LVT: 2021 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2022 vlapic_lvt_write_handler(vlapic, offset); 2023 break; 2024 case APIC_OFFSET_TIMER_ICR: 2025 vlapic_icrtmr_write_handler(vlapic); 2026 break; 2027 case APIC_OFFSET_TIMER_DCR: 2028 vlapic_dcr_write_handler(vlapic); 2029 break; 2030 default: 2031 handled = UNHANDLED; 2032 break; 2033 } 2034 return (handled); 2035 } 2036 2037 static bool 2038 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2039 { 2040 2041 if (apic_access_virtualization(vmx, vcpuid) && 2042 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2043 return (true); 2044 else 2045 return (false); 2046 } 2047 2048 static int 2049 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2050 { 2051 uint64_t qual; 2052 int access_type, offset, allowed; 2053 2054 if (!apic_access_virtualization(vmx, vcpuid)) 2055 return (UNHANDLED); 2056 2057 qual = vmexit->u.vmx.exit_qualification; 2058 access_type = APIC_ACCESS_TYPE(qual); 2059 offset = APIC_ACCESS_OFFSET(qual); 2060 2061 allowed = 0; 2062 if (access_type == 0) { 2063 /* 2064 * Read data access to the following registers is expected. 
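 * (The access_type of 0 checked here, taken from bits 15:12 of the exit
 * qualification, denotes a data read performed through a linear address;
 * the write case handled below is access_type 1.  See "APIC-Access
 * VM Exits" in the Intel SDM.)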
2065 */ 2066 switch (offset) { 2067 case APIC_OFFSET_APR: 2068 case APIC_OFFSET_PPR: 2069 case APIC_OFFSET_RRR: 2070 case APIC_OFFSET_CMCI_LVT: 2071 case APIC_OFFSET_TIMER_CCR: 2072 allowed = 1; 2073 break; 2074 default: 2075 break; 2076 } 2077 } else if (access_type == 1) { 2078 /* 2079 * Write data access to the following registers is expected. 2080 */ 2081 switch (offset) { 2082 case APIC_OFFSET_VER: 2083 case APIC_OFFSET_APR: 2084 case APIC_OFFSET_PPR: 2085 case APIC_OFFSET_RRR: 2086 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2087 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2088 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2089 case APIC_OFFSET_CMCI_LVT: 2090 case APIC_OFFSET_TIMER_CCR: 2091 allowed = 1; 2092 break; 2093 default: 2094 break; 2095 } 2096 } 2097 2098 if (allowed) { 2099 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2100 VIE_INVALID_GLA); 2101 } 2102 2103 /* 2104 * Regardless of whether the APIC-access is allowed this handler 2105 * always returns UNHANDLED: 2106 * - if the access is allowed then it is handled by emulating the 2107 * instruction that caused the VM-exit (outside the critical section) 2108 * - if the access is not allowed then it will be converted to an 2109 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2110 */ 2111 return (UNHANDLED); 2112 } 2113 2114 static enum task_switch_reason 2115 vmx_task_switch_reason(uint64_t qual) 2116 { 2117 int reason; 2118 2119 reason = (qual >> 30) & 0x3; 2120 switch (reason) { 2121 case 0: 2122 return (TSR_CALL); 2123 case 1: 2124 return (TSR_IRET); 2125 case 2: 2126 return (TSR_JMP); 2127 case 3: 2128 return (TSR_IDT_GATE); 2129 default: 2130 panic("%s: invalid reason %d", __func__, reason); 2131 } 2132 } 2133 2134 static int 2135 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2136 { 2137 int error; 2138 2139 if (lapic_msr(num)) 2140 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2141 else 2142 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2143 2144 return (error); 2145 } 2146 2147 static int 2148 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2149 { 2150 struct vmxctx *vmxctx; 2151 uint64_t result; 2152 uint32_t eax, edx; 2153 int error; 2154 2155 if (lapic_msr(num)) 2156 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2157 else 2158 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2159 2160 if (error == 0) { 2161 eax = result; 2162 vmxctx = &vmx->ctx[vcpuid]; 2163 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2164 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2165 2166 edx = result >> 32; 2167 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2168 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2169 } 2170 2171 return (error); 2172 } 2173 2174 static int 2175 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2176 { 2177 int error, errcode, errcode_valid, handled, in; 2178 struct vmxctx *vmxctx; 2179 struct vlapic *vlapic; 2180 struct vm_inout_str *vis; 2181 struct vm_task_switch *ts; 2182 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2183 uint32_t intr_type, intr_vec, reason; 2184 uint64_t exitintinfo, qual, gpa; 2185 bool retu; 2186 2187 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2188 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2189 2190 handled = UNHANDLED; 2191 vmxctx = &vmx->ctx[vcpu]; 2192 2193 qual = vmexit->u.vmx.exit_qualification; 2194 reason = vmexit->u.vmx.exit_reason; 2195 
vmexit->exitcode = VM_EXITCODE_BOGUS; 2196 2197 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2198 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2199 2200 /* 2201 * VM-entry failures during or after loading guest state. 2202 * 2203 * These VM-exits are uncommon but must be handled specially 2204 * as most VM-exit fields are not populated as usual. 2205 */ 2206 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2207 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2208 __asm __volatile("int $18"); 2209 return (1); 2210 } 2211 2212 /* 2213 * VM exits that can be triggered during event delivery need to 2214 * be handled specially by re-injecting the event if the IDT 2215 * vectoring information field's valid bit is set. 2216 * 2217 * See "Information for VM Exits During Event Delivery" in Intel SDM 2218 * for details. 2219 */ 2220 idtvec_info = vmcs_idt_vectoring_info(); 2221 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2222 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2223 exitintinfo = idtvec_info; 2224 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2225 idtvec_err = vmcs_idt_vectoring_err(); 2226 exitintinfo |= (uint64_t)idtvec_err << 32; 2227 } 2228 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2229 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2230 __func__, error)); 2231 2232 /* 2233 * If 'virtual NMIs' are being used and the VM-exit 2234 * happened while injecting an NMI during the previous 2235 * VM-entry, then clear "blocking by NMI" in the 2236 * Guest Interruptibility-State so the NMI can be 2237 * reinjected on the subsequent VM-entry. 2238 * 2239 * However, if the NMI was being delivered through a task 2240 * gate, then the new task must start execution with NMIs 2241 * blocked so don't clear NMI blocking in this case. 2242 */ 2243 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2244 if (intr_type == VMCS_INTR_T_NMI) { 2245 if (reason != EXIT_REASON_TASK_SWITCH) 2246 vmx_clear_nmi_blocking(vmx, vcpu); 2247 else 2248 vmx_assert_nmi_blocking(vmx, vcpu); 2249 } 2250 2251 /* 2252 * Update VM-entry instruction length if the event being 2253 * delivered was a software interrupt or software exception. 2254 */ 2255 if (intr_type == VMCS_INTR_T_SWINTR || 2256 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2257 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2258 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2259 } 2260 } 2261 2262 switch (reason) { 2263 case EXIT_REASON_TASK_SWITCH: 2264 ts = &vmexit->u.task_switch; 2265 ts->tsssel = qual & 0xffff; 2266 ts->reason = vmx_task_switch_reason(qual); 2267 ts->ext = 0; 2268 ts->errcode_valid = 0; 2269 vmx_paging_info(&ts->paging); 2270 /* 2271 * If the task switch was due to a CALL, JMP, IRET, software 2272 * interrupt (INT n) or software exception (INT3, INTO), 2273 * then the saved %rip references the instruction that caused 2274 * the task switch. The instruction length field in the VMCS 2275 * is valid in this case. 2276 * 2277 * In all other cases (e.g., NMI, hardware exception) the 2278 * saved %rip is one that would have been saved in the old TSS 2279 * had the task switch completed normally so the instruction 2280 * length field is not needed in this case and is explicitly 2281 * set to 0. 
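 * For example, a task switch triggered by a two-byte "int $n" leaves the
 * saved %rip pointing at the INT instruction with a valid instruction
 * length of 2, whereas an NMI-initiated switch leaves %rip wherever the
 * NMI was recognized and inst_length is forced to 0 below.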
2282 */ 2283 if (ts->reason == TSR_IDT_GATE) { 2284 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2285 ("invalid idtvec_info %#x for IDT task switch", 2286 idtvec_info)); 2287 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2288 if (intr_type != VMCS_INTR_T_SWINTR && 2289 intr_type != VMCS_INTR_T_SWEXCEPTION && 2290 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2291 /* Task switch triggered by external event */ 2292 ts->ext = 1; 2293 vmexit->inst_length = 0; 2294 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2295 ts->errcode_valid = 1; 2296 ts->errcode = vmcs_idt_vectoring_err(); 2297 } 2298 } 2299 } 2300 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2301 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2302 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2303 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2304 ts->ext ? "external" : "internal", 2305 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2306 break; 2307 case EXIT_REASON_CR_ACCESS: 2308 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2309 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2310 switch (qual & 0xf) { 2311 case 0: 2312 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2313 break; 2314 case 4: 2315 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2316 break; 2317 case 8: 2318 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2319 break; 2320 } 2321 break; 2322 case EXIT_REASON_RDMSR: 2323 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2324 retu = false; 2325 ecx = vmxctx->guest_rcx; 2326 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2327 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2328 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2329 if (error) { 2330 vmexit->exitcode = VM_EXITCODE_RDMSR; 2331 vmexit->u.msr.code = ecx; 2332 } else if (!retu) { 2333 handled = HANDLED; 2334 } else { 2335 /* Return to userspace with a valid exitcode */ 2336 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2337 ("emulate_rdmsr retu with bogus exitcode")); 2338 } 2339 break; 2340 case EXIT_REASON_WRMSR: 2341 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2342 retu = false; 2343 eax = vmxctx->guest_rax; 2344 ecx = vmxctx->guest_rcx; 2345 edx = vmxctx->guest_rdx; 2346 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2347 ecx, (uint64_t)edx << 32 | eax); 2348 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2349 (uint64_t)edx << 32 | eax); 2350 error = emulate_wrmsr(vmx, vcpu, ecx, 2351 (uint64_t)edx << 32 | eax, &retu); 2352 if (error) { 2353 vmexit->exitcode = VM_EXITCODE_WRMSR; 2354 vmexit->u.msr.code = ecx; 2355 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2356 } else if (!retu) { 2357 handled = HANDLED; 2358 } else { 2359 /* Return to userspace with a valid exitcode */ 2360 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2361 ("emulate_wrmsr retu with bogus exitcode")); 2362 } 2363 break; 2364 case EXIT_REASON_HLT: 2365 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2366 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2367 vmexit->exitcode = VM_EXITCODE_HLT; 2368 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2369 if (virtual_interrupt_delivery) 2370 vmexit->u.hlt.intr_status = 2371 vmcs_read(VMCS_GUEST_INTR_STATUS); 2372 else 2373 vmexit->u.hlt.intr_status = 0; 2374 break; 2375 case EXIT_REASON_MTF: 2376 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2377 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2378 vmexit->exitcode = VM_EXITCODE_MTRAP; 2379 vmexit->inst_length = 0; 2380 break; 2381 case EXIT_REASON_PAUSE: 2382 vmm_stat_incr(vmx->vm, 
vcpu, VMEXIT_PAUSE, 1); 2383 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2384 vmexit->exitcode = VM_EXITCODE_PAUSE; 2385 break; 2386 case EXIT_REASON_INTR_WINDOW: 2387 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2388 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2389 vmx_clear_int_window_exiting(vmx, vcpu); 2390 return (1); 2391 case EXIT_REASON_EXT_INTR: 2392 /* 2393 * External interrupts serve only to cause VM exits and allow 2394 * the host interrupt handler to run. 2395 * 2396 * If this external interrupt triggers a virtual interrupt 2397 * to a VM, then that state will be recorded by the 2398 * host interrupt handler in the VM's softc. We will inject 2399 * this virtual interrupt during the subsequent VM enter. 2400 */ 2401 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2402 SDT_PROBE4(vmm, vmx, exit, interrupt, 2403 vmx, vcpu, vmexit, intr_info); 2404 2405 /* 2406 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2407 * This appears to be a bug in VMware Fusion? 2408 */ 2409 if (!(intr_info & VMCS_INTR_VALID)) 2410 return (1); 2411 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2412 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2413 ("VM exit interruption info invalid: %#x", intr_info)); 2414 vmx_trigger_hostintr(intr_info & 0xff); 2415 2416 /* 2417 * This is special. We want to treat this as an 'handled' 2418 * VM-exit but not increment the instruction pointer. 2419 */ 2420 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2421 return (1); 2422 case EXIT_REASON_NMI_WINDOW: 2423 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2424 /* Exit to allow the pending virtual NMI to be injected */ 2425 if (vm_nmi_pending(vmx->vm, vcpu)) 2426 vmx_inject_nmi(vmx, vcpu); 2427 vmx_clear_nmi_window_exiting(vmx, vcpu); 2428 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2429 return (1); 2430 case EXIT_REASON_INOUT: 2431 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2432 vmexit->exitcode = VM_EXITCODE_INOUT; 2433 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2434 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2435 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2436 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 2437 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2438 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2439 if (vmexit->u.inout.string) { 2440 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2441 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2442 vis = &vmexit->u.inout_str; 2443 vmx_paging_info(&vis->paging); 2444 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2445 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2446 vis->index = inout_str_index(vmx, vcpu, in); 2447 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2448 vis->addrsize = inout_str_addrsize(inst_info); 2449 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2450 } 2451 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2452 break; 2453 case EXIT_REASON_CPUID: 2454 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2455 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2456 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2457 break; 2458 case EXIT_REASON_EXCEPTION: 2459 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2460 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2461 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2462 ("VM exit interruption info invalid: %#x", intr_info)); 2463 2464 intr_vec = intr_info & 0xff; 2465 intr_type = intr_info & VMCS_INTR_T_MASK; 2466 2467 /* 2468 * If Virtual NMIs control is 1 and the VM-exit is due to a 2469 * fault encountered during the execution of IRET then we must 2470 * restore the state of "virtual-NMI blocking" before resuming 2471 * the guest. 2472 * 2473 * See "Resuming Guest Software after Handling an Exception". 2474 * See "Information for VM Exits Due to Vectored Events". 2475 */ 2476 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2477 (intr_vec != IDT_DF) && 2478 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2479 vmx_restore_nmi_blocking(vmx, vcpu); 2480 2481 /* 2482 * The NMI has already been handled in vmx_exit_handle_nmi(). 2483 */ 2484 if (intr_type == VMCS_INTR_T_NMI) 2485 return (1); 2486 2487 /* 2488 * Call the machine check handler by hand. Also don't reflect 2489 * the machine check back into the guest. 2490 */ 2491 if (intr_vec == IDT_MC) { 2492 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2493 __asm __volatile("int $18"); 2494 return (1); 2495 } 2496 2497 if (intr_vec == IDT_PF) { 2498 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2499 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2500 __func__, error)); 2501 } 2502 2503 /* 2504 * Software exceptions exhibit trap-like behavior. This in 2505 * turn requires populating the VM-entry instruction length 2506 * so that the %rip in the trap frame is past the INT3/INTO 2507 * instruction. 
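 * For example, a guest INT3 (a single 0xCC byte) must be re-injected with
 * a VM-entry instruction length of 1 so that the #BP trap frame pushed in
 * the guest reports the address of the following instruction, just as it
 * would on bare metal.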
2508 */ 2509 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2510 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2511 2512 /* Reflect all other exceptions back into the guest */ 2513 errcode_valid = errcode = 0; 2514 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2515 errcode_valid = 1; 2516 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2517 } 2518 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2519 "the guest", intr_vec, errcode); 2520 SDT_PROBE5(vmm, vmx, exit, exception, 2521 vmx, vcpu, vmexit, intr_vec, errcode); 2522 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2523 errcode_valid, errcode, 0); 2524 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2525 __func__, error)); 2526 return (1); 2527 2528 case EXIT_REASON_EPT_FAULT: 2529 /* 2530 * If 'gpa' lies within the address space allocated to 2531 * memory then this must be a nested page fault otherwise 2532 * this must be an instruction that accesses MMIO space. 2533 */ 2534 gpa = vmcs_gpa(); 2535 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2536 apic_access_fault(vmx, vcpu, gpa)) { 2537 vmexit->exitcode = VM_EXITCODE_PAGING; 2538 vmexit->inst_length = 0; 2539 vmexit->u.paging.gpa = gpa; 2540 vmexit->u.paging.fault_type = ept_fault_type(qual); 2541 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2542 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2543 vmx, vcpu, vmexit, gpa, qual); 2544 } else if (ept_emulation_fault(qual)) { 2545 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2546 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2547 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2548 vmx, vcpu, vmexit, gpa); 2549 } 2550 /* 2551 * If Virtual NMIs control is 1 and the VM-exit is due to an 2552 * EPT fault during the execution of IRET then we must restore 2553 * the state of "virtual-NMI blocking" before resuming. 2554 * 2555 * See description of "NMI unblocking due to IRET" in 2556 * "Exit Qualification for EPT Violations". 2557 */ 2558 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2559 (qual & EXIT_QUAL_NMIUDTI) != 0) 2560 vmx_restore_nmi_blocking(vmx, vcpu); 2561 break; 2562 case EXIT_REASON_VIRTUALIZED_EOI: 2563 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2564 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2565 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2566 vmexit->inst_length = 0; /* trap-like */ 2567 break; 2568 case EXIT_REASON_APIC_ACCESS: 2569 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2570 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2571 break; 2572 case EXIT_REASON_APIC_WRITE: 2573 /* 2574 * APIC-write VM exit is trap-like so the %rip is already 2575 * pointing to the next instruction. 
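 * Clearing inst_length also keeps the common "handled" path at the bottom
 * of this function from advancing the guest %rip a second time.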
2576 */ 2577 vmexit->inst_length = 0; 2578 vlapic = vm_lapic(vmx->vm, vcpu); 2579 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2580 vmx, vcpu, vmexit, vlapic); 2581 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2582 break; 2583 case EXIT_REASON_XSETBV: 2584 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2585 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2586 break; 2587 case EXIT_REASON_MONITOR: 2588 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2589 vmexit->exitcode = VM_EXITCODE_MONITOR; 2590 break; 2591 case EXIT_REASON_MWAIT: 2592 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2593 vmexit->exitcode = VM_EXITCODE_MWAIT; 2594 break; 2595 default: 2596 SDT_PROBE4(vmm, vmx, exit, unknown, 2597 vmx, vcpu, vmexit, reason); 2598 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2599 break; 2600 } 2601 2602 if (handled) { 2603 /* 2604 * It is possible that control is returned to userland 2605 * even though we were able to handle the VM exit in the 2606 * kernel. 2607 * 2608 * In such a case we want to make sure that the userland 2609 * restarts guest execution at the instruction *after* 2610 * the one we just processed. Therefore we update the 2611 * guest rip in the VMCS and in 'vmexit'. 2612 */ 2613 vmexit->rip += vmexit->inst_length; 2614 vmexit->inst_length = 0; 2615 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2616 } else { 2617 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2618 /* 2619 * If this VM exit was not claimed by anybody then 2620 * treat it as a generic VMX exit. 2621 */ 2622 vmexit->exitcode = VM_EXITCODE_VMX; 2623 vmexit->u.vmx.status = VM_SUCCESS; 2624 vmexit->u.vmx.inst_type = 0; 2625 vmexit->u.vmx.inst_error = 0; 2626 } else { 2627 /* 2628 * The exitcode and collateral have been populated. 2629 * The VM exit will be processed further in userland. 2630 */ 2631 } 2632 } 2633 2634 SDT_PROBE4(vmm, vmx, exit, return, 2635 vmx, vcpu, vmexit, handled); 2636 return (handled); 2637 } 2638 2639 static __inline void 2640 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2641 { 2642 2643 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2644 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2645 vmxctx->inst_fail_status)); 2646 2647 vmexit->inst_length = 0; 2648 vmexit->exitcode = VM_EXITCODE_VMX; 2649 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2650 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2651 vmexit->u.vmx.exit_reason = ~0; 2652 vmexit->u.vmx.exit_qualification = ~0; 2653 2654 switch (rc) { 2655 case VMX_VMRESUME_ERROR: 2656 case VMX_VMLAUNCH_ERROR: 2657 case VMX_INVEPT_ERROR: 2658 vmexit->u.vmx.inst_type = rc; 2659 break; 2660 default: 2661 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2662 } 2663 } 2664 2665 /* 2666 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2667 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2668 * sufficient to simply vector to the NMI handler via a software interrupt. 2669 * However, this must be done before maskable interrupts are enabled 2670 * otherwise the "iret" issued by an interrupt handler will incorrectly 2671 * clear NMI blocking. 
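 * Vectoring via "int $2" runs the host NMI handler through IDT entry 2
 * exactly as a real NMI would, and it is that handler's own "iret" which
 * finally lifts the NMI blocking established by the VM-exit.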
2672 */ 2673 static __inline void 2674 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2675 { 2676 uint32_t intr_info; 2677 2678 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2679 2680 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2681 return; 2682 2683 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2684 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2685 ("VM exit interruption info invalid: %#x", intr_info)); 2686 2687 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2688 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2689 "to NMI has invalid vector: %#x", intr_info)); 2690 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2691 __asm __volatile("int $2"); 2692 } 2693 } 2694 2695 static __inline void 2696 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2697 { 2698 register_t rflags; 2699 2700 /* Save host control debug registers. */ 2701 vmxctx->host_dr7 = rdr7(); 2702 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2703 2704 /* 2705 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2706 * exceptions in the host based on the guest DRx values. The 2707 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2708 */ 2709 load_dr7(0); 2710 wrmsr(MSR_DEBUGCTLMSR, 0); 2711 2712 /* 2713 * Disable single stepping the kernel to avoid corrupting the 2714 * guest DR6. A debugger might still be able to corrupt the 2715 * guest DR6 by setting a breakpoint after this point and then 2716 * single stepping. 2717 */ 2718 rflags = read_rflags(); 2719 vmxctx->host_tf = rflags & PSL_T; 2720 write_rflags(rflags & ~PSL_T); 2721 2722 /* Save host debug registers. */ 2723 vmxctx->host_dr0 = rdr0(); 2724 vmxctx->host_dr1 = rdr1(); 2725 vmxctx->host_dr2 = rdr2(); 2726 vmxctx->host_dr3 = rdr3(); 2727 vmxctx->host_dr6 = rdr6(); 2728 2729 /* Restore guest debug registers. */ 2730 load_dr0(vmxctx->guest_dr0); 2731 load_dr1(vmxctx->guest_dr1); 2732 load_dr2(vmxctx->guest_dr2); 2733 load_dr3(vmxctx->guest_dr3); 2734 load_dr6(vmxctx->guest_dr6); 2735 } 2736 2737 static __inline void 2738 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2739 { 2740 2741 /* Save guest debug registers. */ 2742 vmxctx->guest_dr0 = rdr0(); 2743 vmxctx->guest_dr1 = rdr1(); 2744 vmxctx->guest_dr2 = rdr2(); 2745 vmxctx->guest_dr3 = rdr3(); 2746 vmxctx->guest_dr6 = rdr6(); 2747 2748 /* 2749 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2750 * PSL_T last. 
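 * The ordering matters: DR0-DR3 and DR6 are reloaded before DR7 so that
 * breakpoints are re-armed only once they point at host addresses again,
 * and PSL_T is restored last so that a single-step trap is taken only
 * after the host debug state is fully back in place.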
2751 */ 2752 load_dr0(vmxctx->host_dr0); 2753 load_dr1(vmxctx->host_dr1); 2754 load_dr2(vmxctx->host_dr2); 2755 load_dr3(vmxctx->host_dr3); 2756 load_dr6(vmxctx->host_dr6); 2757 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2758 load_dr7(vmxctx->host_dr7); 2759 write_rflags(read_rflags() | vmxctx->host_tf); 2760 } 2761 2762 static int 2763 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2764 struct vm_eventinfo *evinfo) 2765 { 2766 int rc, handled, launched; 2767 struct vmx *vmx; 2768 struct vm *vm; 2769 struct vmxctx *vmxctx; 2770 struct vmcs *vmcs; 2771 struct vm_exit *vmexit; 2772 struct vlapic *vlapic; 2773 uint32_t exit_reason; 2774 2775 vmx = arg; 2776 vm = vmx->vm; 2777 vmcs = &vmx->vmcs[vcpu]; 2778 vmxctx = &vmx->ctx[vcpu]; 2779 vlapic = vm_lapic(vm, vcpu); 2780 vmexit = vm_exitinfo(vm, vcpu); 2781 launched = 0; 2782 2783 KASSERT(vmxctx->pmap == pmap, 2784 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2785 2786 vmx_msr_guest_enter(vmx, vcpu); 2787 2788 VMPTRLD(vmcs); 2789 2790 /* 2791 * XXX 2792 * We do this every time because we may setup the virtual machine 2793 * from a different process than the one that actually runs it. 2794 * 2795 * If the life of a virtual machine was spent entirely in the context 2796 * of a single process we could do this once in vmx_vminit(). 2797 */ 2798 vmcs_write(VMCS_HOST_CR3, rcr3()); 2799 2800 vmcs_write(VMCS_GUEST_RIP, rip); 2801 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2802 do { 2803 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 2804 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 2805 2806 handled = UNHANDLED; 2807 /* 2808 * Interrupts are disabled from this point on until the 2809 * guest starts executing. This is done for the following 2810 * reasons: 2811 * 2812 * If an AST is asserted on this thread after the check below, 2813 * then the IPI_AST notification will not be lost, because it 2814 * will cause a VM exit due to external interrupt as soon as 2815 * the guest state is loaded. 2816 * 2817 * A posted interrupt after 'vmx_inject_interrupts()' will 2818 * not be "lost" because it will be held pending in the host 2819 * APIC because interrupts are disabled. The pending interrupt 2820 * will be recognized as soon as the guest state is loaded. 2821 * 2822 * The same reasoning applies to the IPI generated by 2823 * pmap_invalidate_ept(). 2824 */ 2825 disable_intr(); 2826 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 2827 2828 /* 2829 * Check for vcpu suspension after injecting events because 2830 * vmx_inject_interrupts() can suspend the vcpu due to a 2831 * triple fault. 
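 * (A nested triple fault is escalated by the generic vmm code via
 * vm_suspend(), which is what the vcpu_suspended() check below observes.)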
2832 */ 2833 if (vcpu_suspended(evinfo)) { 2834 enable_intr(); 2835 vm_exit_suspended(vmx->vm, vcpu, rip); 2836 break; 2837 } 2838 2839 if (vcpu_rendezvous_pending(evinfo)) { 2840 enable_intr(); 2841 vm_exit_rendezvous(vmx->vm, vcpu, rip); 2842 break; 2843 } 2844 2845 if (vcpu_reqidle(evinfo)) { 2846 enable_intr(); 2847 vm_exit_reqidle(vmx->vm, vcpu, rip); 2848 break; 2849 } 2850 2851 if (vcpu_should_yield(vm, vcpu)) { 2852 enable_intr(); 2853 vm_exit_astpending(vmx->vm, vcpu, rip); 2854 vmx_astpending_trace(vmx, vcpu, rip); 2855 handled = HANDLED; 2856 break; 2857 } 2858 2859 if (vcpu_debugged(vm, vcpu)) { 2860 enable_intr(); 2861 vm_exit_debug(vmx->vm, vcpu, rip); 2862 break; 2863 } 2864 2865 vmx_run_trace(vmx, vcpu); 2866 vmx_dr_enter_guest(vmxctx); 2867 rc = vmx_enter_guest(vmxctx, vmx, launched); 2868 vmx_dr_leave_guest(vmxctx); 2869 2870 /* Collect some information for VM exit processing */ 2871 vmexit->rip = rip = vmcs_guest_rip(); 2872 vmexit->inst_length = vmexit_instruction_length(); 2873 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2874 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2875 2876 /* Update 'nextrip' */ 2877 vmx->state[vcpu].nextrip = rip; 2878 2879 if (rc == VMX_GUEST_VMEXIT) { 2880 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2881 enable_intr(); 2882 handled = vmx_exit_process(vmx, vcpu, vmexit); 2883 } else { 2884 enable_intr(); 2885 vmx_exit_inst_error(vmxctx, rc, vmexit); 2886 } 2887 launched = 1; 2888 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2889 rip = vmexit->rip; 2890 } while (handled); 2891 2892 /* 2893 * If a VM exit has been handled then the exitcode must be BOGUS 2894 * If a VM exit is not handled then the exitcode must not be BOGUS 2895 */ 2896 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2897 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2898 panic("Mismatch between handled (%d) and exitcode (%d)", 2899 handled, vmexit->exitcode); 2900 } 2901 2902 if (!handled) 2903 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 2904 2905 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 2906 vmexit->exitcode); 2907 2908 VMCLEAR(vmcs); 2909 vmx_msr_guest_exit(vmx, vcpu); 2910 2911 return (0); 2912 } 2913 2914 static void 2915 vmx_vmcleanup(void *arg) 2916 { 2917 int i; 2918 struct vmx *vmx = arg; 2919 2920 if (apic_access_virtualization(vmx, 0)) 2921 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2922 2923 for (i = 0; i < VM_MAXCPU; i++) 2924 vpid_free(vmx->state[i].vpid); 2925 2926 free(vmx, M_VMX); 2927 2928 return; 2929 } 2930 2931 static register_t * 2932 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 2933 { 2934 2935 switch (reg) { 2936 case VM_REG_GUEST_RAX: 2937 return (&vmxctx->guest_rax); 2938 case VM_REG_GUEST_RBX: 2939 return (&vmxctx->guest_rbx); 2940 case VM_REG_GUEST_RCX: 2941 return (&vmxctx->guest_rcx); 2942 case VM_REG_GUEST_RDX: 2943 return (&vmxctx->guest_rdx); 2944 case VM_REG_GUEST_RSI: 2945 return (&vmxctx->guest_rsi); 2946 case VM_REG_GUEST_RDI: 2947 return (&vmxctx->guest_rdi); 2948 case VM_REG_GUEST_RBP: 2949 return (&vmxctx->guest_rbp); 2950 case VM_REG_GUEST_R8: 2951 return (&vmxctx->guest_r8); 2952 case VM_REG_GUEST_R9: 2953 return (&vmxctx->guest_r9); 2954 case VM_REG_GUEST_R10: 2955 return (&vmxctx->guest_r10); 2956 case VM_REG_GUEST_R11: 2957 return (&vmxctx->guest_r11); 2958 case VM_REG_GUEST_R12: 2959 return (&vmxctx->guest_r12); 2960 case VM_REG_GUEST_R13: 2961 return (&vmxctx->guest_r13); 2962 case VM_REG_GUEST_R14: 2963 return (&vmxctx->guest_r14); 2964 case 
VM_REG_GUEST_R15: 2965 return (&vmxctx->guest_r15); 2966 case VM_REG_GUEST_CR2: 2967 return (&vmxctx->guest_cr2); 2968 case VM_REG_GUEST_DR0: 2969 return (&vmxctx->guest_dr0); 2970 case VM_REG_GUEST_DR1: 2971 return (&vmxctx->guest_dr1); 2972 case VM_REG_GUEST_DR2: 2973 return (&vmxctx->guest_dr2); 2974 case VM_REG_GUEST_DR3: 2975 return (&vmxctx->guest_dr3); 2976 case VM_REG_GUEST_DR6: 2977 return (&vmxctx->guest_dr6); 2978 default: 2979 break; 2980 } 2981 return (NULL); 2982 } 2983 2984 static int 2985 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 2986 { 2987 register_t *regp; 2988 2989 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 2990 *retval = *regp; 2991 return (0); 2992 } else 2993 return (EINVAL); 2994 } 2995 2996 static int 2997 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 2998 { 2999 register_t *regp; 3000 3001 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3002 *regp = val; 3003 return (0); 3004 } else 3005 return (EINVAL); 3006 } 3007 3008 static int 3009 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3010 { 3011 uint64_t gi; 3012 int error; 3013 3014 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3015 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3016 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3017 return (error); 3018 } 3019 3020 static int 3021 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3022 { 3023 struct vmcs *vmcs; 3024 uint64_t gi; 3025 int error, ident; 3026 3027 /* 3028 * Forcing the vcpu into an interrupt shadow is not supported. 3029 */ 3030 if (val) { 3031 error = EINVAL; 3032 goto done; 3033 } 3034 3035 vmcs = &vmx->vmcs[vcpu]; 3036 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3037 error = vmcs_getreg(vmcs, running, ident, &gi); 3038 if (error == 0) { 3039 gi &= ~HWINTR_BLOCKING; 3040 error = vmcs_setreg(vmcs, running, ident, gi); 3041 } 3042 done: 3043 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3044 error ? 
"failed" : "succeeded"); 3045 return (error); 3046 } 3047 3048 static int 3049 vmx_shadow_reg(int reg) 3050 { 3051 int shreg; 3052 3053 shreg = -1; 3054 3055 switch (reg) { 3056 case VM_REG_GUEST_CR0: 3057 shreg = VMCS_CR0_SHADOW; 3058 break; 3059 case VM_REG_GUEST_CR4: 3060 shreg = VMCS_CR4_SHADOW; 3061 break; 3062 default: 3063 break; 3064 } 3065 3066 return (shreg); 3067 } 3068 3069 static int 3070 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3071 { 3072 int running, hostcpu; 3073 struct vmx *vmx = arg; 3074 3075 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3076 if (running && hostcpu != curcpu) 3077 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3078 3079 if (reg == VM_REG_GUEST_INTR_SHADOW) 3080 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3081 3082 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3083 return (0); 3084 3085 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3086 } 3087 3088 static int 3089 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3090 { 3091 int error, hostcpu, running, shadow; 3092 uint64_t ctls; 3093 pmap_t pmap; 3094 struct vmx *vmx = arg; 3095 3096 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3097 if (running && hostcpu != curcpu) 3098 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3099 3100 if (reg == VM_REG_GUEST_INTR_SHADOW) 3101 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3102 3103 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3104 return (0); 3105 3106 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3107 3108 if (error == 0) { 3109 /* 3110 * If the "load EFER" VM-entry control is 1 then the 3111 * value of EFER.LMA must be identical to "IA-32e mode guest" 3112 * bit in the VM-entry control. 3113 */ 3114 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3115 (reg == VM_REG_GUEST_EFER)) { 3116 vmcs_getreg(&vmx->vmcs[vcpu], running, 3117 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3118 if (val & EFER_LMA) 3119 ctls |= VM_ENTRY_GUEST_LMA; 3120 else 3121 ctls &= ~VM_ENTRY_GUEST_LMA; 3122 vmcs_setreg(&vmx->vmcs[vcpu], running, 3123 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3124 } 3125 3126 shadow = vmx_shadow_reg(reg); 3127 if (shadow > 0) { 3128 /* 3129 * Store the unmodified value in the shadow 3130 */ 3131 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3132 VMCS_IDENT(shadow), val); 3133 } 3134 3135 if (reg == VM_REG_GUEST_CR3) { 3136 /* 3137 * Invalidate the guest vcpu's TLB mappings to emulate 3138 * the behavior of updating %cr3. 3139 * 3140 * XXX the processor retains global mappings when %cr3 3141 * is updated but vmx_invvpid() does not. 
3142 */ 3143 pmap = vmx->ctx[vcpu].pmap; 3144 vmx_invvpid(vmx, vcpu, pmap, running); 3145 } 3146 } 3147 3148 return (error); 3149 } 3150 3151 static int 3152 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3153 { 3154 int hostcpu, running; 3155 struct vmx *vmx = arg; 3156 3157 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3158 if (running && hostcpu != curcpu) 3159 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3160 3161 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3162 } 3163 3164 static int 3165 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3166 { 3167 int hostcpu, running; 3168 struct vmx *vmx = arg; 3169 3170 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3171 if (running && hostcpu != curcpu) 3172 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3173 3174 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3175 } 3176 3177 static int 3178 vmx_getcap(void *arg, int vcpu, int type, int *retval) 3179 { 3180 struct vmx *vmx = arg; 3181 int vcap; 3182 int ret; 3183 3184 ret = ENOENT; 3185 3186 vcap = vmx->cap[vcpu].set; 3187 3188 switch (type) { 3189 case VM_CAP_HALT_EXIT: 3190 if (cap_halt_exit) 3191 ret = 0; 3192 break; 3193 case VM_CAP_PAUSE_EXIT: 3194 if (cap_pause_exit) 3195 ret = 0; 3196 break; 3197 case VM_CAP_MTRAP_EXIT: 3198 if (cap_monitor_trap) 3199 ret = 0; 3200 break; 3201 case VM_CAP_UNRESTRICTED_GUEST: 3202 if (cap_unrestricted_guest) 3203 ret = 0; 3204 break; 3205 case VM_CAP_ENABLE_INVPCID: 3206 if (cap_invpcid) 3207 ret = 0; 3208 break; 3209 default: 3210 break; 3211 } 3212 3213 if (ret == 0) 3214 *retval = (vcap & (1 << type)) ? 1 : 0; 3215 3216 return (ret); 3217 } 3218 3219 static int 3220 vmx_setcap(void *arg, int vcpu, int type, int val) 3221 { 3222 struct vmx *vmx = arg; 3223 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3224 uint32_t baseval; 3225 uint32_t *pptr; 3226 int error; 3227 int flag; 3228 int reg; 3229 int retval; 3230 3231 retval = ENOENT; 3232 pptr = NULL; 3233 3234 switch (type) { 3235 case VM_CAP_HALT_EXIT: 3236 if (cap_halt_exit) { 3237 retval = 0; 3238 pptr = &vmx->cap[vcpu].proc_ctls; 3239 baseval = *pptr; 3240 flag = PROCBASED_HLT_EXITING; 3241 reg = VMCS_PRI_PROC_BASED_CTLS; 3242 } 3243 break; 3244 case VM_CAP_MTRAP_EXIT: 3245 if (cap_monitor_trap) { 3246 retval = 0; 3247 pptr = &vmx->cap[vcpu].proc_ctls; 3248 baseval = *pptr; 3249 flag = PROCBASED_MTF; 3250 reg = VMCS_PRI_PROC_BASED_CTLS; 3251 } 3252 break; 3253 case VM_CAP_PAUSE_EXIT: 3254 if (cap_pause_exit) { 3255 retval = 0; 3256 pptr = &vmx->cap[vcpu].proc_ctls; 3257 baseval = *pptr; 3258 flag = PROCBASED_PAUSE_EXITING; 3259 reg = VMCS_PRI_PROC_BASED_CTLS; 3260 } 3261 break; 3262 case VM_CAP_UNRESTRICTED_GUEST: 3263 if (cap_unrestricted_guest) { 3264 retval = 0; 3265 pptr = &vmx->cap[vcpu].proc_ctls2; 3266 baseval = *pptr; 3267 flag = PROCBASED2_UNRESTRICTED_GUEST; 3268 reg = VMCS_SEC_PROC_BASED_CTLS; 3269 } 3270 break; 3271 case VM_CAP_ENABLE_INVPCID: 3272 if (cap_invpcid) { 3273 retval = 0; 3274 pptr = &vmx->cap[vcpu].proc_ctls2; 3275 baseval = *pptr; 3276 flag = PROCBASED2_ENABLE_INVPCID; 3277 reg = VMCS_SEC_PROC_BASED_CTLS; 3278 } 3279 break; 3280 default: 3281 break; 3282 } 3283 3284 if (retval == 0) { 3285 if (val) { 3286 baseval |= flag; 3287 } else { 3288 baseval &= ~flag; 3289 } 3290 VMPTRLD(vmcs); 3291 error = vmwrite(reg, baseval); 3292 VMCLEAR(vmcs); 3293 3294 if (error) { 3295 retval = error; 3296 } else { 3297 /* 3298 * Update optional stored flags, and record 3299 * setting 3300 */ 
3301 if (pptr != NULL) { 3302 *pptr = baseval; 3303 } 3304 3305 if (val) { 3306 vmx->cap[vcpu].set |= (1 << type); 3307 } else { 3308 vmx->cap[vcpu].set &= ~(1 << type); 3309 } 3310 } 3311 } 3312 3313 return (retval); 3314 } 3315 3316 struct vlapic_vtx { 3317 struct vlapic vlapic; 3318 struct pir_desc *pir_desc; 3319 struct vmx *vmx; 3320 }; 3321 3322 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3323 do { \ 3324 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3325 level ? "level" : "edge", vector); \ 3326 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3327 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3328 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3329 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3330 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 3331 } while (0) 3332 3333 /* 3334 * vlapic->ops handlers that utilize the APICv hardware assist described in 3335 * Chapter 29 of the Intel SDM. 3336 */ 3337 static int 3338 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3339 { 3340 struct vlapic_vtx *vlapic_vtx; 3341 struct pir_desc *pir_desc; 3342 uint64_t mask; 3343 int idx, notify; 3344 3345 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3346 pir_desc = vlapic_vtx->pir_desc; 3347 3348 /* 3349 * Keep track of interrupt requests in the PIR descriptor. This is 3350 * because the virtual APIC page pointed to by the VMCS cannot be 3351 * modified if the vcpu is running. 3352 */ 3353 idx = vector / 64; 3354 mask = 1UL << (vector % 64); 3355 atomic_set_long(&pir_desc->pir[idx], mask); 3356 notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); 3357 3358 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3359 level, "vmx_set_intr_ready"); 3360 return (notify); 3361 } 3362 3363 static int 3364 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3365 { 3366 struct vlapic_vtx *vlapic_vtx; 3367 struct pir_desc *pir_desc; 3368 struct LAPIC *lapic; 3369 uint64_t pending, pirval; 3370 uint32_t ppr, vpr; 3371 int i; 3372 3373 /* 3374 * This function is only expected to be called from the 'HLT' exit 3375 * handler which does not care about the vector that is pending. 3376 */ 3377 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3378 3379 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3380 pir_desc = vlapic_vtx->pir_desc; 3381 3382 pending = atomic_load_acq_long(&pir_desc->pending); 3383 if (!pending) { 3384 /* 3385 * While a virtual interrupt may have already been 3386 * processed the actual delivery maybe pending the 3387 * interruptibility of the guest. Recognize a pending 3388 * interrupt by reevaluating virtual interrupts 3389 * following Section 29.2.1 in the Intel SDM Volume 3. 3390 */ 3391 struct vm_exit *vmexit; 3392 uint8_t rvi, ppr; 3393 3394 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3395 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3396 ("vmx_pending_intr: exitcode not 'HLT'")); 3397 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3398 lapic = vlapic->apic_page; 3399 ppr = lapic->ppr & APIC_TPR_INT; 3400 if (rvi > ppr) { 3401 return (1); 3402 } 3403 3404 return (0); 3405 } 3406 3407 /* 3408 * If there is an interrupt pending then it will be recognized only 3409 * if its priority is greater than the processor priority. 3410 * 3411 * Special case: if the processor priority is zero then any pending 3412 * interrupt will be recognized. 
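 * (A PPR of zero means no interrupt is in service and the TPR is not
 * masking anything; every valid vector lives in a non-zero priority class
 * and therefore outranks it.)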
3413 */ 3414 lapic = vlapic->apic_page; 3415 ppr = lapic->ppr & APIC_TPR_INT; 3416 if (ppr == 0) 3417 return (1); 3418 3419 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3420 lapic->ppr); 3421 3422 for (i = 3; i >= 0; i--) { 3423 pirval = pir_desc->pir[i]; 3424 if (pirval != 0) { 3425 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3426 return (vpr > ppr); 3427 } 3428 } 3429 return (0); 3430 } 3431 3432 static void 3433 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3434 { 3435 3436 panic("vmx_intr_accepted: not expected to be called"); 3437 } 3438 3439 static void 3440 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3441 { 3442 struct vlapic_vtx *vlapic_vtx; 3443 struct vmx *vmx; 3444 struct vmcs *vmcs; 3445 uint64_t mask, val; 3446 3447 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3448 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3449 ("vmx_set_tmr: vcpu cannot be running")); 3450 3451 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3452 vmx = vlapic_vtx->vmx; 3453 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3454 mask = 1UL << (vector % 64); 3455 3456 VMPTRLD(vmcs); 3457 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3458 if (level) 3459 val |= mask; 3460 else 3461 val &= ~mask; 3462 vmcs_write(VMCS_EOI_EXIT(vector), val); 3463 VMCLEAR(vmcs); 3464 } 3465 3466 static void 3467 vmx_enable_x2apic_mode(struct vlapic *vlapic) 3468 { 3469 struct vmx *vmx; 3470 struct vmcs *vmcs; 3471 uint32_t proc_ctls2; 3472 int vcpuid, error; 3473 3474 vcpuid = vlapic->vcpuid; 3475 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3476 vmcs = &vmx->vmcs[vcpuid]; 3477 3478 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 3479 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3480 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3481 3482 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3483 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3484 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 3485 3486 VMPTRLD(vmcs); 3487 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3488 VMCLEAR(vmcs); 3489 3490 if (vlapic->vcpuid == 0) { 3491 /* 3492 * The nested page table mappings are shared by all vcpus 3493 * so unmap the APIC access page just once. 3494 */ 3495 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3496 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3497 __func__, error)); 3498 3499 /* 3500 * The MSR bitmap is shared by all vcpus so modify it only 3501 * once in the context of vcpu 0. 3502 */ 3503 error = vmx_allow_x2apic_msrs(vmx); 3504 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3505 __func__, error)); 3506 } 3507 } 3508 3509 static void 3510 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3511 { 3512 3513 ipi_cpu(hostcpu, pirvec); 3514 } 3515 3516 /* 3517 * Transfer the pending interrupts in the PIR descriptor to the IRR 3518 * in the virtual APIC page. 
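 * This is the software analogue of what the CPU itself does when it
 * processes a posted-interrupt notification in VMX non-root operation.
 * It runs with interrupts disabled immediately before VM entry and drains
 * the 256-bit PIR bitmap (pir[0..3]) along with the "pending" notification
 * flag using atomics, since other CPUs may still be posting new vectors
 * concurrently.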
3519 */ 3520 static void 3521 vmx_inject_pir(struct vlapic *vlapic) 3522 { 3523 struct vlapic_vtx *vlapic_vtx; 3524 struct pir_desc *pir_desc; 3525 struct LAPIC *lapic; 3526 uint64_t val, pirval; 3527 int rvi, pirbase = -1; 3528 uint16_t intr_status_old, intr_status_new; 3529 3530 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3531 pir_desc = vlapic_vtx->pir_desc; 3532 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3533 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3534 "no posted interrupt pending"); 3535 return; 3536 } 3537 3538 pirval = 0; 3539 pirbase = -1; 3540 lapic = vlapic->apic_page; 3541 3542 val = atomic_readandclear_long(&pir_desc->pir[0]); 3543 if (val != 0) { 3544 lapic->irr0 |= val; 3545 lapic->irr1 |= val >> 32; 3546 pirbase = 0; 3547 pirval = val; 3548 } 3549 3550 val = atomic_readandclear_long(&pir_desc->pir[1]); 3551 if (val != 0) { 3552 lapic->irr2 |= val; 3553 lapic->irr3 |= val >> 32; 3554 pirbase = 64; 3555 pirval = val; 3556 } 3557 3558 val = atomic_readandclear_long(&pir_desc->pir[2]); 3559 if (val != 0) { 3560 lapic->irr4 |= val; 3561 lapic->irr5 |= val >> 32; 3562 pirbase = 128; 3563 pirval = val; 3564 } 3565 3566 val = atomic_readandclear_long(&pir_desc->pir[3]); 3567 if (val != 0) { 3568 lapic->irr6 |= val; 3569 lapic->irr7 |= val >> 32; 3570 pirbase = 192; 3571 pirval = val; 3572 } 3573 3574 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 3575 3576 /* 3577 * Update RVI so the processor can evaluate pending virtual 3578 * interrupts on VM-entry. 3579 * 3580 * It is possible for pirval to be 0 here, even though the 3581 * pending bit has been set. The scenario is: 3582 * CPU-Y is sending a posted interrupt to CPU-X, which 3583 * is running a guest and processing posted interrupts in h/w. 3584 * CPU-X will eventually exit and the state seen in s/w is 3585 * the pending bit set, but no PIR bits set. 
3586 * 3587 * CPU-X CPU-Y 3588 * (vm running) (host running) 3589 * rx posted interrupt 3590 * CLEAR pending bit 3591 * SET PIR bit 3592 * READ/CLEAR PIR bits 3593 * SET pending bit 3594 * (vm exit) 3595 * pending bit set, PIR 0 3596 */ 3597 if (pirval != 0) { 3598 rvi = pirbase + flsl(pirval) - 1; 3599 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 3600 intr_status_new = (intr_status_old & 0xFF00) | rvi; 3601 if (intr_status_new > intr_status_old) { 3602 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 3603 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3604 "guest_intr_status changed from 0x%04x to 0x%04x", 3605 intr_status_old, intr_status_new); 3606 } 3607 } 3608 } 3609 3610 static struct vlapic * 3611 vmx_vlapic_init(void *arg, int vcpuid) 3612 { 3613 struct vmx *vmx; 3614 struct vlapic *vlapic; 3615 struct vlapic_vtx *vlapic_vtx; 3616 3617 vmx = arg; 3618 3619 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 3620 vlapic->vm = vmx->vm; 3621 vlapic->vcpuid = vcpuid; 3622 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 3623 3624 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3625 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 3626 vlapic_vtx->vmx = vmx; 3627 3628 if (virtual_interrupt_delivery) { 3629 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 3630 vlapic->ops.pending_intr = vmx_pending_intr; 3631 vlapic->ops.intr_accepted = vmx_intr_accepted; 3632 vlapic->ops.set_tmr = vmx_set_tmr; 3633 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 3634 } 3635 3636 if (posted_interrupts) 3637 vlapic->ops.post_intr = vmx_post_intr; 3638 3639 vlapic_init(vlapic); 3640 3641 return (vlapic); 3642 } 3643 3644 static void 3645 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 3646 { 3647 3648 vlapic_cleanup(vlapic); 3649 free(vlapic, M_VLAPIC); 3650 } 3651 3652 struct vmm_ops vmm_ops_intel = { 3653 vmx_init, 3654 vmx_cleanup, 3655 vmx_restore, 3656 vmx_vminit, 3657 vmx_run, 3658 vmx_vmcleanup, 3659 vmx_getreg, 3660 vmx_setreg, 3661 vmx_getdesc, 3662 vmx_setdesc, 3663 vmx_getcap, 3664 vmx_setcap, 3665 ept_vmspace_alloc, 3666 ept_vmspace_free, 3667 vmx_vlapic_init, 3668 vmx_vlapic_cleanup, 3669 }; 3670
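/*
 * Note that 'vmm_ops_intel' above relies on positional initialization, so
 * its entries must remain in the same order as the function pointers
 * declared by 'struct vmm_ops' in <machine/vmm.h>.
 *
 * A minimal sketch of the order-independent C99 alternative, assuming the
 * structure exposes member names such as 'init' and 'cleanup' (the actual
 * field names are whatever 'struct vmm_ops' declares):
 *
 *	struct vmm_ops vmm_ops_intel = {
 *		.init		= vmx_init,
 *		.cleanup	= vmx_cleanup,
 *		...
 *	};
 */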