/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_ipi.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING |		\
	PINBASED_NMI_EXITING |			\
	PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING |		\
	PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS |		\
	PROCBASED_IO_EXITING |			\
	PROCBASED_MSR_BITMAPS |			\
	PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING |		\
	PROCBASED_CR3_STORE_EXITING |		\
	PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT		\
	(VM_EXIT_HOST_LMA |			\
	VM_EXIT_SAVE_EFER |			\
	VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |	\
	VM_EXIT_ACKNOWLEDGE_INTERRUPT |		\
	VM_EXIT_SAVE_PAT |			\
	VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |	\
	VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS |		\
	VM_ENTRY_INTO_SMM |			\
	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
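/*
 * Each "one setting" above lists control bits that must be set and each
 * "zero setting" lists bits that must be clear.  vmx_init() passes these
 * pairs to vmx_set_ctlreg(), which consults the corresponding MSR_VMX_*
 * capability MSR and fails if the processor cannot provide the requested
 * settings, e.g.:
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 *	    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING,
 *	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 */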

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	guest_msr_ro(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)

#define	HANDLED		1
#define	UNHANDLED	0

static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
	&cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
	&cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
	&cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
	&cr4_zeros_mask, 0, NULL);

static int vmx_no_patmsr;

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
	&vmx_initialized, 0, "Intel VMX initialized");

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
static int cap_invpcid;

static int virtual_interrupt_delivery;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
	&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");

static int posted_interrupts;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
	&posted_interrupts, 0, "APICv posted interrupt support");

static int pirvec;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
	&pirvec, 0, "APICv posted interrupt vector");

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
	&vpid_alloc_failed, 0, NULL);

/*
 * Use the last page below 4GB as the APIC access address. This address is
 * occupied by the boot firmware so it is guaranteed that it will not conflict
 * with a page in system memory.
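 *
 * vmx_vminit() maps the guest's default APIC base (DEFAULT_APIC_BASE) to
 * this host physical address and programs it as the VMCS "APIC-access
 * address", so guest accesses to the APIC page cause APIC-access VM exits
 * instead of reaching memory.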
182 */ 183 #define APIC_ACCESS_ADDRESS 0xFFFFF000 184 185 static void vmx_inject_pir(struct vlapic *vlapic); 186 187 #ifdef KTR 188 static const char * 189 exit_reason_to_str(int reason) 190 { 191 static char reasonbuf[32]; 192 193 switch (reason) { 194 case EXIT_REASON_EXCEPTION: 195 return "exception"; 196 case EXIT_REASON_EXT_INTR: 197 return "extint"; 198 case EXIT_REASON_TRIPLE_FAULT: 199 return "triplefault"; 200 case EXIT_REASON_INIT: 201 return "init"; 202 case EXIT_REASON_SIPI: 203 return "sipi"; 204 case EXIT_REASON_IO_SMI: 205 return "iosmi"; 206 case EXIT_REASON_SMI: 207 return "smi"; 208 case EXIT_REASON_INTR_WINDOW: 209 return "intrwindow"; 210 case EXIT_REASON_NMI_WINDOW: 211 return "nmiwindow"; 212 case EXIT_REASON_TASK_SWITCH: 213 return "taskswitch"; 214 case EXIT_REASON_CPUID: 215 return "cpuid"; 216 case EXIT_REASON_GETSEC: 217 return "getsec"; 218 case EXIT_REASON_HLT: 219 return "hlt"; 220 case EXIT_REASON_INVD: 221 return "invd"; 222 case EXIT_REASON_INVLPG: 223 return "invlpg"; 224 case EXIT_REASON_RDPMC: 225 return "rdpmc"; 226 case EXIT_REASON_RDTSC: 227 return "rdtsc"; 228 case EXIT_REASON_RSM: 229 return "rsm"; 230 case EXIT_REASON_VMCALL: 231 return "vmcall"; 232 case EXIT_REASON_VMCLEAR: 233 return "vmclear"; 234 case EXIT_REASON_VMLAUNCH: 235 return "vmlaunch"; 236 case EXIT_REASON_VMPTRLD: 237 return "vmptrld"; 238 case EXIT_REASON_VMPTRST: 239 return "vmptrst"; 240 case EXIT_REASON_VMREAD: 241 return "vmread"; 242 case EXIT_REASON_VMRESUME: 243 return "vmresume"; 244 case EXIT_REASON_VMWRITE: 245 return "vmwrite"; 246 case EXIT_REASON_VMXOFF: 247 return "vmxoff"; 248 case EXIT_REASON_VMXON: 249 return "vmxon"; 250 case EXIT_REASON_CR_ACCESS: 251 return "craccess"; 252 case EXIT_REASON_DR_ACCESS: 253 return "draccess"; 254 case EXIT_REASON_INOUT: 255 return "inout"; 256 case EXIT_REASON_RDMSR: 257 return "rdmsr"; 258 case EXIT_REASON_WRMSR: 259 return "wrmsr"; 260 case EXIT_REASON_INVAL_VMCS: 261 return "invalvmcs"; 262 case EXIT_REASON_INVAL_MSR: 263 return "invalmsr"; 264 case EXIT_REASON_MWAIT: 265 return "mwait"; 266 case EXIT_REASON_MTF: 267 return "mtf"; 268 case EXIT_REASON_MONITOR: 269 return "monitor"; 270 case EXIT_REASON_PAUSE: 271 return "pause"; 272 case EXIT_REASON_MCE: 273 return "mce"; 274 case EXIT_REASON_TPR: 275 return "tpr"; 276 case EXIT_REASON_APIC_ACCESS: 277 return "apic-access"; 278 case EXIT_REASON_GDTR_IDTR: 279 return "gdtridtr"; 280 case EXIT_REASON_LDTR_TR: 281 return "ldtrtr"; 282 case EXIT_REASON_EPT_FAULT: 283 return "eptfault"; 284 case EXIT_REASON_EPT_MISCONFIG: 285 return "eptmisconfig"; 286 case EXIT_REASON_INVEPT: 287 return "invept"; 288 case EXIT_REASON_RDTSCP: 289 return "rdtscp"; 290 case EXIT_REASON_VMX_PREEMPT: 291 return "vmxpreempt"; 292 case EXIT_REASON_INVVPID: 293 return "invvpid"; 294 case EXIT_REASON_WBINVD: 295 return "wbinvd"; 296 case EXIT_REASON_XSETBV: 297 return "xsetbv"; 298 case EXIT_REASON_APIC_WRITE: 299 return "apic-write"; 300 default: 301 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 302 return (reasonbuf); 303 } 304 } 305 #endif /* KTR */ 306 307 static int 308 vmx_allow_x2apic_msrs(struct vmx *vmx) 309 { 310 int i, error; 311 312 error = 0; 313 314 /* 315 * Allow readonly access to the following x2APIC MSRs from the guest. 
 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
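	 * (vpid_alloc() above falls back to exactly this [1,VM_MAXCPU] range
	 * when alloc_unr() cannot provide enough unique IDs.)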
445 * 446 * The remaining VPIDs are managed by the unit number allocator. 447 */ 448 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 449 } 450 451 static void 452 msr_save_area_init(struct msr_entry *g_area, int *g_count) 453 { 454 int cnt; 455 456 static struct msr_entry guest_msrs[] = { 457 { MSR_KGSBASE, 0, 0 }, 458 }; 459 460 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 461 if (cnt > GUEST_MSR_MAX_ENTRIES) 462 panic("guest msr save area overrun"); 463 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 464 *g_count = cnt; 465 } 466 467 static void 468 vmx_disable(void *arg __unused) 469 { 470 struct invvpid_desc invvpid_desc = { 0 }; 471 struct invept_desc invept_desc = { 0 }; 472 473 if (vmxon_enabled[curcpu]) { 474 /* 475 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 476 * 477 * VMXON or VMXOFF are not required to invalidate any TLB 478 * caching structures. This prevents potential retention of 479 * cached information in the TLB between distinct VMX episodes. 480 */ 481 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 482 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 483 vmxoff(); 484 } 485 load_cr4(rcr4() & ~CR4_VMXE); 486 } 487 488 static int 489 vmx_cleanup(void) 490 { 491 492 if (pirvec != 0) 493 vmm_ipi_free(pirvec); 494 495 if (vpid_unr != NULL) { 496 delete_unrhdr(vpid_unr); 497 vpid_unr = NULL; 498 } 499 500 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 501 502 return (0); 503 } 504 505 static void 506 vmx_enable(void *arg __unused) 507 { 508 int error; 509 510 load_cr4(rcr4() | CR4_VMXE); 511 512 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 513 error = vmxon(vmxon_region[curcpu]); 514 if (error == 0) 515 vmxon_enabled[curcpu] = 1; 516 } 517 518 static void 519 vmx_restore(void) 520 { 521 522 if (vmxon_enabled[curcpu]) 523 vmxon(vmxon_region[curcpu]); 524 } 525 526 static int 527 vmx_init(int ipinum) 528 { 529 int error, use_tpr_shadow; 530 uint64_t fixed0, fixed1, feature_control; 531 uint32_t tmp, procbased2_vid_bits; 532 533 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 534 if (!(cpu_feature2 & CPUID2_VMX)) { 535 printf("vmx_init: processor does not support VMX operation\n"); 536 return (ENXIO); 537 } 538 539 /* 540 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 541 * are set (bits 0 and 2 respectively). 
542 */ 543 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 544 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 545 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 546 printf("vmx_init: VMX operation disabled by BIOS\n"); 547 return (ENXIO); 548 } 549 550 /* Check support for primary processor-based VM-execution controls */ 551 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 552 MSR_VMX_TRUE_PROCBASED_CTLS, 553 PROCBASED_CTLS_ONE_SETTING, 554 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 555 if (error) { 556 printf("vmx_init: processor does not support desired primary " 557 "processor-based controls\n"); 558 return (error); 559 } 560 561 /* Clear the processor-based ctl bits that are set on demand */ 562 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 563 564 /* Check support for secondary processor-based VM-execution controls */ 565 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 566 MSR_VMX_PROCBASED_CTLS2, 567 PROCBASED_CTLS2_ONE_SETTING, 568 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 569 if (error) { 570 printf("vmx_init: processor does not support desired secondary " 571 "processor-based controls\n"); 572 return (error); 573 } 574 575 /* Check support for VPID */ 576 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 577 PROCBASED2_ENABLE_VPID, 0, &tmp); 578 if (error == 0) 579 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 580 581 /* Check support for pin-based VM-execution controls */ 582 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 583 MSR_VMX_TRUE_PINBASED_CTLS, 584 PINBASED_CTLS_ONE_SETTING, 585 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 586 if (error) { 587 printf("vmx_init: processor does not support desired " 588 "pin-based controls\n"); 589 return (error); 590 } 591 592 /* Check support for VM-exit controls */ 593 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 594 VM_EXIT_CTLS_ONE_SETTING, 595 VM_EXIT_CTLS_ZERO_SETTING, 596 &exit_ctls); 597 if (error) { 598 /* Try again without the PAT MSR bits */ 599 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 600 MSR_VMX_TRUE_EXIT_CTLS, 601 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 602 VM_EXIT_CTLS_ZERO_SETTING, 603 &exit_ctls); 604 if (error) { 605 printf("vmx_init: processor does not support desired " 606 "exit controls\n"); 607 return (error); 608 } else { 609 if (bootverbose) 610 printf("vmm: PAT MSR access not supported\n"); 611 guest_msr_valid(MSR_PAT); 612 vmx_no_patmsr = 1; 613 } 614 } 615 616 /* Check support for VM-entry controls */ 617 if (!vmx_no_patmsr) { 618 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 619 MSR_VMX_TRUE_ENTRY_CTLS, 620 VM_ENTRY_CTLS_ONE_SETTING, 621 VM_ENTRY_CTLS_ZERO_SETTING, 622 &entry_ctls); 623 } else { 624 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 625 MSR_VMX_TRUE_ENTRY_CTLS, 626 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 627 VM_ENTRY_CTLS_ZERO_SETTING, 628 &entry_ctls); 629 } 630 631 if (error) { 632 printf("vmx_init: processor does not support desired " 633 "entry controls\n"); 634 return (error); 635 } 636 637 /* 638 * Check support for optional features by testing them 639 * as individual bits 640 */ 641 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 642 MSR_VMX_TRUE_PROCBASED_CTLS, 643 PROCBASED_HLT_EXITING, 0, 644 &tmp) == 0); 645 646 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 647 MSR_VMX_PROCBASED_CTLS, 648 PROCBASED_MTF, 0, 649 &tmp) == 0); 650 651 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 652 MSR_VMX_TRUE_PROCBASED_CTLS, 653 PROCBASED_PAUSE_EXITING, 0, 654 &tmp) == 0); 655 656 cap_unrestricted_guest = 
	    (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_UNRESTRICTED_GUEST, 0,
	    &tmp) == 0);

	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
	    &tmp) == 0);

	/*
	 * Check support for virtual interrupt delivery.
	 */
	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);

	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
	    &tmp) == 0);

	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    procbased2_vid_bits, 0, &tmp);
	if (error == 0 && use_tpr_shadow) {
		virtual_interrupt_delivery = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
		    &virtual_interrupt_delivery);
	}

	if (virtual_interrupt_delivery) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls2 |= procbased2_vid_bits;
		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;

		/*
		 * Check for Posted Interrupts only if Virtual Interrupt
		 * Delivery is enabled.
		 */
		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
		    &tmp);
		if (error == 0) {
			pirvec = vmm_ipi_alloc();
			if (pirvec == 0) {
				if (bootverbose) {
					printf("vmx_init: unable to allocate "
					    "posted interrupt vector\n");
				}
			} else {
				posted_interrupts = 1;
				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
				    &posted_interrupts);
			}
		}
	}

	if (posted_interrupts)
		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;

	/* Initialize EPT */
	error = ept_init(ipinum);
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	vpid_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

static void
vmx_trigger_hostintr(int vector)
{
	uintptr_t func;
	struct gate_descriptor *gd;

	gd = &idt[vector];

	KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
	    "invalid vector %d", vector));
	KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
	    vector));
	KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
	    "has invalid type %d", vector, gd->gd_type));
	KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
	    "has invalid dpl %d", vector, gd->gd_dpl));
	KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
	    "for vector %d has invalid selector %d", vector, gd->gd_selector));
	KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
	    "IST %d", vector, gd->gd_ist));

	func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
	vmx_call_isr(func);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
	uint16_t vpid[VM_MAXCPU];
	int i, error, guest_msr_count;
	struct vmx *vmx;
	struct vmcs *vmcs;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		    PAGE_SIZE);
	}
	vmx->vm = vm;

	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vmx->eptp);

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
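	 * (Direct access means the corresponding read/write bits are cleared
	 * in the MSR bitmap via guest_msr_rw() below, so these MSR accesses
	 * do not cause VM exits.)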
849 * 850 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 851 * how they are saved/restored so can be directly accessed by the 852 * guest. 853 * 854 * Guest KGSBASE is saved and restored in the guest MSR save area. 855 * Host KGSBASE is restored before returning to userland from the pcb. 856 * There will be a window of time when we are executing in the host 857 * kernel context with a value of KGSBASE from the guest. This is ok 858 * because the value of KGSBASE is inconsequential in kernel context. 859 * 860 * MSR_EFER is saved and restored in the guest VMCS area on a 861 * VM exit and entry respectively. It is also restored from the 862 * host VMCS area on a VM exit. 863 */ 864 if (guest_msr_rw(vmx, MSR_GSBASE) || 865 guest_msr_rw(vmx, MSR_FSBASE) || 866 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 867 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 868 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 869 guest_msr_rw(vmx, MSR_KGSBASE) || 870 guest_msr_rw(vmx, MSR_EFER)) 871 panic("vmx_vminit: error setting guest msr access"); 872 873 /* 874 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 875 * and entry respectively. It is also restored from the host VMCS 876 * area on a VM exit. However, if running on a system with no 877 * MSR_PAT save/restore support, leave access disabled so accesses 878 * will be trapped. 879 */ 880 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 881 panic("vmx_vminit: error setting guest pat msr access"); 882 883 vpid_alloc(vpid, VM_MAXCPU); 884 885 if (virtual_interrupt_delivery) { 886 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 887 APIC_ACCESS_ADDRESS); 888 /* XXX this should really return an error to the caller */ 889 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 890 } 891 892 for (i = 0; i < VM_MAXCPU; i++) { 893 vmcs = &vmx->vmcs[i]; 894 vmcs->identifier = vmx_revision(); 895 error = vmclear(vmcs); 896 if (error != 0) { 897 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 898 error, i); 899 } 900 901 error = vmcs_init(vmcs); 902 KASSERT(error == 0, ("vmcs_init error %d", error)); 903 904 VMPTRLD(vmcs); 905 error = 0; 906 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 907 error += vmwrite(VMCS_EPTP, vmx->eptp); 908 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 909 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 910 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 911 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 912 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 913 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 914 error += vmwrite(VMCS_VPID, vpid[i]); 915 if (virtual_interrupt_delivery) { 916 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 917 error += vmwrite(VMCS_VIRTUAL_APIC, 918 vtophys(&vmx->apic_page[i])); 919 error += vmwrite(VMCS_EOI_EXIT0, 0); 920 error += vmwrite(VMCS_EOI_EXIT1, 0); 921 error += vmwrite(VMCS_EOI_EXIT2, 0); 922 error += vmwrite(VMCS_EOI_EXIT3, 0); 923 } 924 if (posted_interrupts) { 925 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 926 error += vmwrite(VMCS_PIR_DESC, 927 vtophys(&vmx->pir_desc[i])); 928 } 929 VMCLEAR(vmcs); 930 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 931 932 vmx->cap[i].set = 0; 933 vmx->cap[i].proc_ctls = procbased_ctls; 934 vmx->cap[i].proc_ctls2 = procbased_ctls2; 935 936 vmx->state[i].lastcpu = -1; 937 vmx->state[i].vpid = vpid[i]; 938 vmx->state[i].user_event.intr_info = 0; 939 940 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 941 942 error = vmcs_set_msr_save(vmcs, 
		    vtophys(vmx->guest_msrs[i]), guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(vmcs, 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);

		vmx->ctx[i].pmap = pmap;
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
	    (uint32_t*)(&vmxctx->guest_rax),
	    (uint32_t*)(&vmxctx->guest_rbx),
	    (uint32_t*)(&vmxctx->guest_rcx),
	    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
    int handled)
{
#ifdef KTR
	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
	    handled ? "handled" : "unhandled",
	    exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");

static void
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
{
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc;

	vmxstate = &vmx->state[vcpu];
	if (vmxstate->lastcpu == curcpu)
		return;

	vmxstate->lastcpu = curcpu;

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());

	/*
	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (vmxstate->vpid != 0) {
		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
			invvpid_desc._res1 = 0;
			invvpid_desc._res2 = 0;
			invvpid_desc.vpid = vmxstate->vpid;
			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
		} else {
			/*
			 * The invvpid can be skipped if an invept is going to
			 * be performed before entering the guest. The invept
			 * will invalidate combined mappings tagged with
			 * 'vmx->eptp' for all vpids.
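			 * (This relies on the EPT generation check above:
			 * when pm_eptgen does not match eptgen[curcpu] the
			 * stale combined mappings are flushed by that invept
			 * rather than by invvpid.)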
1056 */ 1057 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1058 } 1059 } 1060 } 1061 1062 /* 1063 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1064 */ 1065 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1066 1067 static void __inline 1068 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1069 { 1070 1071 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1072 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1073 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1074 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1075 } 1076 } 1077 1078 static void __inline 1079 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1080 { 1081 1082 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1083 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1084 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1085 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1086 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1087 } 1088 1089 static void __inline 1090 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1091 { 1092 1093 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1094 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1095 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1096 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1097 } 1098 } 1099 1100 static void __inline 1101 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1102 { 1103 1104 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1105 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1106 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1107 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1108 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1109 } 1110 1111 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ 1112 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1113 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1114 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1115 1116 static void 1117 vmx_inject_user_event(struct vmx *vmx, int vcpu) 1118 { 1119 struct vmxevent *user_event; 1120 uint32_t info; 1121 1122 user_event = &vmx->state[vcpu].user_event; 1123 1124 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1125 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_user_event: invalid " 1126 "VM-entry interruption information %#x", info)); 1127 1128 vmcs_write(VMCS_ENTRY_INTR_INFO, user_event->intr_info); 1129 if (user_event->intr_info & VMCS_INTR_DEL_ERRCODE) 1130 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, user_event->error_code); 1131 user_event->intr_info = 0; 1132 } 1133 1134 static void 1135 vmx_inject_exception(struct vmx *vmx, int vcpu, struct vm_exit *vmexit, 1136 int fault, int errvalid, int errcode) 1137 { 1138 uint32_t info; 1139 1140 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1141 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_exception: invalid " 1142 "VM-entry interruption information %#x", info)); 1143 1144 /* 1145 * Although INTR_T_HWEXCEPTION does not advance %rip, vmx_run() 1146 * always advances it, so we clear the instruction length to zero 1147 * explicitly. 
1148 */ 1149 vmexit->inst_length = 0; 1150 info = fault | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID; 1151 if (errvalid) { 1152 info |= VMCS_INTR_DEL_ERRCODE; 1153 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, errcode); 1154 } 1155 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1156 1157 VCPU_CTR2(vmx->vm, vcpu, "Injecting fault %d (errcode %d)", fault, 1158 errcode); 1159 } 1160 1161 /* All GP# faults VMM injects use an error code of 0. */ 1162 static void 1163 vmx_inject_gp(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1164 { 1165 1166 vmx_inject_exception(vmx, vcpu, vmexit, IDT_GP, 1, 0); 1167 } 1168 1169 static void 1170 vmx_inject_ud(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1171 { 1172 1173 vmx_inject_exception(vmx, vcpu, vmexit, IDT_UD, 0, 0); 1174 } 1175 1176 static void 1177 vmx_inject_nmi(struct vmx *vmx, int vcpu) 1178 { 1179 uint32_t gi, info; 1180 1181 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1182 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1183 "interruptibility-state %#x", gi)); 1184 1185 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1186 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " 1187 "VM-entry interruption information %#x", info)); 1188 1189 /* 1190 * Inject the virtual NMI. The vector must be the NMI IDT entry 1191 * or the VMCS entry check will fail. 1192 */ 1193 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1194 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1195 1196 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1197 1198 /* Clear the request */ 1199 vm_nmi_clear(vmx->vm, vcpu); 1200 } 1201 1202 static void 1203 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) 1204 { 1205 int vector, need_nmi_exiting; 1206 uint64_t rflags; 1207 uint32_t gi, info; 1208 1209 if (vm_nmi_pending(vmx->vm, vcpu)) { 1210 /* 1211 * If there are no conditions blocking NMI injection then 1212 * inject it directly here otherwise enable "NMI window 1213 * exiting" to inject it as soon as we can. 1214 * 1215 * We also check for STI_BLOCKING because some implementations 1216 * don't allow NMI injection in this case. If we are running 1217 * on a processor that doesn't have this restriction it will 1218 * immediately exit and the NMI will be injected in the 1219 * "NMI window exiting" handler. 1220 */ 1221 need_nmi_exiting = 1; 1222 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1223 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1224 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1225 if ((info & VMCS_INTR_VALID) == 0) { 1226 vmx_inject_nmi(vmx, vcpu); 1227 need_nmi_exiting = 0; 1228 } else { 1229 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1230 "due to VM-entry intr info %#x", info); 1231 } 1232 } else { 1233 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1234 "Guest Interruptibility-state %#x", gi); 1235 } 1236 1237 if (need_nmi_exiting) 1238 vmx_set_nmi_window_exiting(vmx, vcpu); 1239 } 1240 1241 /* 1242 * If there is a user injection event pending and there isn't 1243 * an interrupt queued already, inject the user event. 1244 */ 1245 if (vmx->state[vcpu].user_event.intr_info & VMCS_INTR_VALID) { 1246 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1247 if ((info & VMCS_INTR_VALID) == 0) { 1248 vmx_inject_user_event(vmx, vcpu); 1249 } else { 1250 /* 1251 * XXX: Do we need to force an exit so this can 1252 * be injected? 
1253 */ 1254 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject user event " 1255 "due to VM-entry intr info %#x", info); 1256 } 1257 } 1258 1259 if (virtual_interrupt_delivery) { 1260 vmx_inject_pir(vlapic); 1261 return; 1262 } 1263 1264 /* 1265 * If interrupt-window exiting is already in effect then don't bother 1266 * checking for pending interrupts. This is just an optimization and 1267 * not needed for correctness. 1268 */ 1269 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1270 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1271 "pending int_window_exiting"); 1272 return; 1273 } 1274 1275 /* Ask the local apic for a vector to inject */ 1276 if (!vlapic_pending_intr(vlapic, &vector)) 1277 return; 1278 1279 KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector)); 1280 1281 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1282 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1283 if ((rflags & PSL_I) == 0) { 1284 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1285 "rflags %#lx", vector, rflags); 1286 goto cantinject; 1287 } 1288 1289 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1290 if (gi & HWINTR_BLOCKING) { 1291 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1292 "Guest Interruptibility-state %#x", vector, gi); 1293 goto cantinject; 1294 } 1295 1296 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1297 if (info & VMCS_INTR_VALID) { 1298 /* 1299 * This is expected and could happen for multiple reasons: 1300 * - A vectoring VM-entry was aborted due to astpending 1301 * - A VM-exit happened during event injection. 1302 * - An NMI was injected above or after "NMI window exiting" 1303 */ 1304 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1305 "VM-entry intr info %#x", vector, info); 1306 goto cantinject; 1307 } 1308 1309 /* Inject the interrupt */ 1310 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1311 info |= vector; 1312 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1313 1314 /* Update the Local APIC ISR */ 1315 vlapic_intr_accepted(vlapic, vector); 1316 1317 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1318 1319 return; 1320 1321 cantinject: 1322 /* 1323 * Set the Interrupt Window Exiting execution control so we can inject 1324 * the interrupt as soon as blocking condition goes away. 1325 */ 1326 vmx_set_int_window_exiting(vmx, vcpu); 1327 } 1328 1329 /* 1330 * If the Virtual NMIs execution control is '1' then the logical processor 1331 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1332 * the VMCS. An IRET instruction in VMX non-root operation will remove any 1333 * virtual-NMI blocking. 1334 * 1335 * This unblocking occurs even if the IRET causes a fault. In this case the 1336 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
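 * (See the EXIT_REASON_EXCEPTION and EXIT_REASON_EPT_FAULT cases in
 * vmx_exit_process(), which call vmx_restore_nmi_blocking() when the VM exit
 * indicates NMI unblocking due to IRET.)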
 */
static void
vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static void
vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
{
	uint32_t gi;

	VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
	gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}

static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	struct vmxctx *vmxctx;
	uint64_t xcrval;
	const struct xsave_limits *limits;

	vmxctx = &vmx->ctx[vcpu];
	limits = vmm_get_xsave_limits();

	/*
	 * Note that the processor raises a GP# fault on its own if
	 * xsetbv is executed for CPL != 0, so we do not have to
	 * emulate that fault here.
	 */

	/* Only xcr0 is supported. */
	if (vmxctx->guest_rcx != 0) {
		vmx_inject_gp(vmx, vcpu, vmexit);
		return (HANDLED);
	}

	/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
	if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
		vmx_inject_ud(vmx, vcpu, vmexit);
		return (HANDLED);
	}

	xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
	if ((xcrval & ~limits->xcr0_allowed) != 0) {
		vmx_inject_gp(vmx, vcpu, vmexit);
		return (HANDLED);
	}

	if (!(xcrval & XFEATURE_ENABLED_X87)) {
		vmx_inject_gp(vmx, vcpu, vmexit);
		return (HANDLED);
	}

	if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
	    XFEATURE_ENABLED_AVX) {
		vmx_inject_gp(vmx, vcpu, vmexit);
		return (HANDLED);
	}

	/*
	 * This runs "inside" vmrun() with the guest's FPU state, so
	 * modifying xcr0 directly modifies the guest's xcr0, not the
	 * host's.
	 */
	load_xcr(0, xcrval);
	return (HANDLED);
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int cr, vmcs_guest_cr, vmcs_shadow_cr;
	uint64_t crval, regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	regval = 0; /* silence gcc */
	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmcs_write() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
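	/*
	 * Exit qualification layout for control-register accesses (SDM,
	 * "Exit Qualification for Control-Register Accesses"): bits 3:0 hold
	 * the control register number, bits 5:4 the access type (0 = MOV to
	 * CR) and bits 11:8 the source general-purpose register decoded by
	 * the switch below.
	 */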
1435 */ 1436 switch ((exitqual >> 8) & 0xf) { 1437 case 0: 1438 regval = vmxctx->guest_rax; 1439 break; 1440 case 1: 1441 regval = vmxctx->guest_rcx; 1442 break; 1443 case 2: 1444 regval = vmxctx->guest_rdx; 1445 break; 1446 case 3: 1447 regval = vmxctx->guest_rbx; 1448 break; 1449 case 4: 1450 regval = vmcs_read(VMCS_GUEST_RSP); 1451 break; 1452 case 5: 1453 regval = vmxctx->guest_rbp; 1454 break; 1455 case 6: 1456 regval = vmxctx->guest_rsi; 1457 break; 1458 case 7: 1459 regval = vmxctx->guest_rdi; 1460 break; 1461 case 8: 1462 regval = vmxctx->guest_r8; 1463 break; 1464 case 9: 1465 regval = vmxctx->guest_r9; 1466 break; 1467 case 10: 1468 regval = vmxctx->guest_r10; 1469 break; 1470 case 11: 1471 regval = vmxctx->guest_r11; 1472 break; 1473 case 12: 1474 regval = vmxctx->guest_r12; 1475 break; 1476 case 13: 1477 regval = vmxctx->guest_r13; 1478 break; 1479 case 14: 1480 regval = vmxctx->guest_r14; 1481 break; 1482 case 15: 1483 regval = vmxctx->guest_r15; 1484 break; 1485 } 1486 1487 if (cr == 0) { 1488 ones_mask = cr0_ones_mask; 1489 zeros_mask = cr0_zeros_mask; 1490 vmcs_guest_cr = VMCS_GUEST_CR0; 1491 vmcs_shadow_cr = VMCS_CR0_SHADOW; 1492 } else { 1493 ones_mask = cr4_ones_mask; 1494 zeros_mask = cr4_zeros_mask; 1495 vmcs_guest_cr = VMCS_GUEST_CR4; 1496 vmcs_shadow_cr = VMCS_CR4_SHADOW; 1497 } 1498 vmcs_write(vmcs_shadow_cr, regval); 1499 1500 crval = regval | ones_mask; 1501 crval &= ~zeros_mask; 1502 vmcs_write(vmcs_guest_cr, crval); 1503 1504 if (cr == 0 && regval & CR0_PG) { 1505 uint64_t efer, entry_ctls; 1506 1507 /* 1508 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1509 * the "IA-32e mode guest" bit in VM-entry control must be 1510 * equal. 1511 */ 1512 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1513 if (efer & EFER_LME) { 1514 efer |= EFER_LMA; 1515 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1516 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1517 entry_ctls |= VM_ENTRY_GUEST_LMA; 1518 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1519 } 1520 } 1521 1522 return (HANDLED); 1523 } 1524 1525 static enum vie_cpu_mode 1526 vmx_cpu_mode(void) 1527 { 1528 1529 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) 1530 return (CPU_MODE_64BIT); 1531 else 1532 return (CPU_MODE_COMPATIBILITY); 1533 } 1534 1535 static enum vie_paging_mode 1536 vmx_paging_mode(void) 1537 { 1538 1539 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1540 return (PAGING_MODE_FLAT); 1541 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) 1542 return (PAGING_MODE_32); 1543 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) 1544 return (PAGING_MODE_64); 1545 else 1546 return (PAGING_MODE_PAE); 1547 } 1548 1549 static int 1550 ept_fault_type(uint64_t ept_qual) 1551 { 1552 int fault_type; 1553 1554 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1555 fault_type = VM_PROT_WRITE; 1556 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1557 fault_type = VM_PROT_EXECUTE; 1558 else 1559 fault_type= VM_PROT_READ; 1560 1561 return (fault_type); 1562 } 1563 1564 static boolean_t 1565 ept_emulation_fault(uint64_t ept_qual) 1566 { 1567 int read, write; 1568 1569 /* EPT fault on an instruction fetch doesn't make sense here */ 1570 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1571 return (FALSE); 1572 1573 /* EPT fault must be a read fault or a write fault */ 1574 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1575 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 
1 : 0; 1576 if ((read | write) == 0) 1577 return (FALSE); 1578 1579 /* 1580 * The EPT violation must have been caused by accessing a 1581 * guest-physical address that is a translation of a guest-linear 1582 * address. 1583 */ 1584 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1585 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1586 return (FALSE); 1587 } 1588 1589 return (TRUE); 1590 } 1591 1592 static __inline int 1593 apic_access_virtualization(struct vmx *vmx, int vcpuid) 1594 { 1595 uint32_t proc_ctls2; 1596 1597 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1598 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 1599 } 1600 1601 static __inline int 1602 x2apic_virtualization(struct vmx *vmx, int vcpuid) 1603 { 1604 uint32_t proc_ctls2; 1605 1606 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1607 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 1608 } 1609 1610 static int 1611 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 1612 uint64_t qual) 1613 { 1614 int error, handled, offset; 1615 uint32_t *apic_regs, vector; 1616 bool retu; 1617 1618 handled = HANDLED; 1619 offset = APIC_WRITE_OFFSET(qual); 1620 1621 if (!apic_access_virtualization(vmx, vcpuid)) { 1622 /* 1623 * In general there should not be any APIC write VM-exits 1624 * unless APIC-access virtualization is enabled. 1625 * 1626 * However self-IPI virtualization can legitimately trigger 1627 * an APIC-write VM-exit so treat it specially. 1628 */ 1629 if (x2apic_virtualization(vmx, vcpuid) && 1630 offset == APIC_OFFSET_SELF_IPI) { 1631 apic_regs = (uint32_t *)(vlapic->apic_page); 1632 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 1633 vlapic_self_ipi_handler(vlapic, vector); 1634 return (HANDLED); 1635 } else 1636 return (UNHANDLED); 1637 } 1638 1639 switch (offset) { 1640 case APIC_OFFSET_ID: 1641 vlapic_id_write_handler(vlapic); 1642 break; 1643 case APIC_OFFSET_LDR: 1644 vlapic_ldr_write_handler(vlapic); 1645 break; 1646 case APIC_OFFSET_DFR: 1647 vlapic_dfr_write_handler(vlapic); 1648 break; 1649 case APIC_OFFSET_SVR: 1650 vlapic_svr_write_handler(vlapic); 1651 break; 1652 case APIC_OFFSET_ESR: 1653 vlapic_esr_write_handler(vlapic); 1654 break; 1655 case APIC_OFFSET_ICR_LOW: 1656 retu = false; 1657 error = vlapic_icrlo_write_handler(vlapic, &retu); 1658 if (error != 0 || retu) 1659 handled = UNHANDLED; 1660 break; 1661 case APIC_OFFSET_CMCI_LVT: 1662 case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: 1663 vlapic_lvt_write_handler(vlapic, offset); 1664 break; 1665 case APIC_OFFSET_TIMER_ICR: 1666 vlapic_icrtmr_write_handler(vlapic); 1667 break; 1668 case APIC_OFFSET_TIMER_DCR: 1669 vlapic_dcr_write_handler(vlapic); 1670 break; 1671 default: 1672 handled = UNHANDLED; 1673 break; 1674 } 1675 return (handled); 1676 } 1677 1678 static bool 1679 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 1680 { 1681 1682 if (apic_access_virtualization(vmx, vcpuid) && 1683 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 1684 return (true); 1685 else 1686 return (false); 1687 } 1688 1689 static int 1690 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 1691 { 1692 uint64_t qual; 1693 int access_type, offset, allowed; 1694 1695 if (!apic_access_virtualization(vmx, vcpuid)) 1696 return (UNHANDLED); 1697 1698 qual = vmexit->u.vmx.exit_qualification; 1699 access_type = APIC_ACCESS_TYPE(qual); 1700 offset = APIC_ACCESS_OFFSET(qual); 1701 1702 allowed = 0; 1703 if (access_type == 0) { 1704 /* 1705 * Read data access to the following registers is expected. 1706 */ 1707 switch (offset) { 1708 case APIC_OFFSET_APR: 1709 case APIC_OFFSET_PPR: 1710 case APIC_OFFSET_RRR: 1711 case APIC_OFFSET_CMCI_LVT: 1712 case APIC_OFFSET_TIMER_CCR: 1713 allowed = 1; 1714 break; 1715 default: 1716 break; 1717 } 1718 } else if (access_type == 1) { 1719 /* 1720 * Write data access to the following registers is expected. 1721 */ 1722 switch (offset) { 1723 case APIC_OFFSET_VER: 1724 case APIC_OFFSET_APR: 1725 case APIC_OFFSET_PPR: 1726 case APIC_OFFSET_RRR: 1727 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 1728 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 1729 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 1730 case APIC_OFFSET_CMCI_LVT: 1731 case APIC_OFFSET_TIMER_CCR: 1732 allowed = 1; 1733 break; 1734 default: 1735 break; 1736 } 1737 } 1738 1739 if (allowed) { 1740 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1741 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset; 1742 vmexit->u.inst_emul.gla = VIE_INVALID_GLA; 1743 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1744 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode(); 1745 vmexit->u.inst_emul.paging_mode = vmx_paging_mode(); 1746 } 1747 1748 /* 1749 * Regardless of whether the APIC-access is allowed this handler 1750 * always returns UNHANDLED: 1751 * - if the access is allowed then it is handled by emulating the 1752 * instruction that caused the VM-exit (outside the critical section) 1753 * - if the access is not allowed then it will be converted to an 1754 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
1755 */ 1756 return (UNHANDLED); 1757 } 1758 1759 static int 1760 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1761 { 1762 int error, handled; 1763 struct vmxctx *vmxctx; 1764 struct vlapic *vlapic; 1765 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason; 1766 uint64_t qual, gpa; 1767 bool retu; 1768 1769 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 1770 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 1771 1772 handled = UNHANDLED; 1773 vmxctx = &vmx->ctx[vcpu]; 1774 1775 qual = vmexit->u.vmx.exit_qualification; 1776 reason = vmexit->u.vmx.exit_reason; 1777 vmexit->exitcode = VM_EXITCODE_BOGUS; 1778 1779 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 1780 1781 /* 1782 * VM exits that could be triggered during event injection on the 1783 * previous VM entry need to be handled specially by re-injecting 1784 * the event. 1785 * 1786 * See "Information for VM Exits During Event Delivery" in Intel SDM 1787 * for details. 1788 */ 1789 switch (reason) { 1790 case EXIT_REASON_EPT_FAULT: 1791 case EXIT_REASON_EPT_MISCONFIG: 1792 case EXIT_REASON_APIC_ACCESS: 1793 case EXIT_REASON_TASK_SWITCH: 1794 case EXIT_REASON_EXCEPTION: 1795 idtvec_info = vmcs_idt_vectoring_info(); 1796 if (idtvec_info & VMCS_IDT_VEC_VALID) { 1797 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 1798 vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info); 1799 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 1800 idtvec_err = vmcs_idt_vectoring_err(); 1801 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, 1802 idtvec_err); 1803 } 1804 /* 1805 * If 'virtual NMIs' are being used and the VM-exit 1806 * happened while injecting an NMI during the previous 1807 * VM-entry, then clear "blocking by NMI" in the Guest 1808 * Interruptibility-state. 
1809 */ 1810 if ((idtvec_info & VMCS_INTR_T_MASK) == 1811 VMCS_INTR_T_NMI) { 1812 vmx_clear_nmi_blocking(vmx, vcpu); 1813 } 1814 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 1815 } 1816 default: 1817 idtvec_info = 0; 1818 break; 1819 } 1820 1821 switch (reason) { 1822 case EXIT_REASON_CR_ACCESS: 1823 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 1824 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1825 break; 1826 case EXIT_REASON_RDMSR: 1827 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 1828 retu = false; 1829 ecx = vmxctx->guest_rcx; 1830 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); 1831 if (error) { 1832 vmexit->exitcode = VM_EXITCODE_RDMSR; 1833 vmexit->u.msr.code = ecx; 1834 } else if (!retu) { 1835 handled = HANDLED; 1836 } else { 1837 /* Return to userspace with a valid exitcode */ 1838 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1839 ("emulate_wrmsr retu with bogus exitcode")); 1840 } 1841 break; 1842 case EXIT_REASON_WRMSR: 1843 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 1844 retu = false; 1845 eax = vmxctx->guest_rax; 1846 ecx = vmxctx->guest_rcx; 1847 edx = vmxctx->guest_rdx; 1848 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1849 (uint64_t)edx << 32 | eax, &retu); 1850 if (error) { 1851 vmexit->exitcode = VM_EXITCODE_WRMSR; 1852 vmexit->u.msr.code = ecx; 1853 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1854 } else if (!retu) { 1855 handled = HANDLED; 1856 } else { 1857 /* Return to userspace with a valid exitcode */ 1858 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 1859 ("emulate_wrmsr retu with bogus exitcode")); 1860 } 1861 break; 1862 case EXIT_REASON_HLT: 1863 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1864 vmexit->exitcode = VM_EXITCODE_HLT; 1865 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1866 break; 1867 case EXIT_REASON_MTF: 1868 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 1869 vmexit->exitcode = VM_EXITCODE_MTRAP; 1870 break; 1871 case EXIT_REASON_PAUSE: 1872 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 1873 vmexit->exitcode = VM_EXITCODE_PAUSE; 1874 break; 1875 case EXIT_REASON_INTR_WINDOW: 1876 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 1877 vmx_clear_int_window_exiting(vmx, vcpu); 1878 return (1); 1879 case EXIT_REASON_EXT_INTR: 1880 /* 1881 * External interrupts serve only to cause VM exits and allow 1882 * the host interrupt handler to run. 1883 * 1884 * If this external interrupt triggers a virtual interrupt 1885 * to a VM, then that state will be recorded by the 1886 * host interrupt handler in the VM's softc. We will inject 1887 * this virtual interrupt during the subsequent VM enter. 1888 */ 1889 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 1890 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 1891 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 1892 ("VM exit interruption info invalid: %#x", intr_info)); 1893 vmx_trigger_hostintr(intr_info & 0xff); 1894 1895 /* 1896 * This is special. We want to treat this as an 'handled' 1897 * VM-exit but not increment the instruction pointer. 
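		 * (Hence the direct return below, which skips the common
		 * rip-advance done for handled exits at the bottom of
		 * vmx_exit_process().)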
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
		break;
	case EXIT_REASON_MTF:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
		vmx_clear_int_window_exiting(vmx, vcpu);
		return (1);
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */
		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
		    ("VM exit interruption info invalid: %#x", intr_info));
		vmx_trigger_hostintr(intr_info & 0xff);

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		if (vm_nmi_pending(vmx->vm, vcpu))
			vmx_inject_nmi(vmx, vcpu);
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
		return (1);
	case EXIT_REASON_INOUT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
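
	/*
	 * The decode above follows the "Exit Qualification for I/O
	 * Instructions" format: bits 2:0 hold the access size minus one,
	 * bit 3 the direction (1 = in), bit 4 the string-instruction flag,
	 * bit 5 the REP prefix and bits 31:16 the port number.  For
	 * example, a 'rep insw' to port 0x1f0 yields bytes = 2, in = 1,
	 * string = 1, rep = 1 and port = 0x1f0.
	 */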
	case EXIT_REASON_CPUID:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EXCEPTION:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
		    ("VM exit interruption info invalid: %#x", intr_info));

		/*
		 * If the 'virtual NMIs' execution control is 1 and this
		 * VM-exit is due to a fault encountered during the execution
		 * of IRET, then we must restore the state of "virtual-NMI
		 * blocking" before resuming the guest.
		 *
		 * See "Resuming Guest Software after Handling an Exception"
		 * in the Intel SDM.
		 */
		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
		    (intr_info & 0xff) != IDT_DF &&
		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
			vmx_restore_nmi_blocking(vmx, vcpu);

		/*
		 * The NMI has already been handled in vmx_exit_handle_nmi().
		 */
		if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
			return (1);
		break;
	case EXIT_REASON_EPT_FAULT:
		/*
		 * If 'gpa' lies within the address space allocated to
		 * memory then this must be a nested page fault otherwise
		 * this must be an instruction that accesses MMIO space.
		 */
		gpa = vmcs_gpa();
		if (vm_mem_allocated(vmx->vm, gpa) ||
		    apic_access_fault(vmx, vcpu, gpa)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = gpa;
			vmexit->u.paging.fault_type = ept_fault_type(qual);
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
		} else if (ept_emulation_fault(qual)) {
			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
			vmexit->u.inst_emul.gpa = gpa;
			vmexit->u.inst_emul.gla = vmcs_gla();
			vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
			vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
			vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
		}
		/*
		 * If the 'virtual NMIs' execution control is 1 and this
		 * VM-exit is due to an EPT fault during the execution of
		 * IRET, then we must restore the state of "virtual-NMI
		 * blocking" before resuming.
		 *
		 * See the description of "NMI unblocking due to IRET" in
		 * "Exit Qualification for EPT Violations" in the Intel SDM.
		 */
		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
		    (qual & EXIT_QUAL_NMIUDTI) != 0)
			vmx_restore_nmi_blocking(vmx, vcpu);
		break;
	case EXIT_REASON_VIRTUALIZED_EOI:
		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
		vmexit->inst_length = 0;	/* trap-like */
		break;
	case EXIT_REASON_APIC_ACCESS:
		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
		break;
	case EXIT_REASON_APIC_WRITE:
		/*
		 * APIC-write VM exits are trap-like so the %rip is already
		 * pointing to the next instruction.
		 */
		vmexit->inst_length = 0;
		vlapic = vm_lapic(vmx->vm, vcpu);
		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
		break;
	case EXIT_REASON_XSETBV:
		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
		break;
	default:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}
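
	/*
	 * Most of the exits above are fault-like: the saved %rip still
	 * points at the instruction that caused the exit, so the code
	 * below advances it by 'inst_length' once the exit is handled.
	 * Trap-like exits (virtualized EOI, APIC write) set inst_length
	 * to 0 because the %rip already points past the instruction.
	 */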

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.status = VM_SUCCESS;
			vmexit->u.vmx.inst_type = 0;
			vmexit->u.vmx.inst_error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static __inline int
vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{

	vmexit->rip = vmcs_guest_rip();
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmx_astpending_trace(vmx, vcpu, vmexit->rip);
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);

	return (HANDLED);
}

static __inline int
vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{

	vmexit->rip = vmcs_guest_rip();
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);

	return (UNHANDLED);
}

static __inline int
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
{

	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
	    vmxctx->inst_fail_status));

	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.status = vmxctx->inst_fail_status;
	vmexit->u.vmx.inst_error = vmcs_instruction_error();
	vmexit->u.vmx.exit_reason = ~0;
	vmexit->u.vmx.exit_qualification = ~0;

	switch (rc) {
	case VMX_VMRESUME_ERROR:
	case VMX_VMLAUNCH_ERROR:
	case VMX_INVEPT_ERROR:
		vmexit->u.vmx.inst_type = rc;
		break;
	default:
		panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
	}

	return (UNHANDLED);
}

/*
 * If the NMI-exiting VM execution control is set to '1' then an NMI in
 * non-root operation causes a VM-exit. NMI blocking is in effect so it is
 * sufficient to simply vector to the NMI handler via a software interrupt.
 * However, this must be done before maskable interrupts are enabled,
 * otherwise the "iret" issued by an interrupt handler will incorrectly
 * clear NMI blocking.
2100 */ 2101 static __inline void 2102 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2103 { 2104 uint32_t intr_info; 2105 2106 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2107 2108 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2109 return; 2110 2111 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2112 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2113 ("VM exit interruption info invalid: %#x", intr_info)); 2114 2115 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2116 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2117 "to NMI has invalid vector: %#x", intr_info)); 2118 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2119 __asm __volatile("int $2"); 2120 } 2121 } 2122 2123 static int 2124 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, 2125 void *rendezvous_cookie) 2126 { 2127 int rc, handled, launched; 2128 struct vmx *vmx; 2129 struct vm *vm; 2130 struct vmxctx *vmxctx; 2131 struct vmcs *vmcs; 2132 struct vm_exit *vmexit; 2133 struct vlapic *vlapic; 2134 uint64_t rip; 2135 uint32_t exit_reason; 2136 2137 vmx = arg; 2138 vm = vmx->vm; 2139 vmcs = &vmx->vmcs[vcpu]; 2140 vmxctx = &vmx->ctx[vcpu]; 2141 vlapic = vm_lapic(vm, vcpu); 2142 vmexit = vm_exitinfo(vm, vcpu); 2143 launched = 0; 2144 2145 KASSERT(vmxctx->pmap == pmap, 2146 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2147 2148 VMPTRLD(vmcs); 2149 2150 /* 2151 * XXX 2152 * We do this every time because we may setup the virtual machine 2153 * from a different process than the one that actually runs it. 2154 * 2155 * If the life of a virtual machine was spent entirely in the context 2156 * of a single process we could do this once in vmx_vminit(). 2157 */ 2158 vmcs_write(VMCS_HOST_CR3, rcr3()); 2159 2160 vmcs_write(VMCS_GUEST_RIP, startrip); 2161 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2162 do { 2163 /* 2164 * Interrupts are disabled from this point on until the 2165 * guest starts executing. This is done for the following 2166 * reasons: 2167 * 2168 * If an AST is asserted on this thread after the check below, 2169 * then the IPI_AST notification will not be lost, because it 2170 * will cause a VM exit due to external interrupt as soon as 2171 * the guest state is loaded. 2172 * 2173 * A posted interrupt after 'vmx_inject_interrupts()' will 2174 * not be "lost" because it will be held pending in the host 2175 * APIC because interrupts are disabled. The pending interrupt 2176 * will be recognized as soon as the guest state is loaded. 2177 * 2178 * The same reasoning applies to the IPI generated by 2179 * pmap_invalidate_ept(). 
2180 */ 2181 disable_intr(); 2182 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { 2183 enable_intr(); 2184 handled = vmx_exit_astpending(vmx, vcpu, vmexit); 2185 break; 2186 } 2187 2188 if (vcpu_rendezvous_pending(rendezvous_cookie)) { 2189 enable_intr(); 2190 handled = vmx_exit_rendezvous(vmx, vcpu, vmexit); 2191 break; 2192 } 2193 2194 vmx_inject_interrupts(vmx, vcpu, vlapic); 2195 vmx_run_trace(vmx, vcpu); 2196 rc = vmx_enter_guest(vmxctx, vmx, launched); 2197 2198 /* Collect some information for VM exit processing */ 2199 vmexit->rip = rip = vmcs_guest_rip(); 2200 vmexit->inst_length = vmexit_instruction_length(); 2201 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2202 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2203 2204 if (rc == VMX_GUEST_VMEXIT) { 2205 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2206 enable_intr(); 2207 handled = vmx_exit_process(vmx, vcpu, vmexit); 2208 } else { 2209 enable_intr(); 2210 handled = vmx_exit_inst_error(vmxctx, rc, vmexit); 2211 } 2212 launched = 1; 2213 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2214 } while (handled); 2215 2216 /* 2217 * If a VM exit has been handled then the exitcode must be BOGUS 2218 * If a VM exit is not handled then the exitcode must not be BOGUS 2219 */ 2220 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2221 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2222 panic("Mismatch between handled (%d) and exitcode (%d)", 2223 handled, vmexit->exitcode); 2224 } 2225 2226 if (!handled) 2227 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 2228 2229 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 2230 vmexit->exitcode); 2231 2232 VMCLEAR(vmcs); 2233 return (0); 2234 } 2235 2236 static void 2237 vmx_vmcleanup(void *arg) 2238 { 2239 int i, error; 2240 struct vmx *vmx = arg; 2241 2242 if (apic_access_virtualization(vmx, 0)) 2243 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2244 2245 for (i = 0; i < VM_MAXCPU; i++) 2246 vpid_free(vmx->state[i].vpid); 2247 2248 /* 2249 * XXXSMP we also need to clear the VMCS active on the other vcpus. 
2250 */ 2251 error = vmclear(&vmx->vmcs[0]); 2252 if (error != 0) 2253 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 2254 2255 free(vmx, M_VMX); 2256 2257 return; 2258 } 2259 2260 static register_t * 2261 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 2262 { 2263 2264 switch (reg) { 2265 case VM_REG_GUEST_RAX: 2266 return (&vmxctx->guest_rax); 2267 case VM_REG_GUEST_RBX: 2268 return (&vmxctx->guest_rbx); 2269 case VM_REG_GUEST_RCX: 2270 return (&vmxctx->guest_rcx); 2271 case VM_REG_GUEST_RDX: 2272 return (&vmxctx->guest_rdx); 2273 case VM_REG_GUEST_RSI: 2274 return (&vmxctx->guest_rsi); 2275 case VM_REG_GUEST_RDI: 2276 return (&vmxctx->guest_rdi); 2277 case VM_REG_GUEST_RBP: 2278 return (&vmxctx->guest_rbp); 2279 case VM_REG_GUEST_R8: 2280 return (&vmxctx->guest_r8); 2281 case VM_REG_GUEST_R9: 2282 return (&vmxctx->guest_r9); 2283 case VM_REG_GUEST_R10: 2284 return (&vmxctx->guest_r10); 2285 case VM_REG_GUEST_R11: 2286 return (&vmxctx->guest_r11); 2287 case VM_REG_GUEST_R12: 2288 return (&vmxctx->guest_r12); 2289 case VM_REG_GUEST_R13: 2290 return (&vmxctx->guest_r13); 2291 case VM_REG_GUEST_R14: 2292 return (&vmxctx->guest_r14); 2293 case VM_REG_GUEST_R15: 2294 return (&vmxctx->guest_r15); 2295 default: 2296 break; 2297 } 2298 return (NULL); 2299 } 2300 2301 static int 2302 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 2303 { 2304 register_t *regp; 2305 2306 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 2307 *retval = *regp; 2308 return (0); 2309 } else 2310 return (EINVAL); 2311 } 2312 2313 static int 2314 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 2315 { 2316 register_t *regp; 2317 2318 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 2319 *regp = val; 2320 return (0); 2321 } else 2322 return (EINVAL); 2323 } 2324 2325 static int 2326 vmx_shadow_reg(int reg) 2327 { 2328 int shreg; 2329 2330 shreg = -1; 2331 2332 switch (reg) { 2333 case VM_REG_GUEST_CR0: 2334 shreg = VMCS_CR0_SHADOW; 2335 break; 2336 case VM_REG_GUEST_CR4: 2337 shreg = VMCS_CR4_SHADOW; 2338 break; 2339 default: 2340 break; 2341 } 2342 2343 return (shreg); 2344 } 2345 2346 static int 2347 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 2348 { 2349 int running, hostcpu; 2350 struct vmx *vmx = arg; 2351 2352 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2353 if (running && hostcpu != curcpu) 2354 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 2355 2356 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 2357 return (0); 2358 2359 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 2360 } 2361 2362 static int 2363 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 2364 { 2365 int error, hostcpu, running, shadow; 2366 uint64_t ctls; 2367 struct vmx *vmx = arg; 2368 2369 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 2370 if (running && hostcpu != curcpu) 2371 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 2372 2373 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 2374 return (0); 2375 2376 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 2377 2378 if (error == 0) { 2379 /* 2380 * If the "load EFER" VM-entry control is 1 then the 2381 * value of EFER.LMA must be identical to "IA-32e mode guest" 2382 * bit in the VM-entry control. 
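		 *
		 * For example, a guest running in long mode has EFER.LMA = 1,
		 * so the VM_ENTRY_GUEST_LMA bit must be set in the entry
		 * controls as well; the adjustment below keeps the two in
		 * sync whenever the guest's EFER is written through this
		 * interface.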
2383 */ 2384 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 2385 (reg == VM_REG_GUEST_EFER)) { 2386 vmcs_getreg(&vmx->vmcs[vcpu], running, 2387 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 2388 if (val & EFER_LMA) 2389 ctls |= VM_ENTRY_GUEST_LMA; 2390 else 2391 ctls &= ~VM_ENTRY_GUEST_LMA; 2392 vmcs_setreg(&vmx->vmcs[vcpu], running, 2393 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 2394 } 2395 2396 shadow = vmx_shadow_reg(reg); 2397 if (shadow > 0) { 2398 /* 2399 * Store the unmodified value in the shadow 2400 */ 2401 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 2402 VMCS_IDENT(shadow), val); 2403 } 2404 } 2405 2406 return (error); 2407 } 2408 2409 static int 2410 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2411 { 2412 struct vmx *vmx = arg; 2413 2414 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 2415 } 2416 2417 static int 2418 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2419 { 2420 struct vmx *vmx = arg; 2421 2422 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 2423 } 2424 2425 static int 2426 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 2427 int code_valid) 2428 { 2429 struct vmx *vmx = arg; 2430 struct vmxevent *user_event = &vmx->state[vcpu].user_event; 2431 2432 static uint32_t type_map[VM_EVENT_MAX] = { 2433 0x1, /* VM_EVENT_NONE */ 2434 0x0, /* VM_HW_INTR */ 2435 0x2, /* VM_NMI */ 2436 0x3, /* VM_HW_EXCEPTION */ 2437 0x4, /* VM_SW_INTR */ 2438 0x5, /* VM_PRIV_SW_EXCEPTION */ 2439 0x6, /* VM_SW_EXCEPTION */ 2440 }; 2441 2442 /* 2443 * If there is already an exception pending to be delivered to the 2444 * vcpu then just return. 2445 */ 2446 if (user_event->intr_info & VMCS_INTR_VALID) 2447 return (EAGAIN); 2448 2449 user_event->intr_info = vector | (type_map[type] << 8) | VMCS_INTR_VALID; 2450 if (code_valid) { 2451 user_event->intr_info |= VMCS_INTR_DEL_ERRCODE; 2452 user_event->error_code = code; 2453 } 2454 return (0); 2455 } 2456 2457 static int 2458 vmx_getcap(void *arg, int vcpu, int type, int *retval) 2459 { 2460 struct vmx *vmx = arg; 2461 int vcap; 2462 int ret; 2463 2464 ret = ENOENT; 2465 2466 vcap = vmx->cap[vcpu].set; 2467 2468 switch (type) { 2469 case VM_CAP_HALT_EXIT: 2470 if (cap_halt_exit) 2471 ret = 0; 2472 break; 2473 case VM_CAP_PAUSE_EXIT: 2474 if (cap_pause_exit) 2475 ret = 0; 2476 break; 2477 case VM_CAP_MTRAP_EXIT: 2478 if (cap_monitor_trap) 2479 ret = 0; 2480 break; 2481 case VM_CAP_UNRESTRICTED_GUEST: 2482 if (cap_unrestricted_guest) 2483 ret = 0; 2484 break; 2485 case VM_CAP_ENABLE_INVPCID: 2486 if (cap_invpcid) 2487 ret = 0; 2488 break; 2489 default: 2490 break; 2491 } 2492 2493 if (ret == 0) 2494 *retval = (vcap & (1 << type)) ? 
1 : 0; 2495 2496 return (ret); 2497 } 2498 2499 static int 2500 vmx_setcap(void *arg, int vcpu, int type, int val) 2501 { 2502 struct vmx *vmx = arg; 2503 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 2504 uint32_t baseval; 2505 uint32_t *pptr; 2506 int error; 2507 int flag; 2508 int reg; 2509 int retval; 2510 2511 retval = ENOENT; 2512 pptr = NULL; 2513 2514 switch (type) { 2515 case VM_CAP_HALT_EXIT: 2516 if (cap_halt_exit) { 2517 retval = 0; 2518 pptr = &vmx->cap[vcpu].proc_ctls; 2519 baseval = *pptr; 2520 flag = PROCBASED_HLT_EXITING; 2521 reg = VMCS_PRI_PROC_BASED_CTLS; 2522 } 2523 break; 2524 case VM_CAP_MTRAP_EXIT: 2525 if (cap_monitor_trap) { 2526 retval = 0; 2527 pptr = &vmx->cap[vcpu].proc_ctls; 2528 baseval = *pptr; 2529 flag = PROCBASED_MTF; 2530 reg = VMCS_PRI_PROC_BASED_CTLS; 2531 } 2532 break; 2533 case VM_CAP_PAUSE_EXIT: 2534 if (cap_pause_exit) { 2535 retval = 0; 2536 pptr = &vmx->cap[vcpu].proc_ctls; 2537 baseval = *pptr; 2538 flag = PROCBASED_PAUSE_EXITING; 2539 reg = VMCS_PRI_PROC_BASED_CTLS; 2540 } 2541 break; 2542 case VM_CAP_UNRESTRICTED_GUEST: 2543 if (cap_unrestricted_guest) { 2544 retval = 0; 2545 pptr = &vmx->cap[vcpu].proc_ctls2; 2546 baseval = *pptr; 2547 flag = PROCBASED2_UNRESTRICTED_GUEST; 2548 reg = VMCS_SEC_PROC_BASED_CTLS; 2549 } 2550 break; 2551 case VM_CAP_ENABLE_INVPCID: 2552 if (cap_invpcid) { 2553 retval = 0; 2554 pptr = &vmx->cap[vcpu].proc_ctls2; 2555 baseval = *pptr; 2556 flag = PROCBASED2_ENABLE_INVPCID; 2557 reg = VMCS_SEC_PROC_BASED_CTLS; 2558 } 2559 break; 2560 default: 2561 break; 2562 } 2563 2564 if (retval == 0) { 2565 if (val) { 2566 baseval |= flag; 2567 } else { 2568 baseval &= ~flag; 2569 } 2570 VMPTRLD(vmcs); 2571 error = vmwrite(reg, baseval); 2572 VMCLEAR(vmcs); 2573 2574 if (error) { 2575 retval = error; 2576 } else { 2577 /* 2578 * Update optional stored flags, and record 2579 * setting 2580 */ 2581 if (pptr != NULL) { 2582 *pptr = baseval; 2583 } 2584 2585 if (val) { 2586 vmx->cap[vcpu].set |= (1 << type); 2587 } else { 2588 vmx->cap[vcpu].set &= ~(1 << type); 2589 } 2590 } 2591 } 2592 2593 return (retval); 2594 } 2595 2596 struct vlapic_vtx { 2597 struct vlapic vlapic; 2598 struct pir_desc *pir_desc; 2599 struct vmx *vmx; 2600 }; 2601 2602 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 2603 do { \ 2604 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 2605 level ? "level" : "edge", vector); \ 2606 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 2607 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 2608 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 2609 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 2610 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 2611 } while (0) 2612 2613 /* 2614 * vlapic->ops handlers that utilize the APICv hardware assist described in 2615 * Chapter 29 of the Intel SDM. 2616 */ 2617 static int 2618 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 2619 { 2620 struct vlapic_vtx *vlapic_vtx; 2621 struct pir_desc *pir_desc; 2622 uint64_t mask; 2623 int idx, notify; 2624 2625 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2626 pir_desc = vlapic_vtx->pir_desc; 2627 2628 /* 2629 * Keep track of interrupt requests in the PIR descriptor. This is 2630 * because the virtual APIC page pointed to by the VMCS cannot be 2631 * modified if the vcpu is running. 
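	 *
	 * Each of the four 64-bit pir[] words covers 64 vectors, so the
	 * word index is vector / 64 and the bit within it is vector % 64.
	 * For example, vector 0x65 (101) sets bit 37 of pir[1].  The
	 * 'pending' word is then set and the return value tells the caller
	 * whether a notification IPI still needs to be sent.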
2632 */ 2633 idx = vector / 64; 2634 mask = 1UL << (vector % 64); 2635 atomic_set_long(&pir_desc->pir[idx], mask); 2636 notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); 2637 2638 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 2639 level, "vmx_set_intr_ready"); 2640 return (notify); 2641 } 2642 2643 static int 2644 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 2645 { 2646 struct vlapic_vtx *vlapic_vtx; 2647 struct pir_desc *pir_desc; 2648 struct LAPIC *lapic; 2649 uint64_t pending, pirval; 2650 uint32_t ppr, vpr; 2651 int i; 2652 2653 /* 2654 * This function is only expected to be called from the 'HLT' exit 2655 * handler which does not care about the vector that is pending. 2656 */ 2657 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 2658 2659 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2660 pir_desc = vlapic_vtx->pir_desc; 2661 2662 pending = atomic_load_acq_long(&pir_desc->pending); 2663 if (!pending) 2664 return (0); /* common case */ 2665 2666 /* 2667 * If there is an interrupt pending then it will be recognized only 2668 * if its priority is greater than the processor priority. 2669 * 2670 * Special case: if the processor priority is zero then any pending 2671 * interrupt will be recognized. 2672 */ 2673 lapic = vlapic->apic_page; 2674 ppr = lapic->ppr & 0xf0; 2675 if (ppr == 0) 2676 return (1); 2677 2678 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 2679 lapic->ppr); 2680 2681 for (i = 3; i >= 0; i--) { 2682 pirval = pir_desc->pir[i]; 2683 if (pirval != 0) { 2684 vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; 2685 return (vpr > ppr); 2686 } 2687 } 2688 return (0); 2689 } 2690 2691 static void 2692 vmx_intr_accepted(struct vlapic *vlapic, int vector) 2693 { 2694 2695 panic("vmx_intr_accepted: not expected to be called"); 2696 } 2697 2698 static void 2699 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 2700 { 2701 struct vlapic_vtx *vlapic_vtx; 2702 struct vmx *vmx; 2703 struct vmcs *vmcs; 2704 uint64_t mask, val; 2705 2706 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 2707 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 2708 ("vmx_set_tmr: vcpu cannot be running")); 2709 2710 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2711 vmx = vlapic_vtx->vmx; 2712 vmcs = &vmx->vmcs[vlapic->vcpuid]; 2713 mask = 1UL << (vector % 64); 2714 2715 VMPTRLD(vmcs); 2716 val = vmcs_read(VMCS_EOI_EXIT(vector)); 2717 if (level) 2718 val |= mask; 2719 else 2720 val &= ~mask; 2721 vmcs_write(VMCS_EOI_EXIT(vector), val); 2722 VMCLEAR(vmcs); 2723 } 2724 2725 static void 2726 vmx_enable_x2apic_mode(struct vlapic *vlapic) 2727 { 2728 struct vmx *vmx; 2729 struct vmcs *vmcs; 2730 uint32_t proc_ctls2; 2731 int vcpuid, error; 2732 2733 vcpuid = vlapic->vcpuid; 2734 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 2735 vmcs = &vmx->vmcs[vcpuid]; 2736 2737 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2738 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 2739 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 2740 2741 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 2742 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 2743 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 2744 2745 VMPTRLD(vmcs); 2746 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 2747 VMCLEAR(vmcs); 2748 2749 if (vlapic->vcpuid == 0) { 2750 /* 2751 * The nested page table mappings are shared by all vcpus 2752 * so unmap the APIC access page just once. 
2753 */ 2754 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2755 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 2756 __func__, error)); 2757 2758 /* 2759 * The MSR bitmap is shared by all vcpus so modify it only 2760 * once in the context of vcpu 0. 2761 */ 2762 error = vmx_allow_x2apic_msrs(vmx); 2763 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 2764 __func__, error)); 2765 } 2766 } 2767 2768 static void 2769 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 2770 { 2771 2772 ipi_cpu(hostcpu, pirvec); 2773 } 2774 2775 /* 2776 * Transfer the pending interrupts in the PIR descriptor to the IRR 2777 * in the virtual APIC page. 2778 */ 2779 static void 2780 vmx_inject_pir(struct vlapic *vlapic) 2781 { 2782 struct vlapic_vtx *vlapic_vtx; 2783 struct pir_desc *pir_desc; 2784 struct LAPIC *lapic; 2785 uint64_t val, pirval; 2786 int rvi, pirbase; 2787 uint16_t intr_status_old, intr_status_new; 2788 2789 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2790 pir_desc = vlapic_vtx->pir_desc; 2791 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 2792 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 2793 "no posted interrupt pending"); 2794 return; 2795 } 2796 2797 pirval = 0; 2798 lapic = vlapic->apic_page; 2799 2800 val = atomic_readandclear_long(&pir_desc->pir[0]); 2801 if (val != 0) { 2802 lapic->irr0 |= val; 2803 lapic->irr1 |= val >> 32; 2804 pirbase = 0; 2805 pirval = val; 2806 } 2807 2808 val = atomic_readandclear_long(&pir_desc->pir[1]); 2809 if (val != 0) { 2810 lapic->irr2 |= val; 2811 lapic->irr3 |= val >> 32; 2812 pirbase = 64; 2813 pirval = val; 2814 } 2815 2816 val = atomic_readandclear_long(&pir_desc->pir[2]); 2817 if (val != 0) { 2818 lapic->irr4 |= val; 2819 lapic->irr5 |= val >> 32; 2820 pirbase = 128; 2821 pirval = val; 2822 } 2823 2824 val = atomic_readandclear_long(&pir_desc->pir[3]); 2825 if (val != 0) { 2826 lapic->irr6 |= val; 2827 lapic->irr7 |= val >> 32; 2828 pirbase = 192; 2829 pirval = val; 2830 } 2831 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 2832 2833 /* 2834 * Update RVI so the processor can evaluate pending virtual 2835 * interrupts on VM-entry. 
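	 *
	 * RVI occupies the low byte of the 16-bit guest interrupt status
	 * (the high byte is SVI), so the code below only ever raises it.
	 * For example, if the highest pending vector came from bit 37 of
	 * pir[1], then rvi = 64 + flsl(pirval) - 1 = 101.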
2836 */ 2837 if (pirval != 0) { 2838 rvi = pirbase + flsl(pirval) - 1; 2839 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 2840 intr_status_new = (intr_status_old & 0xFF00) | rvi; 2841 if (intr_status_new > intr_status_old) { 2842 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 2843 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 2844 "guest_intr_status changed from 0x%04x to 0x%04x", 2845 intr_status_old, intr_status_new); 2846 } 2847 } 2848 } 2849 2850 static struct vlapic * 2851 vmx_vlapic_init(void *arg, int vcpuid) 2852 { 2853 struct vmx *vmx; 2854 struct vlapic *vlapic; 2855 struct vlapic_vtx *vlapic_vtx; 2856 2857 vmx = arg; 2858 2859 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 2860 vlapic->vm = vmx->vm; 2861 vlapic->vcpuid = vcpuid; 2862 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 2863 2864 vlapic_vtx = (struct vlapic_vtx *)vlapic; 2865 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 2866 vlapic_vtx->vmx = vmx; 2867 2868 if (virtual_interrupt_delivery) { 2869 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 2870 vlapic->ops.pending_intr = vmx_pending_intr; 2871 vlapic->ops.intr_accepted = vmx_intr_accepted; 2872 vlapic->ops.set_tmr = vmx_set_tmr; 2873 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 2874 } 2875 2876 if (posted_interrupts) 2877 vlapic->ops.post_intr = vmx_post_intr; 2878 2879 vlapic_init(vlapic); 2880 2881 return (vlapic); 2882 } 2883 2884 static void 2885 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 2886 { 2887 2888 vlapic_cleanup(vlapic); 2889 free(vlapic, M_VLAPIC); 2890 } 2891 2892 struct vmm_ops vmm_ops_intel = { 2893 vmx_init, 2894 vmx_cleanup, 2895 vmx_restore, 2896 vmx_vminit, 2897 vmx_run, 2898 vmx_vmcleanup, 2899 vmx_getreg, 2900 vmx_setreg, 2901 vmx_getdesc, 2902 vmx_setdesc, 2903 vmx_inject, 2904 vmx_getcap, 2905 vmx_setcap, 2906 ept_vmspace_alloc, 2907 ept_vmspace_free, 2908 vmx_vlapic_init, 2909 vmx_vlapic_cleanup, 2910 }; 2911