/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	PINBASED_NMI_EXITING		|	\
	PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	PROCBASED_IO_EXITING		|	\
	PROCBASED_MSR_BITMAPS		|	\
	PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING	|	\
	PROCBASED_CR3_STORE_EXITING	|	\
	PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT		\
	(VM_EXIT_HOST_LMA		|	\
	VM_EXIT_SAVE_EFER		|	\
	VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |	\
	VM_EXIT_SAVE_PAT		|	\
	VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |	\
	VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	VM_ENTRY_INTO_SMM		|	\
	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
    &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
    &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
    &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
    &cr4_zeros_mask, 0, NULL);

static int vmx_no_patmsr;

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
    &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
static int cap_invpcid;

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
    &vpid_alloc_failed, 0, NULL);

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
"vmread"; 228 case EXIT_REASON_VMRESUME: 229 return "vmresume"; 230 case EXIT_REASON_VMWRITE: 231 return "vmwrite"; 232 case EXIT_REASON_VMXOFF: 233 return "vmxoff"; 234 case EXIT_REASON_VMXON: 235 return "vmxon"; 236 case EXIT_REASON_CR_ACCESS: 237 return "craccess"; 238 case EXIT_REASON_DR_ACCESS: 239 return "draccess"; 240 case EXIT_REASON_INOUT: 241 return "inout"; 242 case EXIT_REASON_RDMSR: 243 return "rdmsr"; 244 case EXIT_REASON_WRMSR: 245 return "wrmsr"; 246 case EXIT_REASON_INVAL_VMCS: 247 return "invalvmcs"; 248 case EXIT_REASON_INVAL_MSR: 249 return "invalmsr"; 250 case EXIT_REASON_MWAIT: 251 return "mwait"; 252 case EXIT_REASON_MTF: 253 return "mtf"; 254 case EXIT_REASON_MONITOR: 255 return "monitor"; 256 case EXIT_REASON_PAUSE: 257 return "pause"; 258 case EXIT_REASON_MCE: 259 return "mce"; 260 case EXIT_REASON_TPR: 261 return "tpr"; 262 case EXIT_REASON_APIC: 263 return "apic"; 264 case EXIT_REASON_GDTR_IDTR: 265 return "gdtridtr"; 266 case EXIT_REASON_LDTR_TR: 267 return "ldtrtr"; 268 case EXIT_REASON_EPT_FAULT: 269 return "eptfault"; 270 case EXIT_REASON_EPT_MISCONFIG: 271 return "eptmisconfig"; 272 case EXIT_REASON_INVEPT: 273 return "invept"; 274 case EXIT_REASON_RDTSCP: 275 return "rdtscp"; 276 case EXIT_REASON_VMX_PREEMPT: 277 return "vmxpreempt"; 278 case EXIT_REASON_INVVPID: 279 return "invvpid"; 280 case EXIT_REASON_WBINVD: 281 return "wbinvd"; 282 case EXIT_REASON_XSETBV: 283 return "xsetbv"; 284 default: 285 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 286 return (reasonbuf); 287 } 288 } 289 290 #ifdef SETJMP_TRACE 291 static const char * 292 vmx_setjmp_rc2str(int rc) 293 { 294 switch (rc) { 295 case VMX_RETURN_DIRECT: 296 return "direct"; 297 case VMX_RETURN_LONGJMP: 298 return "longjmp"; 299 case VMX_RETURN_VMRESUME: 300 return "vmresume"; 301 case VMX_RETURN_VMLAUNCH: 302 return "vmlaunch"; 303 case VMX_RETURN_AST: 304 return "ast"; 305 default: 306 return "unknown"; 307 } 308 } 309 310 #define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 311 VCPU_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 312 (vmxctx)->regname) 313 314 static void 315 vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 316 { 317 uint64_t host_rip, host_rsp; 318 319 if (vmxctx != &vmx->ctx[vcpu]) 320 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 321 vmxctx, &vmx->ctx[vcpu]); 322 323 VCPU_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 324 VCPU_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 325 vmx_setjmp_rc2str(rc), rc); 326 327 host_rsp = host_rip = ~0; 328 vmread(VMCS_HOST_RIP, &host_rip); 329 vmread(VMCS_HOST_RSP, &host_rsp); 330 VCPU_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp %#lx", 331 host_rip, host_rsp); 332 333 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 341 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 346 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 347 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 348 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 349 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 350 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not guaranteed to be unique across VMs but
		 * this does not affect correctness because the combined
		 * mappings are also tagged with the EP4TA which is unique
		 * for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures. This prevents potential retention of
		 * cached information in the TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_CTLS_ONE_SETTING,
	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		    "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED_CTLS2_ONE_SETTING,
	    PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		    "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	    MSR_VMX_TRUE_PINBASED_CTLS,
	    PINBASED_CTLS_ONE_SETTING,
	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	    VM_EXIT_CTLS_ONE_SETTING,
	    VM_EXIT_CTLS_ZERO_SETTING,
	    &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
		    MSR_VMX_TRUE_EXIT_CTLS,
		    VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
		    VM_EXIT_CTLS_ZERO_SETTING,
		    &exit_ctls);
		if (error) {
" 608 "exit controls\n"); 609 return (error); 610 } else { 611 if (bootverbose) 612 printf("vmm: PAT MSR access not supported\n"); 613 guest_msr_valid(MSR_PAT); 614 vmx_no_patmsr = 1; 615 } 616 } 617 618 /* Check support for VM-entry controls */ 619 if (!vmx_no_patmsr) { 620 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 621 MSR_VMX_TRUE_ENTRY_CTLS, 622 VM_ENTRY_CTLS_ONE_SETTING, 623 VM_ENTRY_CTLS_ZERO_SETTING, 624 &entry_ctls); 625 } else { 626 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 627 MSR_VMX_TRUE_ENTRY_CTLS, 628 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 629 VM_ENTRY_CTLS_ZERO_SETTING, 630 &entry_ctls); 631 } 632 633 if (error) { 634 printf("vmx_init: processor does not support desired " 635 "entry controls\n"); 636 return (error); 637 } 638 639 /* 640 * Check support for optional features by testing them 641 * as individual bits 642 */ 643 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 644 MSR_VMX_TRUE_PROCBASED_CTLS, 645 PROCBASED_HLT_EXITING, 0, 646 &tmp) == 0); 647 648 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 649 MSR_VMX_PROCBASED_CTLS, 650 PROCBASED_MTF, 0, 651 &tmp) == 0); 652 653 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 654 MSR_VMX_TRUE_PROCBASED_CTLS, 655 PROCBASED_PAUSE_EXITING, 0, 656 &tmp) == 0); 657 658 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 659 MSR_VMX_PROCBASED_CTLS2, 660 PROCBASED2_UNRESTRICTED_GUEST, 0, 661 &tmp) == 0); 662 663 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 664 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 665 &tmp) == 0); 666 667 668 /* Initialize EPT */ 669 error = ept_init(); 670 if (error) { 671 printf("vmx_init: ept initialization failed (%d)\n", error); 672 return (error); 673 } 674 675 /* 676 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 677 */ 678 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 679 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 680 cr0_ones_mask = fixed0 & fixed1; 681 cr0_zeros_mask = ~fixed0 & ~fixed1; 682 683 /* 684 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 685 * if unrestricted guest execution is allowed. 686 */ 687 if (cap_unrestricted_guest) 688 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 689 690 /* 691 * Do not allow the guest to set CR0_NW or CR0_CD. 
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	vpid_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
	uint16_t vpid[VM_MAXCPU];
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		    PAGE_SIZE);
	}
	vmx->vm = vm;

	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vmx->eptp);

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
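	 * Since EFER is loaded and saved by the VM-entry and VM-exit
	 * controls, the guest can likewise be given direct access to it.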
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	vpid_alloc(vpid, VM_MAXCPU);

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			    error, i);
		}

		error = vmcs_set_defaults(&vmx->vmcs[i],
		    (u_long)vmx_longjmp,
		    (u_long)&vmx->ctx[i],
		    vmx->eptp,
		    pinbased_ctls,
		    procbased_ctls,
		    procbased_ctls2,
		    exit_ctls, entry_ctls,
		    vtophys(vmx->msr_bitmap),
		    vpid[i]);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;
		vmx->cap[i].proc_ctls2 = procbased_ctls2;

		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid[i];

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
		    vtophys(vmx->guest_msrs[i]),
		    guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);

		vmx->ctx[i].pmap = pmap;
		vmx->ctx[i].eptp = vmx->eptp;
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
	    (uint32_t*)(&vmxctx->guest_rax),
	    (uint32_t*)(&vmxctx->guest_rbx),
	    (uint32_t*)(&vmxctx->guest_rcx),
	    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
    int handled)
{
#ifdef KTR
	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
	    handled ? "handled" : "unhandled",
	    exit_reason_to_str(exit_reason), rip);
#endif
}
"handled" : "unhandled", 900 exit_reason_to_str(exit_reason), rip); 901 #endif 902 } 903 904 static __inline void 905 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 906 { 907 #ifdef KTR 908 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 909 #endif 910 } 911 912 static int 913 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 914 { 915 int error, lastcpu; 916 struct vmxstate *vmxstate; 917 struct invvpid_desc invvpid_desc = { 0 }; 918 919 vmxstate = &vmx->state[vcpu]; 920 lastcpu = vmxstate->lastcpu; 921 vmxstate->lastcpu = curcpu; 922 923 if (lastcpu == curcpu) { 924 error = 0; 925 goto done; 926 } 927 928 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 929 930 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 931 if (error != 0) 932 goto done; 933 934 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 935 if (error != 0) 936 goto done; 937 938 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 939 if (error != 0) 940 goto done; 941 942 /* 943 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 944 * 945 * We do this because this vcpu was executing on a different host 946 * cpu when it last ran. We do not track whether it invalidated 947 * mappings associated with its 'vpid' during that run. So we must 948 * assume that the mappings associated with 'vpid' on 'curcpu' are 949 * stale and invalidate them. 950 * 951 * Note that we incur this penalty only when the scheduler chooses to 952 * move the thread associated with this vcpu between host cpus. 953 * 954 * Note also that this will invalidate mappings tagged with 'vpid' 955 * for "all" EP4TAs. 956 */ 957 if (vmxstate->vpid != 0) { 958 invvpid_desc.vpid = vmxstate->vpid; 959 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 960 } 961 done: 962 return (error); 963 } 964 965 static void 966 vm_exit_update_rip(struct vm_exit *vmexit) 967 { 968 int error; 969 970 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 971 if (error) 972 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 973 } 974 975 /* 976 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (!vm_nmi_pending(vmx->vm, vcpu))
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
		    error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		    error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr, vmcs_shadow_cr;
	uint64_t crval, regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			    "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
		vmcs_shadow_cr = VMCS_CR0_SHADOW;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
		vmcs_shadow_cr = VMCS_CR4_SHADOW;
	}

	error = vmwrite(vmcs_shadow_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d shadow",
		    error, cr);
	}

	crval = regval | ones_mask;
	crval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, crval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		    error, cr);
	}

	if (cr == 0 && regval & CR0_PG) {
		uint64_t efer, entry_ctls;

		/*
		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
		 * the "IA-32e mode guest" bit in VM-entry control must be
		 * equal.
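		 *
		 * (vmx_setreg() applies the same EFER.LMA fixup when the
		 * guest EFER is modified through the register interface.)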
		 */
		error = vmread(VMCS_GUEST_IA32_EFER, &efer);
		if (error) {
			panic("vmx_emulate_cr_access: error %d efer read",
			    error);
		}
		if (efer & EFER_LME) {
			efer |= EFER_LMA;
			error = vmwrite(VMCS_GUEST_IA32_EFER, efer);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				    " efer write", error);
			}
			error = vmread(VMCS_ENTRY_CTLS, &entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				    " entry ctls read", error);
			}
			entry_ctls |= VM_ENTRY_GUEST_LMA;
			error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				    " entry ctls write", error);
			}
		}
	}

	return (HANDLED);
}

static int
ept_fault_type(uint64_t ept_qual)
{
	int fault_type;

	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
		fault_type = VM_PROT_WRITE;
	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
		fault_type = VM_PROT_EXECUTE;
	else
		fault_type = VM_PROT_READ;

	return (fault_type);
}

static int
ept_protection(uint64_t ept_qual)
{
	int prot = 0;

	if (ept_qual & EPT_VIOLATION_GPA_READABLE)
		prot |= VM_PROT_READ;
	if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
		prot |= VM_PROT_WRITE;
	if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
		prot |= VM_PROT_EXECUTE;

	return (prot);
}

static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
	int read, write;

	/* EPT fault on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (FALSE);

	/* EPT fault must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read | write) == 0)
		return (FALSE);

	/*
	 * The EPT violation must have been caused by accessing a
	 * guest-physical address that is a translation of a guest-linear
	 * address.
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (FALSE);
	}

	return (TRUE);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
	uint64_t qual, gpa;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	reason = vmexit->u.vmx.exit_reason;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * VM exits that could be triggered during event injection on the
	 * previous VM entry need to be handled specially by re-injecting
	 * the event.
	 *
	 * See "Information for VM Exits During Event Delivery" in Intel SDM
	 * for details.
	 */
	switch (reason) {
	case EXIT_REASON_EPT_FAULT:
	case EXIT_REASON_EPT_MISCONFIG:
	case EXIT_REASON_APIC:
	case EXIT_REASON_TASK_SWITCH:
	case EXIT_REASON_EXCEPTION:
		idtvec_info = vmcs_idt_vectoring_info();
		if (idtvec_info & VMCS_IDT_VEC_VALID) {
			idtvec_info &= ~(1 << 12); /* clear undefined bit */
			vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
				idtvec_err = vmcs_idt_vectoring_err();
				vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
			}
			vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
		}
	default:
		break;
	}

	switch (reason) {
	case EXIT_REASON_CR_ACCESS:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
		ecx = vmxctx->guest_rcx;
		error = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		} else
			handled = 1;
		break;
	case EXIT_REASON_WRMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
		    (uint64_t)edx << 32 | eax);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		} else
			handled = 1;
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
		vmx_clear_int_window_exiting(vmx, vcpu);
		VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		return (1);
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
	case EXIT_REASON_INOUT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
		/*
		 * If 'gpa' lies within the address space allocated to
		 * memory then this must be a nested page fault otherwise
		 * this must be an instruction that accesses MMIO space.
		 */
		gpa = vmcs_gpa();
		if (vm_mem_allocated(vmx->vm, gpa)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = gpa;
			vmexit->u.paging.fault_type = ept_fault_type(qual);
			vmexit->u.paging.protection = ept_protection(qual);
		} else if (ept_emulation_fault(qual)) {
			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
			vmexit->u.inst_emul.gpa = gpa;
			vmexit->u.inst_emul.gla = vmcs_gla();
			vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
		}
		break;
	default:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static int
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	KASSERT(vmxctx->pmap == pmap,
	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
	KASSERT(vmxctx->eptp == vmx->eptp,
	    ("eptp %#lx different than ctx eptp %#lx", vmx->eptp,
	    vmxctx->eptp));

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may set up the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;		/* vm exit */
		case VMX_RETURN_AST:
			astpending = 1;
			break;
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
				    vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
			    vmxctx->launch_error, vie);
#endif
			goto err_exit;
		case VMX_RETURN_INVEPT:
			panic("vm %s:%d invept error %d",
			    vm_name(vmx->vm), vcpu, vmxctx->launch_error);
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		if (astpending) {
			handled = 1;
			vmexit->inst_length = 0;
			vmexit->exitcode = VM_EXITCODE_BOGUS;
			vmx_astpending_trace(vmx, vcpu, rip);
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
			break;
		}

		handled = vmx_exit_process(vmx, vcpu, vmexit);
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		    handled, vmexit->exitcode);
	}

	if (!handled)
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);

	VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int i, error;
	struct vmx *vmx = arg;

	for (i = 0; i < VM_MAXCPU; i++)
		vpid_free(vmx->state[i].vpid);

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_shadow_reg(int reg)
{
	int shreg;

	shreg = -1;

	switch (reg) {
	case VM_REG_GUEST_CR0:
		shreg = VMCS_CR0_SHADOW;
		break;
	case VM_REG_GUEST_CR4:
		shreg = VMCS_CR4_SHADOW;
		break;
	default:
		break;
	}

	return (shreg);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	int running, hostcpu;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error, hostcpu, running, shadow;
	uint64_t ctls;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to "IA-32e mode guest"
		 * bit in the VM-entry control.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu], running,
			    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu], running,
			    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}

		shadow = vmx_shadow_reg(reg);
		if (shadow > 0) {
			/*
			 * Store the unmodified value in the shadow
			 */
			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
			    VMCS_IDENT(shadow), val);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
    int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs, 0,
		    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
		    code);
	}
	return (error);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	case VM_CAP_ENABLE_INVPCID:
		if (cap_invpcid)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ? 1 : 0;

	return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls2;
			baseval = *pptr;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_ENABLE_INVPCID:
		if (cap_invpcid) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls2;
			baseval = *pptr;
			flag = PROCBASED2_ENABLE_INVPCID;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_getcap,
	vmx_setcap,
	ept_vmspace_alloc,
	ept_vmspace_free,
};