/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	 PINBASED_NMI_EXITING		|	\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	 PROCBASED_IO_EXITING		|	\
	 PROCBASED_MSR_BITMAPS		|	\
	 PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING	|	\
	 PROCBASED_CR3_STORE_EXITING	|	\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT		\
	(VM_EXIT_HOST_LMA		|	\
	 VM_EXIT_SAVE_EFER		|	\
	 VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_EXIT_SAVE_PAT		|	\
	 VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	 VM_ENTRY_INTO_SMM		|	\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
	     &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
	     &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
	     &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
	     &cr4_zeros_mask, 0, NULL);

static int vmx_no_patmsr;

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
	   &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING   |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
	    &vpid_alloc_failed, 0, NULL);

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE:
		return "mce";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC:
		return "apic";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
	switch (rc) {
	case VMX_RETURN_DIRECT:
		return "direct";
	case VMX_RETURN_LONGJMP:
		return "longjmp";
	case VMX_RETURN_VMRESUME:
		return "vmresume";
	case VMX_RETURN_VMLAUNCH:
		return "vmlaunch";
	case VMX_RETURN_AST:
		return "ast";
	default:
		return "unknown";
	}
}

#define	SETJMP_TRACE(vmx, vcpu, vmxctx, regname)			   \
	VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
		 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	uint64_t host_rip, host_rsp;

	if (vmxctx != &vmx->ctx[vcpu])
		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
			vmxctx, &vmx->ctx[vcpu]);

	VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
	VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
		 vmx_setjmp_rc2str(rc), rc);

	host_rsp = host_rip = ~0;
	vmread(VMCS_HOST_RIP, &host_rip);
	vmread(VMCS_HOST_RSP, &host_rsp);
	VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
		 host_rip, host_rsp);

	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures. This prevents potential retention of
		 * cached information in the TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
				       MSR_VMX_TRUE_EXIT_CTLS,
				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
				       VM_EXIT_CTLS_ZERO_SETTING,
				       &exit_ctls);
		if (error) {
			printf("vmx_init: processor does not support desired "
			       "exit controls\n");
			return (error);
		} else {
			if (bootverbose)
				printf("vmm: PAT MSR access not supported\n");
			guest_msr_valid(MSR_PAT);
			vmx_no_patmsr = 1;
		}
	}

	/* Check support for VM-entry controls */
	if (!vmx_no_patmsr) {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	} else {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	}

	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_PROCBASED_CTLS,
					PROCBASED_MTF, 0,
					&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_PAUSE_EXITING, 0,
					&tmp) == 0);

	cap_unrestricted_guest =
		(vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
				MSR_VMX_PROCBASED_CTLS2,
				PROCBASED2_UNRESTRICTED_GUEST, 0,
				&tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	vpid_init();

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid[VM_MAXCPU];
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	vpid_alloc(vpid, VM_MAXCPU);

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid[i]);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid[i];

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
				    (uint32_t*)(&vmxctx->guest_rax),
				    (uint32_t*)(&vmxctx->guest_rbx),
				    (uint32_t*)(&vmxctx->guest_rcx),
				    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
"handled" : "unhandled", 894 exit_reason_to_str(exit_reason), rip); 895 #endif 896 } 897 898 static __inline void 899 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 900 { 901 #ifdef KTR 902 VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 903 #endif 904 } 905 906 static int 907 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 908 { 909 int error, lastcpu; 910 struct vmxstate *vmxstate; 911 struct invvpid_desc invvpid_desc = { 0 }; 912 913 vmxstate = &vmx->state[vcpu]; 914 lastcpu = vmxstate->lastcpu; 915 vmxstate->lastcpu = curcpu; 916 917 if (lastcpu == curcpu) { 918 error = 0; 919 goto done; 920 } 921 922 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 923 924 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 925 if (error != 0) 926 goto done; 927 928 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 929 if (error != 0) 930 goto done; 931 932 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 933 if (error != 0) 934 goto done; 935 936 /* 937 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 938 * 939 * We do this because this vcpu was executing on a different host 940 * cpu when it last ran. We do not track whether it invalidated 941 * mappings associated with its 'vpid' during that run. So we must 942 * assume that the mappings associated with 'vpid' on 'curcpu' are 943 * stale and invalidate them. 944 * 945 * Note that we incur this penalty only when the scheduler chooses to 946 * move the thread associated with this vcpu between host cpus. 947 * 948 * Note also that this will invalidate mappings tagged with 'vpid' 949 * for "all" EP4TAs. 950 */ 951 if (vmxstate->vpid != 0) { 952 invvpid_desc.vpid = vmxstate->vpid; 953 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 954 } 955 done: 956 return (error); 957 } 958 959 static void 960 vm_exit_update_rip(struct vm_exit *vmexit) 961 { 962 int error; 963 964 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 965 if (error) 966 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 967 } 968 969 /* 970 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (!vm_nmi_pending(vmx->vm, vcpu))
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr, vmcs_shadow_cr;
	uint64_t crval, regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
		vmcs_shadow_cr = VMCS_CR0_SHADOW;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
		vmcs_shadow_cr = VMCS_CR4_SHADOW;
	}

	error = vmwrite(vmcs_shadow_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d shadow",
		      error, cr);
	}

	crval = regval | ones_mask;
	crval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, crval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		      error, cr);
	}

	if (cr == 0 && regval & CR0_PG) {
		uint64_t efer, entry_ctls;

		/*
		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
		 * the "IA-32e mode guest" bit in VM-entry control must be
		 * equal.
		 */
		error = vmread(VMCS_GUEST_IA32_EFER, &efer);
		if (error) {
			panic("vmx_emulate_cr_access: error %d efer read",
			      error);
		}
		if (efer & EFER_LME) {
			efer |= EFER_LMA;
			error = vmwrite(VMCS_GUEST_IA32_EFER, efer);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " efer write", error);
			}
			error = vmread(VMCS_ENTRY_CTLS, &entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " entry ctls read", error);
			}
			entry_ctls |= VM_ENTRY_GUEST_LMA;
			error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " entry ctls write", error);
			}
		}
	}

	return (HANDLED);
}

static int
vmx_ept_fault(struct vm *vm, int cpu,
	      uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
	      uint64_t cr3, uint64_t ept_qual, struct vie *vie)
{
	int read, write, error;

	/* EPT violation on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (UNHANDLED);

	/* EPT violation must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read | write) == 0)
		return (UNHANDLED);

	/*
	 * The EPT violation must have been caused by accessing a
	 * guest-physical address that is a translation of a guest-linear
	 * address.
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (UNHANDLED);
	}

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
		return (UNHANDLED);

	if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
		return (UNHANDLED);

	/*
	 * Check if this is a local apic access
	 */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
		return (UNHANDLED);

	error = vmm_emulate_instruction(vm, cpu, gpa, vie,
					lapic_mmio_read, lapic_mmio_write, 0);

	return (error ? UNHANDLED : HANDLED);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual, gla, gpa, cr3, intr_info;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
		ecx = vmxctx->guest_rcx;
		error = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		} else
			handled = 1;
		break;
	case EXIT_REASON_WRMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
					(uint64_t)edx << 32 | eax);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		} else
			handled = 1;
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		/*
		 * If there is an event waiting to be injected then there is
		 * no need to 'hlt'.
		 */
		error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
		if (error)
			panic("vmx_exit_process: vmread(intrinfo) %d", error);

		if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
			handled = 1;
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
		} else
			vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		return (1);
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
	case EXIT_REASON_INOUT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
		gla = vmcs_gla();
		gpa = vmcs_gpa();
		cr3 = vmcs_guest_cr3();
		handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
					vmexit->rip, vmexit->inst_length,
					cr3, qual, &vmexit->u.paging.vie);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = gpa;
		}
		break;
	default:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;

		/*
		 * Special case for spinning up an AP - exit to userspace to
		 * give the controlling process a chance to intercept and
		 * spin up a thread for the AP.
		 */
		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
			handled = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static int
vmx_run(void *arg, int vcpu, register_t rip)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may setup the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;			/* vm exit */
		case VMX_RETURN_AST:
			astpending = 1;
			break;
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
					vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
				vmxctx->launch_error, vie);
#endif
			goto err_exit;
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		if (astpending) {
			handled = 1;
			vmexit->inst_length = 0;
			vmexit->exitcode = VM_EXITCODE_BOGUS;
			vmx_astpending_trace(vmx, vcpu, rip);
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
			break;
		}

		handled = vmx_exit_process(vmx, vcpu, vmexit);
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	if (!handled)
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);

	VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int i, error;
	struct vmx *vmx = arg;

	for (i = 0; i < VM_MAXCPU; i++)
		vpid_free(vmx->state[i].vpid);

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_shadow_reg(int reg)
{
	int shreg;

	shreg = -1;

	switch (reg) {
	case VM_REG_GUEST_CR0:
		shreg = VMCS_CR0_SHADOW;
		break;
	case VM_REG_GUEST_CR4:
		shreg = VMCS_CR4_SHADOW;
		break;
	default:
		break;
	}

	return (shreg);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	int running, hostcpu;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error, hostcpu, running, shadow;
	uint64_t ctls;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to "IA-32e mode guest"
		 * bit in the VM-entry control.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}

		shadow = vmx_shadow_reg(reg);
		if (shadow > 0) {
			/*
			 * Store the unmodified value in the shadow
			 */
			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
				VMCS_IDENT(shadow), val);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs, 0,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ?
		    1 : 0;

	return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap_set,
	ept_vmmmap_get,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_getcap,
	vmx_setcap
};