/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

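/*
 * For each group of VMX controls below, bits in a *_ONE_SETTING macro are
 * controls that this code wants set to 1 and bits in a *_ZERO_SETTING macro
 * are controls that must remain 0. Each pair is handed to vmx_set_ctlreg()
 * in vmx_init(), which is expected to validate the requested settings
 * against the corresponding VMX capability MSR and compute the value that
 * is eventually programmed into the VMCS.
 */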
#define PINBASED_CTLS_ONE_SETTING \
	(PINBASED_EXTINT_EXITING | \
	 PINBASED_NMI_EXITING | \
	 PINBASED_VIRTUAL_NMI)
#define PINBASED_CTLS_ZERO_SETTING	0

#define PROCBASED_CTLS_WINDOW_SETTING \
	(PROCBASED_INT_WINDOW_EXITING | \
	 PROCBASED_NMI_WINDOW_EXITING)

#define PROCBASED_CTLS_ONE_SETTING \
	(PROCBASED_SECONDARY_CONTROLS | \
	 PROCBASED_IO_EXITING | \
	 PROCBASED_MSR_BITMAPS | \
	 PROCBASED_CTLS_WINDOW_SETTING)
#define PROCBASED_CTLS_ZERO_SETTING \
	(PROCBASED_CR3_LOAD_EXITING | \
	 PROCBASED_CR3_STORE_EXITING | \
	 PROCBASED_IO_BITMAPS)

#define PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING	0

#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
	(VM_EXIT_HOST_LMA | \
	 VM_EXIT_SAVE_EFER | \
	 VM_EXIT_LOAD_EFER)

#define VM_EXIT_CTLS_ONE_SETTING \
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
	 VM_EXIT_SAVE_PAT | \
	 VM_EXIT_LOAD_PAT)
#define VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define VM_ENTRY_CTLS_ONE_SETTING \
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
	 VM_ENTRY_LOAD_PAT)
#define VM_ENTRY_CTLS_ZERO_SETTING \
	(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
	 VM_ENTRY_INTO_SMM | \
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

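/*
 * guest_msr_rw() presumably gives the guest direct read/write access to
 * 'msr' by clearing the read and write intercept bits for that MSR in the
 * per-VM MSR bitmap (PROCBASED_MSR_BITMAPS is always enabled above). Any
 * MSR that is not passed through this way continues to trigger RDMSR/WRMSR
 * exits, which are handled by emulate_rdmsr()/emulate_wrmsr() in
 * vmx_exit_process().
 */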
#define guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define HANDLED		1
#define UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
static uint64_t cr4_ones_mask, cr4_zeros_mask;

static volatile u_int nextvpid;

static int vmx_no_patmsr;

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE:
		return "mce";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC:
		return "apic";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
	switch (rc) {
	case VMX_RETURN_DIRECT:
		return "direct";
	case VMX_RETURN_LONGJMP:
		return "longjmp";
	case VMX_RETURN_VMRESUME:
		return "vmresume";
	case VMX_RETURN_VMLAUNCH:
		return "vmlaunch";
	case VMX_RETURN_AST:
		return "ast";
	default:
		return "unknown";
	}
}

#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
	VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
		 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	uint64_t host_rip, host_rsp;

	if (vmxctx != &vmx->ctx[vcpu])
		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
			vmxctx, &vmx->ctx[vcpu]);

	VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
	VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
		 vmx_setjmp_rc2str(rc), rc);

	host_rsp = host_rip = ~0;
	vmread(VMCS_HOST_RIP, &host_rip);
	vmread(VMCS_HOST_RSP, &host_rsp);
	VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
		 host_rip, host_rsp);

	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

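/*
 * cr0_ones_mask/cr0_zeros_mask (and the cr4 equivalents) are computed in
 * vmx_init() from the MSR_VMX_CR0_FIXED0/FIXED1 MSRs: a bit that is 1 in
 * both MSRs must be 1 while in VMX operation, and a bit that is 0 in both
 * must be 0. For example, vmx_fix_cr0(0) evaluates to cr0_ones_mask, the
 * minimal CR0 value a guest is allowed to run with.
 */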
u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so do it explicitly here. This prevents
		 * potential retention of cached information in the TLB
		 * between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & 0x5) != 0x5) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
				       MSR_VMX_TRUE_EXIT_CTLS,
				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
				       VM_EXIT_CTLS_ZERO_SETTING,
				       &exit_ctls);
		if (error) {
			printf("vmx_init: processor does not support desired "
			       "exit controls\n");
			return (error);
		} else {
			if (bootverbose)
				printf("vmm: PAT MSR access not supported\n");
			guest_msr_valid(MSR_PAT);
			vmx_no_patmsr = 1;
		}
	}

	/* Check support for VM-entry controls */
	if (!vmx_no_patmsr) {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	} else {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	}

	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_PROCBASED_CTLS,
					PROCBASED_MTF, 0,
					&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_PAUSE_EXITING, 0,
					&tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
					MSR_VMX_PROCBASED_CTLS2,
					PROCBASED2_UNRESTRICTED_GUEST, 0,
					&tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is alright
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
	uint16_t vpid = 0;

	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
		do {
			vpid = atomic_fetchadd_int(&nextvpid, 1);
		} while (vpid == 0);
	}

	return (vpid);
}

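/*
 * Program the CR0/CR4 guest/host masks and read shadows. Bits covered by
 * the mask (here: every bit that vmx_init() decided is fixed to 0 or 1) are
 * owned by the hypervisor; a guest read of such a bit returns the value
 * from the read shadow, and a guest attempt to change it causes a CR-access
 * VM exit that is handled by vmx_emulate_cr_access().
 */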
static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value, shadow_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
		shadow_value = cr0_ones_mask;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
		shadow_value = cr4_ones_mask;
	}

	error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs)	vmx_setup_cr_shadow(0, (vmcs))
#define	vmx_setup_cr4_shadow(vmcs)	vmx_setup_cr_shadow(4, (vmcs))

static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid;
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		vpid = vmx_vpid();

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid;

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
				    (uint32_t*)(&vmxctx->guest_rax),
				    (uint32_t*)(&vmxctx->guest_rbx),
				    (uint32_t*)(&vmxctx->guest_rcx),
				    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		 handled ? "handled" : "unhandled",
		 exit_reason_to_str(exit_reason), rip);
#endif
}

static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}

static int
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
{
	int error, lastcpu;
	struct vmxstate *vmxstate;
	struct invvpid_desc invvpid_desc = { 0 };

	vmxstate = &vmx->state[vcpu];
	lastcpu = vmxstate->lastcpu;
	vmxstate->lastcpu = curcpu;

	if (lastcpu == curcpu) {
		error = 0;
		goto done;
	}

	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);

	error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
	if (error != 0)
		goto done;

	error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
	if (error != 0)
		goto done;

	/*
	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
	 *
	 * We do this because this vcpu was executing on a different host
	 * cpu when it last ran. We do not track whether it invalidated
	 * mappings associated with its 'vpid' during that run. So we must
	 * assume that the mappings associated with 'vpid' on 'curcpu' are
	 * stale and invalidate them.
	 *
	 * Note that we incur this penalty only when the scheduler chooses to
	 * move the thread associated with this vcpu between host cpus.
	 *
	 * Note also that this will invalidate mappings tagged with 'vpid'
	 * for "all" EP4TAs.
	 */
	if (vmxstate->vpid != 0) {
		invvpid_desc.vpid = vmxstate->vpid;
		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
	}
done:
	return (error);
}

static void
vm_exit_update_rip(struct vm_exit *vmexit)
{
	int error;

	error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
	if (error)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
}

/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

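/*
 * Both NMI and hardware interrupt injection below go through the VM-entry
 * interruption-information field: bits 7:0 hold the vector, bits 10:8 the
 * interruption type (external interrupt, NMI, ...), bit 11 indicates that
 * an error code should be delivered, and bit 31 marks the field valid.
 * The VMCS_INTERRUPTION_INFO_* constants are assumed to encode these bits.
 */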
static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (!vm_nmi_pending(vmx->vm, vcpu))
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
			error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
			error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

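/*
 * Decode a control-register access exit. Per the exit qualification layout
 * for CR accesses, bits 3:0 identify the control register, bits 5:4 the
 * access type (0 == MOV to CR, the only type handled here) and bits 11:8
 * the source general purpose register, encoded 0-15 as %rax ... %r15.
 */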
static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr;
	uint64_t regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
	}
	regval |= ones_mask;
	regval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		      error, cr);
	}

	return (HANDLED);
}

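/*
 * Handle an EPT violation by emulating the faulting instruction. Only data
 * reads and writes to the local APIC page are emulated here (via the
 * lapic_mmio_read/lapic_mmio_write callbacks); anything else is returned as
 * UNHANDLED and surfaces to userland as a VM_EXITCODE_PAGING exit.
 */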
static int
vmx_ept_fault(struct vm *vm, int cpu,
	      uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
	      uint64_t cr3, uint64_t ept_qual, struct vie *vie)
{
	int read, write, error;

	/* EPT violation on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (UNHANDLED);

	/* EPT violation must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read | write) == 0)
		return (UNHANDLED);

	/*
	 * The EPT violation must have been caused by accessing a
	 * guest-physical address that is a translation of a guest-linear
	 * address.
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (UNHANDLED);
	}

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
		return (UNHANDLED);

	if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
		return (UNHANDLED);

	/*
	 * Check if this is a local apic access
	 */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
		return (UNHANDLED);

	error = vmm_emulate_instruction(vm, cpu, gpa, vie,
					lapic_mmio_read, lapic_mmio_write, 0);

	return (error ? UNHANDLED : HANDLED);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual, gla, gpa, cr3, intr_info;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		ecx = vmxctx->guest_rcx;
		error = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		} else
			handled = 1;
		break;
	case EXIT_REASON_WRMSR:
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
				      (uint64_t)edx << 32 | eax);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		} else
			handled = 1;
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		/*
		 * If there is an event waiting to be injected then there is
		 * no need to 'hlt'.
		 */
		error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
		if (error)
			panic("vmx_exit_process: vmread(intrinfo) %d", error);

		if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
			handled = 1;
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
		} else
			vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		/* FALLTHRU */
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
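	/*
	 * For an I/O instruction exit the exit qualification encodes the
	 * access: bits 2:0 are the access size minus one, bit 3 is the
	 * direction (1 == in), bit 4 flags a string instruction, bit 5 a
	 * REP prefix and bits 31:16 hold the port number.
	 */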
	case EXIT_REASON_INOUT:
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		gla = vmcs_gla();
		gpa = vmcs_gpa();
		cr3 = vmcs_guest_cr3();
		handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
					vmexit->rip, vmexit->inst_length,
					cr3, qual, &vmexit->u.paging.vie);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = gpa;
		}
		break;
	default:
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;

		/*
		 * Special case for spinning up an AP - exit to userspace to
		 * give the controlling process a chance to intercept and
		 * spin up a thread for the AP.
		 */
		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
			handled = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

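/*
 * Run the vcpu until it exits to userland. The general flow, as far as can
 * be inferred from this file, is: vmx_setjmp() records the host context in
 * 'vmxctx' and returns VMX_RETURN_DIRECT on the first pass, after which the
 * guest is entered with vmx_launch() or vmx_resume(). A VM exit returns
 * through vmx_longjmp() as VMX_RETURN_LONGJMP, a pending AST aborts the
 * entry with VMX_RETURN_AST, and a failed VMLAUNCH/VMRESUME is reported as
 * VMX_RETURN_VMLAUNCH/VMX_RETURN_VMRESUME. Each exit is then dispatched to
 * vmx_exit_process().
 */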
static int
vmx_run(void *arg, int vcpu, register_t rip)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may setup the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmcs_set_defaults().
	 */
	if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
		panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);

	if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
		panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);

	if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
		panic("vmx_run: error %d setting up pcpu defaults", error);

	do {
		lapic_timer_tick(vmx->vm, vcpu);
		vmx_inject_interrupts(vmx, vcpu);
		vmx_run_trace(vmx, vcpu);
		rc = vmx_setjmp(vmxctx);
#ifdef SETJMP_TRACE
		vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
#endif
		switch (rc) {
		case VMX_RETURN_DIRECT:
			if (vmxctx->launched == 0) {
				vmxctx->launched = 1;
				vmx_launch(vmxctx);
			} else
				vmx_resume(vmxctx);
			panic("vmx_launch/resume should not return");
			break;
		case VMX_RETURN_LONGJMP:
			break;			/* vm exit */
		case VMX_RETURN_AST:
			astpending = 1;
			break;
		case VMX_RETURN_VMRESUME:
			vie = vmcs_instruction_error();
			if (vmxctx->launch_error == VM_FAIL_INVALID ||
			    vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
				printf("vmresume error %d vmcs inst error %d\n",
					vmxctx->launch_error, vie);
				goto err_exit;
			}
			vmx_launch(vmxctx);	/* try to launch the guest */
			panic("vmx_launch should not return");
			break;
		case VMX_RETURN_VMLAUNCH:
			vie = vmcs_instruction_error();
#if 1
			printf("vmlaunch error %d vmcs inst error %d\n",
				vmxctx->launch_error, vie);
#endif
			goto err_exit;
		default:
			panic("vmx_setjmp returned %d", rc);
		}

		/* enable interrupts */
		enable_intr();

		/* collect some basic information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		if (astpending) {
			handled = 1;
			vmexit->inst_length = 0;
			vmexit->exitcode = VM_EXITCODE_BOGUS;
			vmx_astpending_trace(vmx, vcpu, rip);
			break;
		}

		handled = vmx_exit_process(vmx, vcpu, vmexit);
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);

	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS.
	 * If a VM exit is not handled then the exitcode must not be BOGUS.
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);

	/*
	 * XXX
	 * We need to do this to ensure that any VMCS state cached by the
	 * processor is flushed to memory. We need to do this in case the
	 * VM moves to a different cpu the next time it runs.
	 *
	 * Can we avoid doing this?
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int error;
	struct vmx *vmx = arg;

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	struct vmx *vmx = arg;

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error;
	uint64_t ctls;
	struct vmx *vmx = arg;

	/*
	 * XXX Allow caller to set contents of the guest registers saved in
	 * the 'vmxctx' even though the vcpu might be running. We need this
	 * specifically to support the rdmsr emulation that will set the
	 * %eax and %edx registers during vm exit processing.
	 */
	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	/*
	 * If the vcpu is running then don't mess with the VMCS.
	 *
	 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
	 * the subsequent vmlaunch/vmresume to fail.
	 */
	if (vcpu_is_running(vmx->vm, vcpu))
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to the "IA-32e mode
		 * guest" bit in the VM-entry control.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu],
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

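/*
 * Inject an event into the vcpu. The type_map table below translates the
 * VM_EVENT_* values into the 3-bit interruption type that is placed in
 * bits 10:8 of the VM-entry interruption-information field (0x1, a reserved
 * type, is used for VM_EVENT_NONE).
 */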
static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ? 1 : 0;

	return (ret);
}

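/*
 * Toggle an optional capability by updating the relevant VM-execution
 * control. The VMCS must be made current with VMPTRLD before it can be
 * written with vmwrite(), and the VMCLEAR afterwards ensures it is not
 * left active on this cpu; like vmcs_setreg(), this is presumably only
 * done while the vcpu is not running.
 */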
static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap_set,
	ept_vmmmap_get,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_getcap,
	vmx_setcap
};