1 /*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/smp.h> 35 #include <sys/kernel.h> 36 #include <sys/malloc.h> 37 #include <sys/pcpu.h> 38 #include <sys/proc.h> 39 40 #include <vm/vm.h> 41 #include <vm/pmap.h> 42 43 #include <machine/psl.h> 44 #include <machine/cpufunc.h> 45 #include <machine/md_var.h> 46 #include <machine/pmap.h> 47 #include <machine/segments.h> 48 #include <machine/specialreg.h> 49 #include <machine/vmparam.h> 50 51 #include <x86/apicreg.h> 52 53 #include <machine/vmm.h> 54 #include "vmm_host.h" 55 #include "vmm_lapic.h" 56 #include "vmm_msr.h" 57 #include "vmm_ktr.h" 58 #include "vmm_stat.h" 59 60 #include "vmx_msr.h" 61 #include "ept.h" 62 #include "vmx_cpufunc.h" 63 #include "vmx.h" 64 #include "x86.h" 65 #include "vmx_controls.h" 66 67 #define PINBASED_CTLS_ONE_SETTING \ 68 (PINBASED_EXTINT_EXITING | \ 69 PINBASED_NMI_EXITING | \ 70 PINBASED_VIRTUAL_NMI) 71 #define PINBASED_CTLS_ZERO_SETTING 0 72 73 #define PROCBASED_CTLS_WINDOW_SETTING \ 74 (PROCBASED_INT_WINDOW_EXITING | \ 75 PROCBASED_NMI_WINDOW_EXITING) 76 77 #define PROCBASED_CTLS_ONE_SETTING \ 78 (PROCBASED_SECONDARY_CONTROLS | \ 79 PROCBASED_IO_EXITING | \ 80 PROCBASED_MSR_BITMAPS | \ 81 PROCBASED_CTLS_WINDOW_SETTING) 82 #define PROCBASED_CTLS_ZERO_SETTING \ 83 (PROCBASED_CR3_LOAD_EXITING | \ 84 PROCBASED_CR3_STORE_EXITING | \ 85 PROCBASED_IO_BITMAPS) 86 87 #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 88 #define PROCBASED_CTLS2_ZERO_SETTING 0 89 90 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 91 (VM_EXIT_HOST_LMA | \ 92 VM_EXIT_SAVE_EFER | \ 93 VM_EXIT_LOAD_EFER) 94 95 #define VM_EXIT_CTLS_ONE_SETTING \ 96 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 97 VM_EXIT_SAVE_PAT | \ 98 VM_EXIT_LOAD_PAT) 99 #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 100 101 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 102 103 #define VM_ENTRY_CTLS_ONE_SETTING \ 104 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 105 VM_ENTRY_LOAD_PAT) 106 #define VM_ENTRY_CTLS_ZERO_SETTING \ 107 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 108 VM_ENTRY_INTO_SMM | \ 109 
VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 110 111 #define guest_msr_rw(vmx, msr) \ 112 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 113 114 #define HANDLED 1 115 #define UNHANDLED 0 116 117 MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 118 119 int vmxon_enabled[MAXCPU]; 120 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 121 122 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 123 static uint32_t exit_ctls, entry_ctls; 124 125 static uint64_t cr0_ones_mask, cr0_zeros_mask; 126 static uint64_t cr4_ones_mask, cr4_zeros_mask; 127 128 static volatile u_int nextvpid; 129 130 static int vmx_no_patmsr; 131 132 /* 133 * Virtual NMI blocking conditions. 134 * 135 * Some processor implementations also require NMI to be blocked if 136 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 137 * based on the (exit_reason,exit_qual) tuple being set to 138 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 139 * 140 * We take the easy way out and also include STI_BLOCKING as one of the 141 * gating items for vNMI injection. 142 */ 143 static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 144 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 145 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 146 147 /* 148 * Optional capabilities 149 */ 150 static int cap_halt_exit; 151 static int cap_pause_exit; 152 static int cap_unrestricted_guest; 153 static int cap_monitor_trap; 154 155 /* statistics */ 156 static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); 157 static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); 158 static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored"); 159 static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted"); 160 161 #ifdef KTR 162 static const char * 163 exit_reason_to_str(int reason) 164 { 165 static char reasonbuf[32]; 166 167 switch (reason) { 168 case EXIT_REASON_EXCEPTION: 169 return "exception"; 170 case EXIT_REASON_EXT_INTR: 171 return "extint"; 172 case EXIT_REASON_TRIPLE_FAULT: 173 return "triplefault"; 174 case EXIT_REASON_INIT: 175 return "init"; 176 case EXIT_REASON_SIPI: 177 return "sipi"; 178 case EXIT_REASON_IO_SMI: 179 return "iosmi"; 180 case EXIT_REASON_SMI: 181 return "smi"; 182 case EXIT_REASON_INTR_WINDOW: 183 return "intrwindow"; 184 case EXIT_REASON_NMI_WINDOW: 185 return "nmiwindow"; 186 case EXIT_REASON_TASK_SWITCH: 187 return "taskswitch"; 188 case EXIT_REASON_CPUID: 189 return "cpuid"; 190 case EXIT_REASON_GETSEC: 191 return "getsec"; 192 case EXIT_REASON_HLT: 193 return "hlt"; 194 case EXIT_REASON_INVD: 195 return "invd"; 196 case EXIT_REASON_INVLPG: 197 return "invlpg"; 198 case EXIT_REASON_RDPMC: 199 return "rdpmc"; 200 case EXIT_REASON_RDTSC: 201 return "rdtsc"; 202 case EXIT_REASON_RSM: 203 return "rsm"; 204 case EXIT_REASON_VMCALL: 205 return "vmcall"; 206 case EXIT_REASON_VMCLEAR: 207 return "vmclear"; 208 case EXIT_REASON_VMLAUNCH: 209 return "vmlaunch"; 210 case EXIT_REASON_VMPTRLD: 211 return "vmptrld"; 212 case EXIT_REASON_VMPTRST: 213 return "vmptrst"; 214 case EXIT_REASON_VMREAD: 215 return "vmread"; 216 case EXIT_REASON_VMRESUME: 217 return "vmresume"; 218 case EXIT_REASON_VMWRITE: 219 return "vmwrite"; 220 case EXIT_REASON_VMXOFF: 221 return "vmxoff"; 222 case EXIT_REASON_VMXON: 223 return "vmxon"; 224 case EXIT_REASON_CR_ACCESS: 225 return "craccess"; 226 case EXIT_REASON_DR_ACCESS: 227 return "draccess"; 228 case EXIT_REASON_INOUT: 229 return "inout"; 230 case EXIT_REASON_RDMSR: 231 return 
"rdmsr"; 232 case EXIT_REASON_WRMSR: 233 return "wrmsr"; 234 case EXIT_REASON_INVAL_VMCS: 235 return "invalvmcs"; 236 case EXIT_REASON_INVAL_MSR: 237 return "invalmsr"; 238 case EXIT_REASON_MWAIT: 239 return "mwait"; 240 case EXIT_REASON_MTF: 241 return "mtf"; 242 case EXIT_REASON_MONITOR: 243 return "monitor"; 244 case EXIT_REASON_PAUSE: 245 return "pause"; 246 case EXIT_REASON_MCE: 247 return "mce"; 248 case EXIT_REASON_TPR: 249 return "tpr"; 250 case EXIT_REASON_APIC: 251 return "apic"; 252 case EXIT_REASON_GDTR_IDTR: 253 return "gdtridtr"; 254 case EXIT_REASON_LDTR_TR: 255 return "ldtrtr"; 256 case EXIT_REASON_EPT_FAULT: 257 return "eptfault"; 258 case EXIT_REASON_EPT_MISCONFIG: 259 return "eptmisconfig"; 260 case EXIT_REASON_INVEPT: 261 return "invept"; 262 case EXIT_REASON_RDTSCP: 263 return "rdtscp"; 264 case EXIT_REASON_VMX_PREEMPT: 265 return "vmxpreempt"; 266 case EXIT_REASON_INVVPID: 267 return "invvpid"; 268 case EXIT_REASON_WBINVD: 269 return "wbinvd"; 270 case EXIT_REASON_XSETBV: 271 return "xsetbv"; 272 default: 273 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 274 return (reasonbuf); 275 } 276 } 277 278 #ifdef SETJMP_TRACE 279 static const char * 280 vmx_setjmp_rc2str(int rc) 281 { 282 switch (rc) { 283 case VMX_RETURN_DIRECT: 284 return "direct"; 285 case VMX_RETURN_LONGJMP: 286 return "longjmp"; 287 case VMX_RETURN_VMRESUME: 288 return "vmresume"; 289 case VMX_RETURN_VMLAUNCH: 290 return "vmlaunch"; 291 case VMX_RETURN_AST: 292 return "ast"; 293 default: 294 return "unknown"; 295 } 296 } 297 298 #define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 299 VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 300 (vmxctx)->regname) 301 302 static void 303 vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 304 { 305 uint64_t host_rip, host_rsp; 306 307 if (vmxctx != &vmx->ctx[vcpu]) 308 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 309 vmxctx, &vmx->ctx[vcpu]); 310 311 VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 312 VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 313 vmx_setjmp_rc2str(rc), rc); 314 315 host_rsp = host_rip = ~0; 316 vmread(VMCS_HOST_RIP, &host_rip); 317 vmread(VMCS_HOST_RSP, &host_rsp); 318 VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", 319 host_rip, host_rsp); 320 321 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 322 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 323 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 324 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 325 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 326 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 327 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 328 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 329 330 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 331 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 332 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 333 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 334 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 341 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 342 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 346 } 347 #endif 348 #else 349 static void __inline 350 
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 351 { 352 return; 353 } 354 #endif /* KTR */ 355 356 u_long 357 vmx_fix_cr0(u_long cr0) 358 { 359 360 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 361 } 362 363 u_long 364 vmx_fix_cr4(u_long cr4) 365 { 366 367 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 368 } 369 370 static void 371 msr_save_area_init(struct msr_entry *g_area, int *g_count) 372 { 373 int cnt; 374 375 static struct msr_entry guest_msrs[] = { 376 { MSR_KGSBASE, 0, 0 }, 377 }; 378 379 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 380 if (cnt > GUEST_MSR_MAX_ENTRIES) 381 panic("guest msr save area overrun"); 382 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 383 *g_count = cnt; 384 } 385 386 static void 387 vmx_disable(void *arg __unused) 388 { 389 struct invvpid_desc invvpid_desc = { 0 }; 390 struct invept_desc invept_desc = { 0 }; 391 392 if (vmxon_enabled[curcpu]) { 393 /* 394 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 395 * 396 * VMXON or VMXOFF are not required to invalidate any TLB 397 * caching structures. This prevents potential retention of 398 * cached information in the TLB between distinct VMX episodes. 399 */ 400 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 401 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 402 vmxoff(); 403 } 404 load_cr4(rcr4() & ~CR4_VMXE); 405 } 406 407 static int 408 vmx_cleanup(void) 409 { 410 411 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 412 413 return (0); 414 } 415 416 static void 417 vmx_enable(void *arg __unused) 418 { 419 int error; 420 421 load_cr4(rcr4() | CR4_VMXE); 422 423 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 424 error = vmxon(vmxon_region[curcpu]); 425 if (error == 0) 426 vmxon_enabled[curcpu] = 1; 427 } 428 429 static int 430 vmx_init(void) 431 { 432 int error; 433 uint64_t fixed0, fixed1, feature_control; 434 uint32_t tmp; 435 436 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 437 if (!(cpu_feature2 & CPUID2_VMX)) { 438 printf("vmx_init: processor does not support VMX operation\n"); 439 return (ENXIO); 440 } 441 442 /* 443 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 444 * are set (bits 0 and 2 respectively). 
445 */ 446 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 447 if ((feature_control & 0x5) != 0x5) { 448 printf("vmx_init: VMX operation disabled by BIOS\n"); 449 return (ENXIO); 450 } 451 452 /* Check support for primary processor-based VM-execution controls */ 453 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 454 MSR_VMX_TRUE_PROCBASED_CTLS, 455 PROCBASED_CTLS_ONE_SETTING, 456 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 457 if (error) { 458 printf("vmx_init: processor does not support desired primary " 459 "processor-based controls\n"); 460 return (error); 461 } 462 463 /* Clear the processor-based ctl bits that are set on demand */ 464 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 465 466 /* Check support for secondary processor-based VM-execution controls */ 467 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 468 MSR_VMX_PROCBASED_CTLS2, 469 PROCBASED_CTLS2_ONE_SETTING, 470 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 471 if (error) { 472 printf("vmx_init: processor does not support desired secondary " 473 "processor-based controls\n"); 474 return (error); 475 } 476 477 /* Check support for VPID */ 478 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 479 PROCBASED2_ENABLE_VPID, 0, &tmp); 480 if (error == 0) 481 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 482 483 /* Check support for pin-based VM-execution controls */ 484 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 485 MSR_VMX_TRUE_PINBASED_CTLS, 486 PINBASED_CTLS_ONE_SETTING, 487 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 488 if (error) { 489 printf("vmx_init: processor does not support desired " 490 "pin-based controls\n"); 491 return (error); 492 } 493 494 /* Check support for VM-exit controls */ 495 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 496 VM_EXIT_CTLS_ONE_SETTING, 497 VM_EXIT_CTLS_ZERO_SETTING, 498 &exit_ctls); 499 if (error) { 500 /* Try again without the PAT MSR bits */ 501 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 502 MSR_VMX_TRUE_EXIT_CTLS, 503 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 504 VM_EXIT_CTLS_ZERO_SETTING, 505 &exit_ctls); 506 if (error) { 507 printf("vmx_init: processor does not support desired " 508 "exit controls\n"); 509 return (error); 510 } else { 511 if (bootverbose) 512 printf("vmm: PAT MSR access not supported\n"); 513 guest_msr_valid(MSR_PAT); 514 vmx_no_patmsr = 1; 515 } 516 } 517 518 /* Check support for VM-entry controls */ 519 if (!vmx_no_patmsr) { 520 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 521 MSR_VMX_TRUE_ENTRY_CTLS, 522 VM_ENTRY_CTLS_ONE_SETTING, 523 VM_ENTRY_CTLS_ZERO_SETTING, 524 &entry_ctls); 525 } else { 526 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 527 MSR_VMX_TRUE_ENTRY_CTLS, 528 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 529 VM_ENTRY_CTLS_ZERO_SETTING, 530 &entry_ctls); 531 } 532 533 if (error) { 534 printf("vmx_init: processor does not support desired " 535 "entry controls\n"); 536 return (error); 537 } 538 539 /* 540 * Check support for optional features by testing them 541 * as individual bits 542 */ 543 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 544 MSR_VMX_TRUE_PROCBASED_CTLS, 545 PROCBASED_HLT_EXITING, 0, 546 &tmp) == 0); 547 548 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 549 MSR_VMX_PROCBASED_CTLS, 550 PROCBASED_MTF, 0, 551 &tmp) == 0); 552 553 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 554 MSR_VMX_TRUE_PROCBASED_CTLS, 555 PROCBASED_PAUSE_EXITING, 0, 556 &tmp) == 0); 557 558 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 559 MSR_VMX_PROCBASED_CTLS2, 560 
PROCBASED2_UNRESTRICTED_GUEST, 0, 561 &tmp) == 0); 562 563 /* Initialize EPT */ 564 error = ept_init(); 565 if (error) { 566 printf("vmx_init: ept initialization failed (%d)\n", error); 567 return (error); 568 } 569 570 /* 571 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 572 */ 573 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 574 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 575 cr0_ones_mask = fixed0 & fixed1; 576 cr0_zeros_mask = ~fixed0 & ~fixed1; 577 578 /* 579 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 580 * if unrestricted guest execution is allowed. 581 */ 582 if (cap_unrestricted_guest) 583 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 584 585 /* 586 * Do not allow the guest to set CR0_NW or CR0_CD. 587 */ 588 cr0_zeros_mask |= (CR0_NW | CR0_CD); 589 590 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 591 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 592 cr4_ones_mask = fixed0 & fixed1; 593 cr4_zeros_mask = ~fixed0 & ~fixed1; 594 595 /* enable VMX operation */ 596 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 597 598 return (0); 599 } 600 601 /* 602 * If this processor does not support VPIDs then simply return 0. 603 * 604 * Otherwise generate the next value of VPID to use. Any value is alright 605 * as long as it is non-zero. 606 * 607 * We always execute in VMX non-root context with EPT enabled. Thus all 608 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This 609 * in turn means that multiple VMs can share the same VPID as long as 610 * they have distinct EPT page tables. 611 * 612 * XXX 613 * We should optimize this so that it returns VPIDs that are not in 614 * use. Then we will not unnecessarily invalidate mappings in 615 * vmx_set_pcpu_defaults() just because two or more vcpus happen to 616 * use the same 'vpid'. 617 */ 618 static uint16_t 619 vmx_vpid(void) 620 { 621 uint16_t vpid = 0; 622 623 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { 624 do { 625 vpid = atomic_fetchadd_int(&nextvpid, 1); 626 } while (vpid == 0); 627 } 628 629 return (vpid); 630 } 631 632 static int 633 vmx_setup_cr_shadow(int which, struct vmcs *vmcs) 634 { 635 int error, mask_ident, shadow_ident; 636 uint64_t mask_value, shadow_value; 637 638 if (which != 0 && which != 4) 639 panic("vmx_setup_cr_shadow: unknown cr%d", which); 640 641 if (which == 0) { 642 mask_ident = VMCS_CR0_MASK; 643 mask_value = cr0_ones_mask | cr0_zeros_mask; 644 shadow_ident = VMCS_CR0_SHADOW; 645 shadow_value = cr0_ones_mask; 646 } else { 647 mask_ident = VMCS_CR4_MASK; 648 mask_value = cr4_ones_mask | cr4_zeros_mask; 649 shadow_ident = VMCS_CR4_SHADOW; 650 shadow_value = cr4_ones_mask; 651 } 652 653 error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value); 654 if (error) 655 return (error); 656 657 error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value); 658 if (error) 659 return (error); 660 661 return (0); 662 } 663 #define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs)) 664 #define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs)) 665 666 static void * 667 vmx_vminit(struct vm *vm) 668 { 669 uint16_t vpid; 670 int i, error, guest_msr_count; 671 struct vmx *vmx; 672 673 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 674 if ((uintptr_t)vmx & PAGE_MASK) { 675 panic("malloc of struct vmx not aligned on %d byte boundary", 676 PAGE_SIZE); 677 } 678 vmx->vm = vm; 679 680 /* 681 * Clean up EPTP-tagged guest physical and combined mappings 682 * 683 * VMX transitions are not required to invalidate any guest physical 684 * mappings. 
So, it may be possible for stale guest physical mappings 685 * to be present in the processor TLBs. 686 * 687 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 688 */ 689 ept_invalidate_mappings(vtophys(vmx->pml4ept)); 690 691 msr_bitmap_initialize(vmx->msr_bitmap); 692 693 /* 694 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 695 * The guest FSBASE and GSBASE are saved and restored during 696 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 697 * always restored from the vmcs host state area on vm-exit. 698 * 699 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 700 * how they are saved/restored so can be directly accessed by the 701 * guest. 702 * 703 * Guest KGSBASE is saved and restored in the guest MSR save area. 704 * Host KGSBASE is restored before returning to userland from the pcb. 705 * There will be a window of time when we are executing in the host 706 * kernel context with a value of KGSBASE from the guest. This is ok 707 * because the value of KGSBASE is inconsequential in kernel context. 708 * 709 * MSR_EFER is saved and restored in the guest VMCS area on a 710 * VM exit and entry respectively. It is also restored from the 711 * host VMCS area on a VM exit. 712 */ 713 if (guest_msr_rw(vmx, MSR_GSBASE) || 714 guest_msr_rw(vmx, MSR_FSBASE) || 715 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 716 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 717 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 718 guest_msr_rw(vmx, MSR_KGSBASE) || 719 guest_msr_rw(vmx, MSR_EFER)) 720 panic("vmx_vminit: error setting guest msr access"); 721 722 /* 723 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit 724 * and entry respectively. It is also restored from the host VMCS 725 * area on a VM exit. However, if running on a system with no 726 * MSR_PAT save/restore support, leave access disabled so accesses 727 * will be trapped. 
728 */ 729 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 730 panic("vmx_vminit: error setting guest pat msr access"); 731 732 for (i = 0; i < VM_MAXCPU; i++) { 733 vmx->vmcs[i].identifier = vmx_revision(); 734 error = vmclear(&vmx->vmcs[i]); 735 if (error != 0) { 736 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 737 error, i); 738 } 739 740 vpid = vmx_vpid(); 741 742 error = vmcs_set_defaults(&vmx->vmcs[i], 743 (u_long)vmx_longjmp, 744 (u_long)&vmx->ctx[i], 745 vtophys(vmx->pml4ept), 746 pinbased_ctls, 747 procbased_ctls, 748 procbased_ctls2, 749 exit_ctls, entry_ctls, 750 vtophys(vmx->msr_bitmap), 751 vpid); 752 753 if (error != 0) 754 panic("vmx_vminit: vmcs_set_defaults error %d", error); 755 756 vmx->cap[i].set = 0; 757 vmx->cap[i].proc_ctls = procbased_ctls; 758 759 vmx->state[i].lastcpu = -1; 760 vmx->state[i].vpid = vpid; 761 762 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 763 764 error = vmcs_set_msr_save(&vmx->vmcs[i], 765 vtophys(vmx->guest_msrs[i]), 766 guest_msr_count); 767 if (error != 0) 768 panic("vmcs_set_msr_save error %d", error); 769 770 error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); 771 if (error != 0) 772 panic("vmx_setup_cr0_shadow %d", error); 773 774 error = vmx_setup_cr4_shadow(&vmx->vmcs[i]); 775 if (error != 0) 776 panic("vmx_setup_cr4_shadow %d", error); 777 } 778 779 return (vmx); 780 } 781 782 static int 783 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 784 { 785 int handled, func; 786 787 func = vmxctx->guest_rax; 788 789 handled = x86_emulate_cpuid(vm, vcpu, 790 (uint32_t*)(&vmxctx->guest_rax), 791 (uint32_t*)(&vmxctx->guest_rbx), 792 (uint32_t*)(&vmxctx->guest_rcx), 793 (uint32_t*)(&vmxctx->guest_rdx)); 794 return (handled); 795 } 796 797 static __inline void 798 vmx_run_trace(struct vmx *vmx, int vcpu) 799 { 800 #ifdef KTR 801 VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); 802 #endif 803 } 804 805 static __inline void 806 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 807 int handled) 808 { 809 #ifdef KTR 810 VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 811 handled ? "handled" : "unhandled", 812 exit_reason_to_str(exit_reason), rip); 813 #endif 814 } 815 816 static __inline void 817 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 818 { 819 #ifdef KTR 820 VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 821 #endif 822 } 823 824 static int 825 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 826 { 827 int error, lastcpu; 828 struct vmxstate *vmxstate; 829 struct invvpid_desc invvpid_desc = { 0 }; 830 831 vmxstate = &vmx->state[vcpu]; 832 lastcpu = vmxstate->lastcpu; 833 vmxstate->lastcpu = curcpu; 834 835 if (lastcpu == curcpu) { 836 error = 0; 837 goto done; 838 } 839 840 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 841 842 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 843 if (error != 0) 844 goto done; 845 846 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 847 if (error != 0) 848 goto done; 849 850 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 851 if (error != 0) 852 goto done; 853 854 /* 855 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 856 * 857 * We do this because this vcpu was executing on a different host 858 * cpu when it last ran. We do not track whether it invalidated 859 * mappings associated with its 'vpid' during that run. So we must 860 * assume that the mappings associated with 'vpid' on 'curcpu' are 861 * stale and invalidate them. 
862 * 863 * Note that we incur this penalty only when the scheduler chooses to 864 * move the thread associated with this vcpu between host cpus. 865 * 866 * Note also that this will invalidate mappings tagged with 'vpid' 867 * for "all" EP4TAs. 868 */ 869 if (vmxstate->vpid != 0) { 870 invvpid_desc.vpid = vmxstate->vpid; 871 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 872 } 873 done: 874 return (error); 875 } 876 877 static void 878 vm_exit_update_rip(struct vm_exit *vmexit) 879 { 880 int error; 881 882 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 883 if (error) 884 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 885 } 886 887 /* 888 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 889 */ 890 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 891 892 static void __inline 893 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 894 { 895 int error; 896 897 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 898 899 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 900 if (error) 901 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 902 } 903 904 static void __inline 905 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 906 { 907 int error; 908 909 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 910 911 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 912 if (error) 913 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 914 } 915 916 static void __inline 917 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 918 { 919 int error; 920 921 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 922 923 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 924 if (error) 925 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 926 } 927 928 static void __inline 929 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 930 { 931 int error; 932 933 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 934 935 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 936 if (error) 937 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 938 } 939 940 static int 941 vmx_inject_nmi(struct vmx *vmx, int vcpu) 942 { 943 int error; 944 uint64_t info, interruptibility; 945 946 /* Bail out if no NMI requested */ 947 if (!vm_nmi_pending(vmx->vm, vcpu)) 948 return (0); 949 950 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 951 if (error) { 952 panic("vmx_inject_nmi: vmread(interruptibility) %d", 953 error); 954 } 955 if (interruptibility & nmi_blocking_bits) 956 goto nmiblocked; 957 958 /* 959 * Inject the virtual NMI. The vector must be the NMI IDT entry 960 * or the VMCS entry check will fail. 961 */ 962 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 963 info |= IDT_NMI; 964 965 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 966 if (error) 967 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 968 969 VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 970 971 /* Clear the request */ 972 vm_nmi_clear(vmx->vm, vcpu); 973 return (1); 974 975 nmiblocked: 976 /* 977 * Set the NMI Window Exiting execution control so we can inject 978 * the virtual NMI as soon as blocking condition goes away. 
979 */ 980 vmx_set_nmi_window_exiting(vmx, vcpu); 981 982 VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 983 return (1); 984 } 985 986 static void 987 vmx_inject_interrupts(struct vmx *vmx, int vcpu) 988 { 989 int error, vector; 990 uint64_t info, rflags, interruptibility; 991 992 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 993 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 994 995 /* 996 * If there is already an interrupt pending then just return. 997 * 998 * This could happen if an interrupt was injected on a prior 999 * VM entry but the actual entry into guest mode was aborted 1000 * because of a pending AST. 1001 */ 1002 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 1003 if (error) 1004 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 1005 if (info & VMCS_INTERRUPTION_INFO_VALID) 1006 return; 1007 1008 /* 1009 * NMI injection has priority so deal with those first 1010 */ 1011 if (vmx_inject_nmi(vmx, vcpu)) 1012 return; 1013 1014 /* Ask the local apic for a vector to inject */ 1015 vector = lapic_pending_intr(vmx->vm, vcpu); 1016 if (vector < 0) 1017 return; 1018 1019 if (vector < 32 || vector > 255) 1020 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1021 1022 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1023 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1024 if (error) 1025 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1026 1027 if ((rflags & PSL_I) == 0) 1028 goto cantinject; 1029 1030 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1031 if (error) { 1032 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1033 error); 1034 } 1035 if (interruptibility & HWINTR_BLOCKED) 1036 goto cantinject; 1037 1038 /* Inject the interrupt */ 1039 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1040 info |= vector; 1041 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1042 if (error) 1043 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1044 1045 /* Update the Local APIC ISR */ 1046 lapic_intr_accepted(vmx->vm, vcpu, vector); 1047 1048 VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1049 1050 return; 1051 1052 cantinject: 1053 /* 1054 * Set the Interrupt Window Exiting execution control so we can inject 1055 * the interrupt as soon as blocking condition goes away. 1056 */ 1057 vmx_set_int_window_exiting(vmx, vcpu); 1058 1059 VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1060 } 1061 1062 static int 1063 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1064 { 1065 int error, cr, vmcs_guest_cr; 1066 uint64_t regval, ones_mask, zeros_mask; 1067 const struct vmxctx *vmxctx; 1068 1069 /* We only handle mov to %cr0 or %cr4 at this time */ 1070 if ((exitqual & 0xf0) != 0x00) 1071 return (UNHANDLED); 1072 1073 cr = exitqual & 0xf; 1074 if (cr != 0 && cr != 4) 1075 return (UNHANDLED); 1076 1077 vmxctx = &vmx->ctx[vcpu]; 1078 1079 /* 1080 * We must use vmwrite() directly here because vmcs_setreg() will 1081 * call vmclear(vmcs) as a side-effect which we certainly don't want. 
1082 */ 1083 switch ((exitqual >> 8) & 0xf) { 1084 case 0: 1085 regval = vmxctx->guest_rax; 1086 break; 1087 case 1: 1088 regval = vmxctx->guest_rcx; 1089 break; 1090 case 2: 1091 regval = vmxctx->guest_rdx; 1092 break; 1093 case 3: 1094 regval = vmxctx->guest_rbx; 1095 break; 1096 case 4: 1097 error = vmread(VMCS_GUEST_RSP, ®val); 1098 if (error) { 1099 panic("vmx_emulate_cr_access: " 1100 "error %d reading guest rsp", error); 1101 } 1102 break; 1103 case 5: 1104 regval = vmxctx->guest_rbp; 1105 break; 1106 case 6: 1107 regval = vmxctx->guest_rsi; 1108 break; 1109 case 7: 1110 regval = vmxctx->guest_rdi; 1111 break; 1112 case 8: 1113 regval = vmxctx->guest_r8; 1114 break; 1115 case 9: 1116 regval = vmxctx->guest_r9; 1117 break; 1118 case 10: 1119 regval = vmxctx->guest_r10; 1120 break; 1121 case 11: 1122 regval = vmxctx->guest_r11; 1123 break; 1124 case 12: 1125 regval = vmxctx->guest_r12; 1126 break; 1127 case 13: 1128 regval = vmxctx->guest_r13; 1129 break; 1130 case 14: 1131 regval = vmxctx->guest_r14; 1132 break; 1133 case 15: 1134 regval = vmxctx->guest_r15; 1135 break; 1136 } 1137 1138 if (cr == 0) { 1139 ones_mask = cr0_ones_mask; 1140 zeros_mask = cr0_zeros_mask; 1141 vmcs_guest_cr = VMCS_GUEST_CR0; 1142 } else { 1143 ones_mask = cr4_ones_mask; 1144 zeros_mask = cr4_zeros_mask; 1145 vmcs_guest_cr = VMCS_GUEST_CR4; 1146 } 1147 regval |= ones_mask; 1148 regval &= ~zeros_mask; 1149 error = vmwrite(vmcs_guest_cr, regval); 1150 if (error) { 1151 panic("vmx_emulate_cr_access: error %d writing cr%d", 1152 error, cr); 1153 } 1154 1155 return (HANDLED); 1156 } 1157 1158 static int 1159 vmx_ept_fault(struct vm *vm, int cpu, 1160 uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length, 1161 uint64_t cr3, uint64_t ept_qual, struct vie *vie) 1162 { 1163 int read, write, error; 1164 1165 /* EPT violation on an instruction fetch doesn't make sense here */ 1166 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1167 return (UNHANDLED); 1168 1169 /* EPT violation must be a read fault or a write fault */ 1170 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1171 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1172 if ((read | write) == 0) 1173 return (UNHANDLED); 1174 1175 /* 1176 * The EPT violation must have been caused by accessing a 1177 * guest-physical address that is a translation of a guest-linear 1178 * address. 1179 */ 1180 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1181 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1182 return (UNHANDLED); 1183 } 1184 1185 /* Fetch, decode and emulate the faulting instruction */ 1186 if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0) 1187 return (UNHANDLED); 1188 1189 if (vmm_decode_instruction(vm, cpu, gla, vie) != 0) 1190 return (UNHANDLED); 1191 1192 /* 1193 * Check if this is a local apic access 1194 */ 1195 if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) 1196 return (UNHANDLED); 1197 1198 error = vmm_emulate_instruction(vm, cpu, gpa, vie, 1199 lapic_mmio_read, lapic_mmio_write, 0); 1200 1201 return (error ? 
UNHANDLED : HANDLED); 1202 } 1203 1204 static int 1205 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1206 { 1207 int error, handled; 1208 struct vmcs *vmcs; 1209 struct vmxctx *vmxctx; 1210 uint32_t eax, ecx, edx; 1211 uint64_t qual, gla, gpa, cr3, intr_info; 1212 1213 handled = 0; 1214 vmcs = &vmx->vmcs[vcpu]; 1215 vmxctx = &vmx->ctx[vcpu]; 1216 qual = vmexit->u.vmx.exit_qualification; 1217 vmexit->exitcode = VM_EXITCODE_BOGUS; 1218 1219 switch (vmexit->u.vmx.exit_reason) { 1220 case EXIT_REASON_CR_ACCESS: 1221 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1222 break; 1223 case EXIT_REASON_RDMSR: 1224 ecx = vmxctx->guest_rcx; 1225 error = emulate_rdmsr(vmx->vm, vcpu, ecx); 1226 if (error) { 1227 vmexit->exitcode = VM_EXITCODE_RDMSR; 1228 vmexit->u.msr.code = ecx; 1229 } else 1230 handled = 1; 1231 break; 1232 case EXIT_REASON_WRMSR: 1233 eax = vmxctx->guest_rax; 1234 ecx = vmxctx->guest_rcx; 1235 edx = vmxctx->guest_rdx; 1236 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1237 (uint64_t)edx << 32 | eax); 1238 if (error) { 1239 vmexit->exitcode = VM_EXITCODE_WRMSR; 1240 vmexit->u.msr.code = ecx; 1241 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1242 } else 1243 handled = 1; 1244 break; 1245 case EXIT_REASON_HLT: 1246 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1247 /* 1248 * If there is an event waiting to be injected then there is 1249 * no need to 'hlt'. 1250 */ 1251 error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info); 1252 if (error) 1253 panic("vmx_exit_process: vmread(intrinfo) %d", error); 1254 1255 if (intr_info & VMCS_INTERRUPTION_INFO_VALID) { 1256 handled = 1; 1257 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1); 1258 } else 1259 vmexit->exitcode = VM_EXITCODE_HLT; 1260 break; 1261 case EXIT_REASON_MTF: 1262 vmexit->exitcode = VM_EXITCODE_MTRAP; 1263 break; 1264 case EXIT_REASON_PAUSE: 1265 vmexit->exitcode = VM_EXITCODE_PAUSE; 1266 break; 1267 case EXIT_REASON_INTR_WINDOW: 1268 vmx_clear_int_window_exiting(vmx, vcpu); 1269 VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1270 /* FALLTHRU */ 1271 case EXIT_REASON_EXT_INTR: 1272 /* 1273 * External interrupts serve only to cause VM exits and allow 1274 * the host interrupt handler to run. 1275 * 1276 * If this external interrupt triggers a virtual interrupt 1277 * to a VM, then that state will be recorded by the 1278 * host interrupt handler in the VM's softc. We will inject 1279 * this virtual interrupt during the subsequent VM enter. 1280 */ 1281 1282 /* 1283 * This is special. We want to treat this as an 'handled' 1284 * VM-exit but not increment the instruction pointer. 1285 */ 1286 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1287 return (1); 1288 case EXIT_REASON_NMI_WINDOW: 1289 /* Exit to allow the pending virtual NMI to be injected */ 1290 vmx_clear_nmi_window_exiting(vmx, vcpu); 1291 VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1292 return (1); 1293 case EXIT_REASON_INOUT: 1294 vmexit->exitcode = VM_EXITCODE_INOUT; 1295 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1296 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1297 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1298 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 1299 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1300 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1301 break; 1302 case EXIT_REASON_CPUID: 1303 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1304 break; 1305 case EXIT_REASON_EPT_FAULT: 1306 gla = vmcs_gla(); 1307 gpa = vmcs_gpa(); 1308 cr3 = vmcs_guest_cr3(); 1309 handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa, 1310 vmexit->rip, vmexit->inst_length, 1311 cr3, qual, &vmexit->u.paging.vie); 1312 if (!handled) { 1313 vmexit->exitcode = VM_EXITCODE_PAGING; 1314 vmexit->u.paging.gpa = gpa; 1315 } 1316 break; 1317 default: 1318 break; 1319 } 1320 1321 if (handled) { 1322 /* 1323 * It is possible that control is returned to userland 1324 * even though we were able to handle the VM exit in the 1325 * kernel. 1326 * 1327 * In such a case we want to make sure that the userland 1328 * restarts guest execution at the instruction *after* 1329 * the one we just processed. Therefore we update the 1330 * guest rip in the VMCS and in 'vmexit'. 1331 */ 1332 vm_exit_update_rip(vmexit); 1333 vmexit->rip += vmexit->inst_length; 1334 vmexit->inst_length = 0; 1335 1336 /* 1337 * Special case for spinning up an AP - exit to userspace to 1338 * give the controlling process a chance to intercept and 1339 * spin up a thread for the AP. 1340 */ 1341 if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) 1342 handled = 0; 1343 } else { 1344 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1345 /* 1346 * If this VM exit was not claimed by anybody then 1347 * treat it as a generic VMX exit. 1348 */ 1349 vmexit->exitcode = VM_EXITCODE_VMX; 1350 vmexit->u.vmx.error = 0; 1351 } else { 1352 /* 1353 * The exitcode and collateral have been populated. 1354 * The VM exit will be processed further in userland. 1355 */ 1356 } 1357 } 1358 return (handled); 1359 } 1360 1361 static int 1362 vmx_run(void *arg, int vcpu, register_t rip) 1363 { 1364 int error, vie, rc, handled, astpending; 1365 uint32_t exit_reason; 1366 struct vmx *vmx; 1367 struct vmxctx *vmxctx; 1368 struct vmcs *vmcs; 1369 struct vm_exit *vmexit; 1370 1371 vmx = arg; 1372 vmcs = &vmx->vmcs[vcpu]; 1373 vmxctx = &vmx->ctx[vcpu]; 1374 vmxctx->launched = 0; 1375 1376 astpending = 0; 1377 vmexit = vm_exitinfo(vmx->vm, vcpu); 1378 1379 /* 1380 * XXX Can we avoid doing this every time we do a vm run? 1381 */ 1382 VMPTRLD(vmcs); 1383 1384 /* 1385 * XXX 1386 * We do this every time because we may setup the virtual machine 1387 * from a different process than the one that actually runs it. 1388 * 1389 * If the life of a virtual machine was spent entirely in the context 1390 * of a single process we could do this once in vmcs_set_defaults(). 
1391 */ 1392 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1393 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1394 1395 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1396 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1397 1398 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1399 panic("vmx_run: error %d setting up pcpu defaults", error); 1400 1401 do { 1402 lapic_timer_tick(vmx->vm, vcpu); 1403 vmx_inject_interrupts(vmx, vcpu); 1404 vmx_run_trace(vmx, vcpu); 1405 rc = vmx_setjmp(vmxctx); 1406 #ifdef SETJMP_TRACE 1407 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1408 #endif 1409 switch (rc) { 1410 case VMX_RETURN_DIRECT: 1411 if (vmxctx->launched == 0) { 1412 vmxctx->launched = 1; 1413 vmx_launch(vmxctx); 1414 } else 1415 vmx_resume(vmxctx); 1416 panic("vmx_launch/resume should not return"); 1417 break; 1418 case VMX_RETURN_LONGJMP: 1419 break; /* vm exit */ 1420 case VMX_RETURN_AST: 1421 astpending = 1; 1422 break; 1423 case VMX_RETURN_VMRESUME: 1424 vie = vmcs_instruction_error(); 1425 if (vmxctx->launch_error == VM_FAIL_INVALID || 1426 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1427 printf("vmresume error %d vmcs inst error %d\n", 1428 vmxctx->launch_error, vie); 1429 goto err_exit; 1430 } 1431 vmx_launch(vmxctx); /* try to launch the guest */ 1432 panic("vmx_launch should not return"); 1433 break; 1434 case VMX_RETURN_VMLAUNCH: 1435 vie = vmcs_instruction_error(); 1436 #if 1 1437 printf("vmlaunch error %d vmcs inst error %d\n", 1438 vmxctx->launch_error, vie); 1439 #endif 1440 goto err_exit; 1441 default: 1442 panic("vmx_setjmp returned %d", rc); 1443 } 1444 1445 /* enable interrupts */ 1446 enable_intr(); 1447 1448 /* collect some basic information for VM exit processing */ 1449 vmexit->rip = rip = vmcs_guest_rip(); 1450 vmexit->inst_length = vmexit_instruction_length(); 1451 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1452 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1453 1454 if (astpending) { 1455 handled = 1; 1456 vmexit->inst_length = 0; 1457 vmexit->exitcode = VM_EXITCODE_BOGUS; 1458 vmx_astpending_trace(vmx, vcpu, rip); 1459 break; 1460 } 1461 1462 handled = vmx_exit_process(vmx, vcpu, vmexit); 1463 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1464 1465 } while (handled); 1466 1467 /* 1468 * If a VM exit has been handled then the exitcode must be BOGUS 1469 * If a VM exit is not handled then the exitcode must not be BOGUS 1470 */ 1471 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1472 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1473 panic("Mismatch between handled (%d) and exitcode (%d)", 1474 handled, vmexit->exitcode); 1475 } 1476 1477 VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1478 1479 /* 1480 * XXX 1481 * We need to do this to ensure that any VMCS state cached by the 1482 * processor is flushed to memory. We need to do this in case the 1483 * VM moves to a different cpu the next time it runs. 1484 * 1485 * Can we avoid doing this? 1486 */ 1487 VMCLEAR(vmcs); 1488 return (0); 1489 1490 err_exit: 1491 vmexit->exitcode = VM_EXITCODE_VMX; 1492 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1493 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1494 vmexit->u.vmx.error = vie; 1495 VMCLEAR(vmcs); 1496 return (ENOEXEC); 1497 } 1498 1499 static void 1500 vmx_vmcleanup(void *arg) 1501 { 1502 int error; 1503 struct vmx *vmx = arg; 1504 1505 /* 1506 * XXXSMP we also need to clear the VMCS active on the other vcpus. 
1507 */ 1508 error = vmclear(&vmx->vmcs[0]); 1509 if (error != 0) 1510 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1511 1512 ept_vmcleanup(vmx); 1513 free(vmx, M_VMX); 1514 1515 return; 1516 } 1517 1518 static register_t * 1519 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1520 { 1521 1522 switch (reg) { 1523 case VM_REG_GUEST_RAX: 1524 return (&vmxctx->guest_rax); 1525 case VM_REG_GUEST_RBX: 1526 return (&vmxctx->guest_rbx); 1527 case VM_REG_GUEST_RCX: 1528 return (&vmxctx->guest_rcx); 1529 case VM_REG_GUEST_RDX: 1530 return (&vmxctx->guest_rdx); 1531 case VM_REG_GUEST_RSI: 1532 return (&vmxctx->guest_rsi); 1533 case VM_REG_GUEST_RDI: 1534 return (&vmxctx->guest_rdi); 1535 case VM_REG_GUEST_RBP: 1536 return (&vmxctx->guest_rbp); 1537 case VM_REG_GUEST_R8: 1538 return (&vmxctx->guest_r8); 1539 case VM_REG_GUEST_R9: 1540 return (&vmxctx->guest_r9); 1541 case VM_REG_GUEST_R10: 1542 return (&vmxctx->guest_r10); 1543 case VM_REG_GUEST_R11: 1544 return (&vmxctx->guest_r11); 1545 case VM_REG_GUEST_R12: 1546 return (&vmxctx->guest_r12); 1547 case VM_REG_GUEST_R13: 1548 return (&vmxctx->guest_r13); 1549 case VM_REG_GUEST_R14: 1550 return (&vmxctx->guest_r14); 1551 case VM_REG_GUEST_R15: 1552 return (&vmxctx->guest_r15); 1553 default: 1554 break; 1555 } 1556 return (NULL); 1557 } 1558 1559 static int 1560 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1561 { 1562 register_t *regp; 1563 1564 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1565 *retval = *regp; 1566 return (0); 1567 } else 1568 return (EINVAL); 1569 } 1570 1571 static int 1572 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1573 { 1574 register_t *regp; 1575 1576 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1577 *regp = val; 1578 return (0); 1579 } else 1580 return (EINVAL); 1581 } 1582 1583 static int 1584 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1585 { 1586 struct vmx *vmx = arg; 1587 1588 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1589 return (0); 1590 1591 /* 1592 * If the vcpu is running then don't mess with the VMCS. 1593 * 1594 * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause 1595 * the subsequent vmlaunch/vmresume to fail. 1596 */ 1597 if (vcpu_is_running(vmx->vm, vcpu)) 1598 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1599 1600 return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); 1601 } 1602 1603 static int 1604 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1605 { 1606 int error; 1607 uint64_t ctls; 1608 struct vmx *vmx = arg; 1609 1610 /* 1611 * XXX Allow caller to set contents of the guest registers saved in 1612 * the 'vmxctx' even though the vcpu might be running. We need this 1613 * specifically to support the rdmsr emulation that will set the 1614 * %eax and %edx registers during vm exit processing. 1615 */ 1616 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1617 return (0); 1618 1619 /* 1620 * If the vcpu is running then don't mess with the VMCS. 1621 * 1622 * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause 1623 * the subsequent vmlaunch/vmresume to fail. 1624 */ 1625 if (vcpu_is_running(vmx->vm, vcpu)) 1626 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1627 1628 error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); 1629 1630 if (error == 0) { 1631 /* 1632 * If the "load EFER" VM-entry control is 1 then the 1633 * value of EFER.LMA must be identical to "IA-32e mode guest" 1634 * bit in the VM-entry control. 
1635 */ 1636 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1637 (reg == VM_REG_GUEST_EFER)) { 1638 vmcs_getreg(&vmx->vmcs[vcpu], 1639 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1640 if (val & EFER_LMA) 1641 ctls |= VM_ENTRY_GUEST_LMA; 1642 else 1643 ctls &= ~VM_ENTRY_GUEST_LMA; 1644 vmcs_setreg(&vmx->vmcs[vcpu], 1645 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1646 } 1647 } 1648 1649 return (error); 1650 } 1651 1652 static int 1653 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1654 { 1655 struct vmx *vmx = arg; 1656 1657 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1658 } 1659 1660 static int 1661 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1662 { 1663 struct vmx *vmx = arg; 1664 1665 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1666 } 1667 1668 static int 1669 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1670 int code_valid) 1671 { 1672 int error; 1673 uint64_t info; 1674 struct vmx *vmx = arg; 1675 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1676 1677 static uint32_t type_map[VM_EVENT_MAX] = { 1678 0x1, /* VM_EVENT_NONE */ 1679 0x0, /* VM_HW_INTR */ 1680 0x2, /* VM_NMI */ 1681 0x3, /* VM_HW_EXCEPTION */ 1682 0x4, /* VM_SW_INTR */ 1683 0x5, /* VM_PRIV_SW_EXCEPTION */ 1684 0x6, /* VM_SW_EXCEPTION */ 1685 }; 1686 1687 /* 1688 * If there is already an exception pending to be delivered to the 1689 * vcpu then just return. 1690 */ 1691 error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1692 if (error) 1693 return (error); 1694 1695 if (info & VMCS_INTERRUPTION_INFO_VALID) 1696 return (EAGAIN); 1697 1698 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1699 info |= VMCS_INTERRUPTION_INFO_VALID; 1700 error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1701 if (error != 0) 1702 return (error); 1703 1704 if (code_valid) { 1705 error = vmcs_setreg(vmcs, 1706 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1707 code); 1708 } 1709 return (error); 1710 } 1711 1712 static int 1713 vmx_getcap(void *arg, int vcpu, int type, int *retval) 1714 { 1715 struct vmx *vmx = arg; 1716 int vcap; 1717 int ret; 1718 1719 ret = ENOENT; 1720 1721 vcap = vmx->cap[vcpu].set; 1722 1723 switch (type) { 1724 case VM_CAP_HALT_EXIT: 1725 if (cap_halt_exit) 1726 ret = 0; 1727 break; 1728 case VM_CAP_PAUSE_EXIT: 1729 if (cap_pause_exit) 1730 ret = 0; 1731 break; 1732 case VM_CAP_MTRAP_EXIT: 1733 if (cap_monitor_trap) 1734 ret = 0; 1735 break; 1736 case VM_CAP_UNRESTRICTED_GUEST: 1737 if (cap_unrestricted_guest) 1738 ret = 0; 1739 break; 1740 default: 1741 break; 1742 } 1743 1744 if (ret == 0) 1745 *retval = (vcap & (1 << type)) ? 
1 : 0; 1746 1747 return (ret); 1748 } 1749 1750 static int 1751 vmx_setcap(void *arg, int vcpu, int type, int val) 1752 { 1753 struct vmx *vmx = arg; 1754 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1755 uint32_t baseval; 1756 uint32_t *pptr; 1757 int error; 1758 int flag; 1759 int reg; 1760 int retval; 1761 1762 retval = ENOENT; 1763 pptr = NULL; 1764 1765 switch (type) { 1766 case VM_CAP_HALT_EXIT: 1767 if (cap_halt_exit) { 1768 retval = 0; 1769 pptr = &vmx->cap[vcpu].proc_ctls; 1770 baseval = *pptr; 1771 flag = PROCBASED_HLT_EXITING; 1772 reg = VMCS_PRI_PROC_BASED_CTLS; 1773 } 1774 break; 1775 case VM_CAP_MTRAP_EXIT: 1776 if (cap_monitor_trap) { 1777 retval = 0; 1778 pptr = &vmx->cap[vcpu].proc_ctls; 1779 baseval = *pptr; 1780 flag = PROCBASED_MTF; 1781 reg = VMCS_PRI_PROC_BASED_CTLS; 1782 } 1783 break; 1784 case VM_CAP_PAUSE_EXIT: 1785 if (cap_pause_exit) { 1786 retval = 0; 1787 pptr = &vmx->cap[vcpu].proc_ctls; 1788 baseval = *pptr; 1789 flag = PROCBASED_PAUSE_EXITING; 1790 reg = VMCS_PRI_PROC_BASED_CTLS; 1791 } 1792 break; 1793 case VM_CAP_UNRESTRICTED_GUEST: 1794 if (cap_unrestricted_guest) { 1795 retval = 0; 1796 baseval = procbased_ctls2; 1797 flag = PROCBASED2_UNRESTRICTED_GUEST; 1798 reg = VMCS_SEC_PROC_BASED_CTLS; 1799 } 1800 break; 1801 default: 1802 break; 1803 } 1804 1805 if (retval == 0) { 1806 if (val) { 1807 baseval |= flag; 1808 } else { 1809 baseval &= ~flag; 1810 } 1811 VMPTRLD(vmcs); 1812 error = vmwrite(reg, baseval); 1813 VMCLEAR(vmcs); 1814 1815 if (error) { 1816 retval = error; 1817 } else { 1818 /* 1819 * Update optional stored flags, and record 1820 * setting 1821 */ 1822 if (pptr != NULL) { 1823 *pptr = baseval; 1824 } 1825 1826 if (val) { 1827 vmx->cap[vcpu].set |= (1 << type); 1828 } else { 1829 vmx->cap[vcpu].set &= ~(1 << type); 1830 } 1831 } 1832 } 1833 1834 return (retval); 1835 } 1836 1837 struct vmm_ops vmm_ops_intel = { 1838 vmx_init, 1839 vmx_cleanup, 1840 vmx_vminit, 1841 vmx_run, 1842 vmx_vmcleanup, 1843 ept_vmmmap_set, 1844 ept_vmmmap_get, 1845 vmx_getreg, 1846 vmx_setreg, 1847 vmx_getdesc, 1848 vmx_setdesc, 1849 vmx_inject, 1850 vmx_getcap, 1851 vmx_setcap 1852 }; 1853
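/*
 * Illustrative example (not part of the driver) of the fixed-bit handling
 * done by vmx_init() and vmx_fix_cr0(), assuming MSR_VMX_CR0_FIXED0 reads
 * 0x80000021 (PG, NE and PE forced to 1) and MSR_VMX_CR0_FIXED1 reads
 * 0xffffffff (no bit forced to 0); the actual values are processor-specific.
 *
 *	cr0_ones_mask  =  fixed0 &  fixed1 = 0x80000021
 *	cr0_zeros_mask = ~fixed0 & ~fixed1 = 0xffffffff00000000
 *
 * A guest attempt to load %cr0 with 0x1 (PE only) is then rewritten by
 * vmx_fix_cr0() to (0x1 | 0x80000021) & ~0xffffffff00000000 = 0x80000021,
 * i.e. PG and NE are forced on. When cap_unrestricted_guest is available,
 * vmx_init() removes CR0_PG and CR0_PE from cr0_ones_mask so the guest may
 * execute unpaged or in real mode.
 */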