/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <x86/apicreg.h>

#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmm_lapic.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"

#include "vmx_msr.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING |		\
	 PINBASED_NMI_EXITING |			\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING |		\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS |		\
	 PROCBASED_IO_EXITING |			\
	 PROCBASED_MSR_BITMAPS |		\
	 PROCBASED_CTLS_WINDOW_SETTING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING |		\
	 PROCBASED_CR3_STORE_EXITING |		\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING_NO_PAT		\
	(VM_EXIT_HOST_LMA |			\
	 VM_EXIT_SAVE_EFER |			\
	 VM_EXIT_LOAD_EFER)

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_EXIT_SAVE_PAT |			\
	 VM_EXIT_LOAD_PAT)
#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS

#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER

#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT |	\
	 VM_ENTRY_LOAD_PAT)
#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS |		\
	 VM_ENTRY_INTO_SMM |			\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
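/*
 * The *_ONE_SETTING and *_ZERO_SETTING pairs above name the VMX controls
 * that we require to be set to 1 and 0 respectively.  vmx_init() passes
 * them to vmx_set_ctlreg() which checks them against the VMX capability
 * MSRs advertised by the processor.
 */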

#define	guest_msr_rw(vmx, msr) \
	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)

#define	HANDLED		1
#define	UNHANDLED	0

MALLOC_DEFINE(M_VMX, "vmx", "vmx");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
	     &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
	     &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
	     &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
	     &cr4_zeros_mask, 0, NULL);

static volatile u_int nextvpid;

static int vmx_no_patmsr;

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
	   &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Virtual NMI blocking conditions.
 *
 * Some processor implementations also require NMI to be blocked if
 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
 * based on the (exit_reason,exit_qual) tuple being set to
 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
 *
 * We take the easy way out and also include STI_BLOCKING as one of the
 * gating items for vNMI injection.
 */
static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;

/*
 * Optional capabilities
 */
static int cap_halt_exit;
static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;

/* statistics */
static VMM_STAT_INTEL(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE:
		return "mce";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC:
		return "apic";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}

#ifdef SETJMP_TRACE
static const char *
vmx_setjmp_rc2str(int rc)
{
	switch (rc) {
	case VMX_RETURN_DIRECT:
		return "direct";
	case VMX_RETURN_LONGJMP:
		return "longjmp";
	case VMX_RETURN_VMRESUME:
		return "vmresume";
	case VMX_RETURN_VMLAUNCH:
		return "vmlaunch";
	case VMX_RETURN_AST:
		return "ast";
	default:
		return "unknown";
	}
}

#define	SETJMP_TRACE(vmx, vcpu, vmxctx, regname)			\
	VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
		 (vmxctx)->regname)

static void
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	uint64_t host_rip, host_rsp;

	if (vmxctx != &vmx->ctx[vcpu])
		panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
			vmxctx, &vmx->ctx[vcpu]);

	VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
	VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
		 vmx_setjmp_rc2str(rc), rc);

	host_rsp = host_rip = ~0;
	vmread(VMCS_HOST_RIP, &host_rip);
	vmread(VMCS_HOST_RSP, &host_rsp);
	VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
		 host_rip, host_rsp);

	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);

	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
	SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
}
#endif
#else
static void __inline
vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
{
	return;
}
#endif	/* KTR */

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
msr_save_area_init(struct msr_entry *g_area, int *g_count)
{
	int cnt;

	static struct msr_entry guest_msrs[] = {
		{ MSR_KGSBASE, 0, 0 },
	};

	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
	if (cnt > GUEST_MSR_MAX_ENTRIES)
		panic("guest msr save area overrun");
	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
	*g_count = cnt;
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so invalidate all EPT and VPID mappings
		 * here to prevent potential retention of cached information
		 * in the TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static int
vmx_init(void)
{
	int error;
	uint64_t fixed0, fixed1, feature_control;
	uint32_t tmp;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
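	 *
	 * Note that if the MSR is locked by the BIOS with the VMX enable
	 * bit clear then VMX cannot be used until it is enabled in the
	 * firmware setup; the lock bit cannot be cleared until the next
	 * reset.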
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
			       MSR_VMX_TRUE_PROCBASED_CTLS,
			       PROCBASED_CTLS_ONE_SETTING,
			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		       "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
			       MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED_CTLS2_ONE_SETTING,
			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		       "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
			       PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
			       MSR_VMX_TRUE_PINBASED_CTLS,
			       PINBASED_CTLS_ONE_SETTING,
			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		       "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
			       VM_EXIT_CTLS_ONE_SETTING,
			       VM_EXIT_CTLS_ZERO_SETTING,
			       &exit_ctls);
	if (error) {
		/* Try again without the PAT MSR bits */
		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
				       MSR_VMX_TRUE_EXIT_CTLS,
				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
				       VM_EXIT_CTLS_ZERO_SETTING,
				       &exit_ctls);
		if (error) {
			printf("vmx_init: processor does not support desired "
			       "exit controls\n");
			return (error);
		} else {
			if (bootverbose)
				printf("vmm: PAT MSR access not supported\n");
			guest_msr_valid(MSR_PAT);
			vmx_no_patmsr = 1;
		}
	}

	/* Check support for VM-entry controls */
	if (!vmx_no_patmsr) {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	} else {
		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
				       MSR_VMX_TRUE_ENTRY_CTLS,
				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
				       VM_ENTRY_CTLS_ZERO_SETTING,
				       &entry_ctls);
	}

	if (error) {
		printf("vmx_init: processor does not support desired "
		       "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_HLT_EXITING, 0,
					&tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_PROCBASED_CTLS,
					PROCBASED_MTF, 0,
					&tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
					MSR_VMX_TRUE_PROCBASED_CTLS,
					PROCBASED_PAUSE_EXITING, 0,
					&tmp) == 0);

	cap_unrestricted_guest =
	    (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
					MSR_VMX_PROCBASED_CTLS2,
					PROCBASED2_UNRESTRICTED_GUEST, 0,
					&tmp) == 0);

	/* Initialize EPT */
	error = ept_init();
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
	 */
	cr0_zeros_mask |= (CR0_NW | CR0_CD);

	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
	cr4_ones_mask = fixed0 & fixed1;
	cr4_zeros_mask = ~fixed0 & ~fixed1;

	/* enable VMX operation */
	smp_rendezvous(NULL, vmx_enable, NULL, NULL);

	vmx_initialized = 1;

	return (0);
}

/*
 * If this processor does not support VPIDs then simply return 0.
 *
 * Otherwise generate the next value of VPID to use. Any value is alright
 * as long as it is non-zero.
 *
 * We always execute in VMX non-root context with EPT enabled. Thus all
 * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
 * in turn means that multiple VMs can share the same VPID as long as
 * they have distinct EPT page tables.
 *
 * XXX
 * We should optimize this so that it returns VPIDs that are not in
 * use. Then we will not unnecessarily invalidate mappings in
 * vmx_set_pcpu_defaults() just because two or more vcpus happen to
 * use the same 'vpid'.
 */
static uint16_t
vmx_vpid(void)
{
	uint16_t vpid = 0;

	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
		do {
			vpid = atomic_fetchadd_int(&nextvpid, 1);
		} while (vpid == 0);
	}

	return (vpid);
}

static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
	int error, mask_ident, shadow_ident;
	uint64_t mask_value;

	if (which != 0 && which != 4)
		panic("vmx_setup_cr_shadow: unknown cr%d", which);

	if (which == 0) {
		mask_ident = VMCS_CR0_MASK;
		mask_value = cr0_ones_mask | cr0_zeros_mask;
		shadow_ident = VMCS_CR0_SHADOW;
	} else {
		mask_ident = VMCS_CR4_MASK;
		mask_value = cr4_ones_mask | cr4_zeros_mask;
		shadow_ident = VMCS_CR4_SHADOW;
	}

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
	if (error)
		return (error);

	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
	if (error)
		return (error);

	return (0);
}
#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))

static void *
vmx_vminit(struct vm *vm)
{
	uint16_t vpid;
	int i, error, guest_msr_count;
	struct vmx *vmx;

	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
	if ((uintptr_t)vmx & PAGE_MASK) {
		panic("malloc of struct vmx not aligned on %d byte boundary",
		      PAGE_SIZE);
	}
	vmx->vm = vm;

	/*
	 * Clean up EPTP-tagged guest physical and combined mappings
	 *
	 * VMX transitions are not required to invalidate any guest physical
	 * mappings. So, it may be possible for stale guest physical mappings
	 * to be present in the processor TLBs.
	 *
	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
	 */
	ept_invalidate_mappings(vtophys(vmx->pml4ept));

	msr_bitmap_initialize(vmx->msr_bitmap);

	/*
	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
	 * The guest FSBASE and GSBASE are saved and restored during
	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
	 * always restored from the vmcs host state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored so can be directly accessed by the
	 * guest.
	 *
	 * Guest KGSBASE is saved and restored in the guest MSR save area.
	 * Host KGSBASE is restored before returning to userland from the pcb.
	 * There will be a window of time when we are executing in the host
	 * kernel context with a value of KGSBASE from the guest. This is ok
	 * because the value of KGSBASE is inconsequential in kernel context.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a
	 * VM exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 */
	if (guest_msr_rw(vmx, MSR_GSBASE) ||
	    guest_msr_rw(vmx, MSR_FSBASE) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
	    guest_msr_rw(vmx, MSR_KGSBASE) ||
	    guest_msr_rw(vmx, MSR_EFER))
		panic("vmx_vminit: error setting guest msr access");

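	/*
	 * Note: with MSR bitmaps enabled (PROCBASED_MSR_BITMAPS is part of
	 * PROCBASED_CTLS_ONE_SETTING) a clear bit in the bitmap means that
	 * the corresponding RDMSR or WRMSR does not cause a VM exit, so
	 * guest_msr_rw() above gives the guest direct read/write access to
	 * those MSRs.
	 */
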
	/*
	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
	 * and entry respectively. It is also restored from the host VMCS
	 * area on a VM exit. However, if running on a system with no
	 * MSR_PAT save/restore support, leave access disabled so accesses
	 * will be trapped.
	 */
	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
		panic("vmx_vminit: error setting guest pat msr access");

	for (i = 0; i < VM_MAXCPU; i++) {
		vmx->vmcs[i].identifier = vmx_revision();
		error = vmclear(&vmx->vmcs[i]);
		if (error != 0) {
			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
			      error, i);
		}

		vpid = vmx_vpid();

		error = vmcs_set_defaults(&vmx->vmcs[i],
					  (u_long)vmx_longjmp,
					  (u_long)&vmx->ctx[i],
					  vtophys(vmx->pml4ept),
					  pinbased_ctls,
					  procbased_ctls,
					  procbased_ctls2,
					  exit_ctls, entry_ctls,
					  vtophys(vmx->msr_bitmap),
					  vpid);

		if (error != 0)
			panic("vmx_vminit: vmcs_set_defaults error %d", error);

		vmx->cap[i].set = 0;
		vmx->cap[i].proc_ctls = procbased_ctls;

		vmx->state[i].lastcpu = -1;
		vmx->state[i].vpid = vpid;

		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);

		error = vmcs_set_msr_save(&vmx->vmcs[i],
					  vtophys(vmx->guest_msrs[i]),
					  guest_msr_count);
		if (error != 0)
			panic("vmcs_set_msr_save error %d", error);

		/*
		 * Set up the CR0/4 shadows, and init the read shadow
		 * to the power-on register value from the Intel Sys Arch.
		 *  CR0 - 0x60000010
		 *  CR4 - 0
		 */
		error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
		if (error != 0)
			panic("vmx_setup_cr0_shadow %d", error);

		error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
		if (error != 0)
			panic("vmx_setup_cr4_shadow %d", error);
	}

	return (vmx);
}

static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
	int handled, func;

	func = vmxctx->guest_rax;

	handled = x86_emulate_cpuid(vm, vcpu,
				    (uint32_t*)(&vmxctx->guest_rax),
				    (uint32_t*)(&vmxctx->guest_rbx),
				    (uint32_t*)(&vmxctx->guest_rcx),
				    (uint32_t*)(&vmxctx->guest_rdx));
	return (handled);
}

static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
	VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
#endif
}

static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
	       int handled)
{
#ifdef KTR
	VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
		 handled ?
"handled" : "unhandled", 833 exit_reason_to_str(exit_reason), rip); 834 #endif 835 } 836 837 static __inline void 838 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 839 { 840 #ifdef KTR 841 VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 842 #endif 843 } 844 845 static int 846 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 847 { 848 int error, lastcpu; 849 struct vmxstate *vmxstate; 850 struct invvpid_desc invvpid_desc = { 0 }; 851 852 vmxstate = &vmx->state[vcpu]; 853 lastcpu = vmxstate->lastcpu; 854 vmxstate->lastcpu = curcpu; 855 856 if (lastcpu == curcpu) { 857 error = 0; 858 goto done; 859 } 860 861 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 862 863 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 864 if (error != 0) 865 goto done; 866 867 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 868 if (error != 0) 869 goto done; 870 871 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 872 if (error != 0) 873 goto done; 874 875 /* 876 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 877 * 878 * We do this because this vcpu was executing on a different host 879 * cpu when it last ran. We do not track whether it invalidated 880 * mappings associated with its 'vpid' during that run. So we must 881 * assume that the mappings associated with 'vpid' on 'curcpu' are 882 * stale and invalidate them. 883 * 884 * Note that we incur this penalty only when the scheduler chooses to 885 * move the thread associated with this vcpu between host cpus. 886 * 887 * Note also that this will invalidate mappings tagged with 'vpid' 888 * for "all" EP4TAs. 889 */ 890 if (vmxstate->vpid != 0) { 891 invvpid_desc.vpid = vmxstate->vpid; 892 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 893 } 894 done: 895 return (error); 896 } 897 898 static void 899 vm_exit_update_rip(struct vm_exit *vmexit) 900 { 901 int error; 902 903 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 904 if (error) 905 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 906 } 907 908 /* 909 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
/*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);

static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
}

static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
	int error;

	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;

	error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
	if (error)
		panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
}

static int
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
	int error;
	uint64_t info, interruptibility;

	/* Bail out if no NMI requested */
	if (!vm_nmi_pending(vmx->vm, vcpu))
		return (0);

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_nmi: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & nmi_blocking_bits)
		goto nmiblocked;

	/*
	 * Inject the virtual NMI. The vector must be the NMI IDT entry
	 * or the VMCS entry check will fail.
	 */
	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
	info |= IDT_NMI;

	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);

	VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");

	/* Clear the request */
	vm_nmi_clear(vmx->vm, vcpu);
	return (1);

nmiblocked:
	/*
	 * Set the NMI Window Exiting execution control so we can inject
	 * the virtual NMI as soon as the blocking condition goes away.
	 */
	vmx_set_nmi_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
	return (1);
}

static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu)
{
	int error, vector;
	uint64_t info, rflags, interruptibility;

	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;

	/*
	 * If there is already an interrupt pending then just return.
	 *
	 * This could happen if an interrupt was injected on a prior
	 * VM entry but the actual entry into guest mode was aborted
	 * because of a pending AST.
	 */
	error = vmread(VMCS_ENTRY_INTR_INFO, &info);
	if (error)
		panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return;

	/*
	 * NMI injection has priority so deal with those first
	 */
	if (vmx_inject_nmi(vmx, vcpu))
		return;

	/* Ask the local apic for a vector to inject */
	vector = lapic_pending_intr(vmx->vm, vcpu);
	if (vector < 0)
		return;

	if (vector < 32 || vector > 255)
		panic("vmx_inject_interrupts: invalid vector %d\n", vector);

	/* Check RFLAGS.IF and the interruptibility state of the guest */
	error = vmread(VMCS_GUEST_RFLAGS, &rflags);
	if (error)
		panic("vmx_inject_interrupts: vmread(rflags) %d", error);

	if ((rflags & PSL_I) == 0)
		goto cantinject;

	error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
	if (error) {
		panic("vmx_inject_interrupts: vmread(interruptibility) %d",
		      error);
	}
	if (interruptibility & HWINTR_BLOCKED)
		goto cantinject;

	/* Inject the interrupt */
	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
	info |= vector;
	error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
	if (error)
		panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);

	/* Update the Local APIC ISR */
	lapic_intr_accepted(vmx->vm, vcpu, vector);

	VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);

	return;

cantinject:
	/*
	 * Set the Interrupt Window Exiting execution control so we can inject
	 * the interrupt as soon as the blocking condition goes away.
	 */
	vmx_set_int_window_exiting(vmx, vcpu);

	VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}

static int
vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	int error, cr, vmcs_guest_cr, vmcs_shadow_cr;
	uint64_t crval, regval, ones_mask, zeros_mask;
	const struct vmxctx *vmxctx;

	/* We only handle mov to %cr0 or %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	cr = exitqual & 0xf;
	if (cr != 0 && cr != 4)
		return (UNHANDLED);

	vmxctx = &vmx->ctx[vcpu];

	/*
	 * We must use vmwrite() directly here because vmcs_setreg() will
	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
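	 *
	 * Bits 11:8 of the exit qualification identify the general purpose
	 * register that is the source of the MOV to CR; the decode below
	 * follows the exit qualification format for control-register
	 * accesses described in the Intel SDM.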
	 */
	switch ((exitqual >> 8) & 0xf) {
	case 0:
		regval = vmxctx->guest_rax;
		break;
	case 1:
		regval = vmxctx->guest_rcx;
		break;
	case 2:
		regval = vmxctx->guest_rdx;
		break;
	case 3:
		regval = vmxctx->guest_rbx;
		break;
	case 4:
		error = vmread(VMCS_GUEST_RSP, &regval);
		if (error) {
			panic("vmx_emulate_cr_access: "
			      "error %d reading guest rsp", error);
		}
		break;
	case 5:
		regval = vmxctx->guest_rbp;
		break;
	case 6:
		regval = vmxctx->guest_rsi;
		break;
	case 7:
		regval = vmxctx->guest_rdi;
		break;
	case 8:
		regval = vmxctx->guest_r8;
		break;
	case 9:
		regval = vmxctx->guest_r9;
		break;
	case 10:
		regval = vmxctx->guest_r10;
		break;
	case 11:
		regval = vmxctx->guest_r11;
		break;
	case 12:
		regval = vmxctx->guest_r12;
		break;
	case 13:
		regval = vmxctx->guest_r13;
		break;
	case 14:
		regval = vmxctx->guest_r14;
		break;
	case 15:
		regval = vmxctx->guest_r15;
		break;
	}

	if (cr == 0) {
		ones_mask = cr0_ones_mask;
		zeros_mask = cr0_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR0;
		vmcs_shadow_cr = VMCS_CR0_SHADOW;
	} else {
		ones_mask = cr4_ones_mask;
		zeros_mask = cr4_zeros_mask;
		vmcs_guest_cr = VMCS_GUEST_CR4;
		vmcs_shadow_cr = VMCS_CR4_SHADOW;
	}

	error = vmwrite(vmcs_shadow_cr, regval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d shadow",
		      error, cr);
	}

	crval = regval | ones_mask;
	crval &= ~zeros_mask;
	error = vmwrite(vmcs_guest_cr, crval);
	if (error) {
		panic("vmx_emulate_cr_access: error %d writing cr%d",
		      error, cr);
	}

	if (cr == 0 && regval & CR0_PG) {
		uint64_t efer, entry_ctls;

		/*
		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
		 * the "IA-32e mode guest" bit in VM-entry control must be
		 * equal.
		 */
		error = vmread(VMCS_GUEST_IA32_EFER, &efer);
		if (error) {
			panic("vmx_emulate_cr_access: error %d efer read",
			      error);
		}
		if (efer & EFER_LME) {
			efer |= EFER_LMA;
			error = vmwrite(VMCS_GUEST_IA32_EFER, efer);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " efer write", error);
			}
			error = vmread(VMCS_ENTRY_CTLS, &entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " entry ctls read", error);
			}
			entry_ctls |= VM_ENTRY_GUEST_LMA;
			error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
			if (error) {
				panic("vmx_emulate_cr_access: error %d"
				      " entry ctls write", error);
			}
		}
	}

	return (HANDLED);
}

static int
vmx_ept_fault(struct vm *vm, int cpu,
	      uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
	      uint64_t cr3, uint64_t ept_qual, struct vie *vie)
{
	int read, write, error;

	/* EPT violation on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (UNHANDLED);

	/* EPT violation must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read | write) == 0)
		return (UNHANDLED);

	/*
	 * The EPT violation must have been caused by accessing a
	 * guest-physical address that is a translation of a guest-linear
	 * address.
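	 *
	 * EPT_VIOLATION_GLA_VALID means that a guest linear address is
	 * reported for this violation and EPT_VIOLATION_XLAT_VALID that the
	 * access was a translation of that linear address rather than part
	 * of a guest page table walk; both are required before attempting
	 * instruction emulation below (Intel SDM).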
	 */
	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
		return (UNHANDLED);
	}

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
		return (UNHANDLED);

	if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
		return (UNHANDLED);

	/*
	 * Check if this is a local apic access
	 */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
		return (UNHANDLED);

	error = vmm_emulate_instruction(vm, cpu, gpa, vie,
					lapic_mmio_read, lapic_mmio_write, 0);

	return (error ? UNHANDLED : HANDLED);
}

static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, handled;
	struct vmcs *vmcs;
	struct vmxctx *vmxctx;
	uint32_t eax, ecx, edx;
	uint64_t qual, gla, gpa, cr3, intr_info;

	handled = 0;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	qual = vmexit->u.vmx.exit_qualification;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	switch (vmexit->u.vmx.exit_reason) {
	case EXIT_REASON_CR_ACCESS:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
		break;
	case EXIT_REASON_RDMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
		ecx = vmxctx->guest_rcx;
		error = emulate_rdmsr(vmx->vm, vcpu, ecx);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		} else
			handled = 1;
		break;
	case EXIT_REASON_WRMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
				      (uint64_t)edx << 32 | eax);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		} else
			handled = 1;
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		/*
		 * If there is an event waiting to be injected then there is
		 * no need to 'hlt'.
		 */
		error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
		if (error)
			panic("vmx_exit_process: vmread(intrinfo) %d", error);

		if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
			handled = 1;
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
		} else
			vmexit->exitcode = VM_EXITCODE_HLT;
		break;
	case EXIT_REASON_MTF:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		break;
	case EXIT_REASON_PAUSE:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
		vmx_clear_int_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
		return (1);
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
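		 *
		 * The "acknowledge interrupt on exit" VM-exit control is not
		 * used, so the host interrupt is still pending here; it will
		 * be serviced once vmx_run() re-enables interrupts after the
		 * VM exit.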
		 */

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
		return (1);
	case EXIT_REASON_INOUT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		break;
	case EXIT_REASON_CPUID:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EPT_FAULT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
		gla = vmcs_gla();
		gpa = vmcs_gpa();
		cr3 = vmcs_guest_cr3();
		handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
					vmexit->rip, vmexit->inst_length,
					cr3, qual, &vmexit->u.paging.vie);
		if (!handled) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = gpa;
		}
		break;
	default:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vm_exit_update_rip(vmexit);
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;

		/*
		 * Special case for spinning up an AP - exit to userspace to
		 * give the controlling process a chance to intercept and
		 * spin up a thread for the AP.
		 */
		if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
			handled = 0;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

static int
vmx_run(void *arg, int vcpu, register_t rip)
{
	int error, vie, rc, handled, astpending;
	uint32_t exit_reason;
	struct vmx *vmx;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;

	vmx = arg;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vmxctx->launched = 0;

	astpending = 0;
	vmexit = vm_exitinfo(vmx->vm, vcpu);

	/*
	 * XXX Can we avoid doing this every time we do a vm run?
	 */
	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may setup the virtual machine
	 * from a different process than the one that actually runs it.
1465 * 1466 * If the life of a virtual machine was spent entirely in the context 1467 * of a single process we could do this once in vmcs_set_defaults(). 1468 */ 1469 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1470 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1471 1472 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1473 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1474 1475 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1476 panic("vmx_run: error %d setting up pcpu defaults", error); 1477 1478 do { 1479 lapic_timer_tick(vmx->vm, vcpu); 1480 vmx_inject_interrupts(vmx, vcpu); 1481 vmx_run_trace(vmx, vcpu); 1482 rc = vmx_setjmp(vmxctx); 1483 #ifdef SETJMP_TRACE 1484 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1485 #endif 1486 switch (rc) { 1487 case VMX_RETURN_DIRECT: 1488 if (vmxctx->launched == 0) { 1489 vmxctx->launched = 1; 1490 vmx_launch(vmxctx); 1491 } else 1492 vmx_resume(vmxctx); 1493 panic("vmx_launch/resume should not return"); 1494 break; 1495 case VMX_RETURN_LONGJMP: 1496 break; /* vm exit */ 1497 case VMX_RETURN_AST: 1498 astpending = 1; 1499 break; 1500 case VMX_RETURN_VMRESUME: 1501 vie = vmcs_instruction_error(); 1502 if (vmxctx->launch_error == VM_FAIL_INVALID || 1503 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1504 printf("vmresume error %d vmcs inst error %d\n", 1505 vmxctx->launch_error, vie); 1506 goto err_exit; 1507 } 1508 vmx_launch(vmxctx); /* try to launch the guest */ 1509 panic("vmx_launch should not return"); 1510 break; 1511 case VMX_RETURN_VMLAUNCH: 1512 vie = vmcs_instruction_error(); 1513 #if 1 1514 printf("vmlaunch error %d vmcs inst error %d\n", 1515 vmxctx->launch_error, vie); 1516 #endif 1517 goto err_exit; 1518 default: 1519 panic("vmx_setjmp returned %d", rc); 1520 } 1521 1522 /* enable interrupts */ 1523 enable_intr(); 1524 1525 /* collect some basic information for VM exit processing */ 1526 vmexit->rip = rip = vmcs_guest_rip(); 1527 vmexit->inst_length = vmexit_instruction_length(); 1528 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1529 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1530 1531 if (astpending) { 1532 handled = 1; 1533 vmexit->inst_length = 0; 1534 vmexit->exitcode = VM_EXITCODE_BOGUS; 1535 vmx_astpending_trace(vmx, vcpu, rip); 1536 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); 1537 break; 1538 } 1539 1540 handled = vmx_exit_process(vmx, vcpu, vmexit); 1541 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1542 1543 } while (handled); 1544 1545 /* 1546 * If a VM exit has been handled then the exitcode must be BOGUS 1547 * If a VM exit is not handled then the exitcode must not be BOGUS 1548 */ 1549 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1550 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1551 panic("Mismatch between handled (%d) and exitcode (%d)", 1552 handled, vmexit->exitcode); 1553 } 1554 1555 if (!handled) 1556 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); 1557 1558 VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1559 1560 /* 1561 * XXX 1562 * We need to do this to ensure that any VMCS state cached by the 1563 * processor is flushed to memory. We need to do this in case the 1564 * VM moves to a different cpu the next time it runs. 1565 * 1566 * Can we avoid doing this? 
	 */
	VMCLEAR(vmcs);
	return (0);

err_exit:
	vmexit->exitcode = VM_EXITCODE_VMX;
	vmexit->u.vmx.exit_reason = (uint32_t)-1;
	vmexit->u.vmx.exit_qualification = (uint32_t)-1;
	vmexit->u.vmx.error = vie;
	VMCLEAR(vmcs);
	return (ENOEXEC);
}

static void
vmx_vmcleanup(void *arg)
{
	int error;
	struct vmx *vmx = arg;

	/*
	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
	 */
	error = vmclear(&vmx->vmcs[0]);
	if (error != 0)
		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);

	ept_vmcleanup(vmx);
	free(vmx, M_VMX);

	return;
}

static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{

	switch (reg) {
	case VM_REG_GUEST_RAX:
		return (&vmxctx->guest_rax);
	case VM_REG_GUEST_RBX:
		return (&vmxctx->guest_rbx);
	case VM_REG_GUEST_RCX:
		return (&vmxctx->guest_rcx);
	case VM_REG_GUEST_RDX:
		return (&vmxctx->guest_rdx);
	case VM_REG_GUEST_RSI:
		return (&vmxctx->guest_rsi);
	case VM_REG_GUEST_RDI:
		return (&vmxctx->guest_rdi);
	case VM_REG_GUEST_RBP:
		return (&vmxctx->guest_rbp);
	case VM_REG_GUEST_R8:
		return (&vmxctx->guest_r8);
	case VM_REG_GUEST_R9:
		return (&vmxctx->guest_r9);
	case VM_REG_GUEST_R10:
		return (&vmxctx->guest_r10);
	case VM_REG_GUEST_R11:
		return (&vmxctx->guest_r11);
	case VM_REG_GUEST_R12:
		return (&vmxctx->guest_r12);
	case VM_REG_GUEST_R13:
		return (&vmxctx->guest_r13);
	case VM_REG_GUEST_R14:
		return (&vmxctx->guest_r14);
	case VM_REG_GUEST_R15:
		return (&vmxctx->guest_r15);
	default:
		break;
	}
	return (NULL);
}

static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*retval = *regp;
		return (0);
	} else
		return (EINVAL);
}

static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
	register_t *regp;

	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
		*regp = val;
		return (0);
	} else
		return (EINVAL);
}

static int
vmx_shadow_reg(int reg)
{
	int shreg;

	shreg = -1;

	switch (reg) {
	case VM_REG_GUEST_CR0:
		shreg = VMCS_CR0_SHADOW;
		break;
	case VM_REG_GUEST_CR4:
		shreg = VMCS_CR4_SHADOW;
		break;
	default:
		break;
	}

	return (shreg);
}

static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
	int running, hostcpu;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
		return (0);

	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
}

static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
	int error, hostcpu, running, shadow;
	uint64_t ctls;
	struct vmx *vmx = arg;

	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
	if (running && hostcpu != curcpu)
		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);

	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
		return (0);

	error =
	    vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);

	if (error == 0) {
		/*
		 * If the "load EFER" VM-entry control is 1 then the
		 * value of EFER.LMA must be identical to "IA-32e mode guest"
		 * bit in the VM-entry control.
		 */
		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
		    (reg == VM_REG_GUEST_EFER)) {
			vmcs_getreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
			if (val & EFER_LMA)
				ctls |= VM_ENTRY_GUEST_LMA;
			else
				ctls &= ~VM_ENTRY_GUEST_LMA;
			vmcs_setreg(&vmx->vmcs[vcpu], running,
				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
		}

		shadow = vmx_shadow_reg(reg);
		if (shadow > 0) {
			/*
			 * Store the unmodified value in the shadow
			 */
			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
					    VMCS_IDENT(shadow), val);
		}
	}

	return (error);
}

static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmx *vmx = arg;

	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
}

static int
vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
	   int code_valid)
{
	int error;
	uint64_t info;
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];

	static uint32_t type_map[VM_EVENT_MAX] = {
		0x1,		/* VM_EVENT_NONE */
		0x0,		/* VM_HW_INTR */
		0x2,		/* VM_NMI */
		0x3,		/* VM_HW_EXCEPTION */
		0x4,		/* VM_SW_INTR */
		0x5,		/* VM_PRIV_SW_EXCEPTION */
		0x6,		/* VM_SW_EXCEPTION */
	};

	/*
	 * If there is already an exception pending to be delivered to the
	 * vcpu then just return.
	 */
	error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
	if (error)
		return (error);

	if (info & VMCS_INTERRUPTION_INFO_VALID)
		return (EAGAIN);

	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
	info |= VMCS_INTERRUPTION_INFO_VALID;
	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
	if (error != 0)
		return (error);

	if (code_valid) {
		error = vmcs_setreg(vmcs, 0,
				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
				    code);
	}
	return (error);
}

static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct vmx *vmx = arg;
	int vcap;
	int ret;

	ret = ENOENT;

	vcap = vmx->cap[vcpu].set;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit)
			ret = 0;
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit)
			ret = 0;
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap)
			ret = 0;
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest)
			ret = 0;
		break;
	default:
		break;
	}

	if (ret == 0)
		*retval = (vcap & (1 << type)) ?
		    1 : 0;

	return (ret);
}

static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
	struct vmx *vmx = arg;
	struct vmcs *vmcs = &vmx->vmcs[vcpu];
	uint32_t baseval;
	uint32_t *pptr;
	int error;
	int flag;
	int reg;
	int retval;

	retval = ENOENT;
	pptr = NULL;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		if (cap_halt_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_HLT_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_MTRAP_EXIT:
		if (cap_monitor_trap) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_MTF;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_PAUSE_EXIT:
		if (cap_pause_exit) {
			retval = 0;
			pptr = &vmx->cap[vcpu].proc_ctls;
			baseval = *pptr;
			flag = PROCBASED_PAUSE_EXITING;
			reg = VMCS_PRI_PROC_BASED_CTLS;
		}
		break;
	case VM_CAP_UNRESTRICTED_GUEST:
		if (cap_unrestricted_guest) {
			retval = 0;
			baseval = procbased_ctls2;
			flag = PROCBASED2_UNRESTRICTED_GUEST;
			reg = VMCS_SEC_PROC_BASED_CTLS;
		}
		break;
	default:
		break;
	}

	if (retval == 0) {
		if (val) {
			baseval |= flag;
		} else {
			baseval &= ~flag;
		}
		VMPTRLD(vmcs);
		error = vmwrite(reg, baseval);
		VMCLEAR(vmcs);

		if (error) {
			retval = error;
		} else {
			/*
			 * Update optional stored flags, and record
			 * setting
			 */
			if (pptr != NULL) {
				*pptr = baseval;
			}

			if (val) {
				vmx->cap[vcpu].set |= (1 << type);
			} else {
				vmx->cap[vcpu].set &= ~(1 << type);
			}
		}
	}

	return (retval);
}

struct vmm_ops vmm_ops_intel = {
	vmx_init,
	vmx_cleanup,
	vmx_vminit,
	vmx_run,
	vmx_vmcleanup,
	ept_vmmmap_set,
	ept_vmmmap_get,
	vmx_getreg,
	vmx_setreg,
	vmx_getdesc,
	vmx_setdesc,
	vmx_inject,
	vmx_getcap,
	vmx_setcap
};