1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/smp.h> 37 #include <sys/kernel.h> 38 #include <sys/malloc.h> 39 #include <sys/pcpu.h> 40 #include <sys/proc.h> 41 #include <sys/sysctl.h> 42 43 #include <vm/vm.h> 44 #include <vm/pmap.h> 45 46 #include <machine/psl.h> 47 #include <machine/cpufunc.h> 48 #include <machine/md_var.h> 49 #include <machine/reg.h> 50 #include <machine/segments.h> 51 #include <machine/smp.h> 52 #include <machine/specialreg.h> 53 #include <machine/vmparam.h> 54 55 #include <machine/vmm.h> 56 #include <machine/vmm_dev.h> 57 #include <machine/vmm_instruction_emul.h> 58 #include "vmm_lapic.h" 59 #include "vmm_host.h" 60 #include "vmm_ioport.h" 61 #include "vmm_ktr.h" 62 #include "vmm_stat.h" 63 #include "vatpic.h" 64 #include "vlapic.h" 65 #include "vlapic_priv.h" 66 67 #include "ept.h" 68 #include "vmx_cpufunc.h" 69 #include "vmx.h" 70 #include "vmx_msr.h" 71 #include "x86.h" 72 #include "vmx_controls.h" 73 74 #define PINBASED_CTLS_ONE_SETTING \ 75 (PINBASED_EXTINT_EXITING | \ 76 PINBASED_NMI_EXITING | \ 77 PINBASED_VIRTUAL_NMI) 78 #define PINBASED_CTLS_ZERO_SETTING 0 79 80 #define PROCBASED_CTLS_WINDOW_SETTING \ 81 (PROCBASED_INT_WINDOW_EXITING | \ 82 PROCBASED_NMI_WINDOW_EXITING) 83 84 #define PROCBASED_CTLS_ONE_SETTING \ 85 (PROCBASED_SECONDARY_CONTROLS | \ 86 PROCBASED_MWAIT_EXITING | \ 87 PROCBASED_MONITOR_EXITING | \ 88 PROCBASED_IO_EXITING | \ 89 PROCBASED_MSR_BITMAPS | \ 90 PROCBASED_CTLS_WINDOW_SETTING | \ 91 PROCBASED_CR8_LOAD_EXITING | \ 92 PROCBASED_CR8_STORE_EXITING) 93 #define PROCBASED_CTLS_ZERO_SETTING \ 94 (PROCBASED_CR3_LOAD_EXITING | \ 95 PROCBASED_CR3_STORE_EXITING | \ 96 PROCBASED_IO_BITMAPS) 97 98 #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 99 #define PROCBASED_CTLS2_ZERO_SETTING 0 100 101 #define VM_EXIT_CTLS_ONE_SETTING \ 102 (VM_EXIT_SAVE_DEBUG_CONTROLS | \ 103 VM_EXIT_HOST_LMA | \ 104 VM_EXIT_SAVE_EFER | \ 105 VM_EXIT_LOAD_EFER | \ 106 VM_EXIT_ACKNOWLEDGE_INTERRUPT) 107 108 #define VM_EXIT_CTLS_ZERO_SETTING 0 109 110 #define 
VM_ENTRY_CTLS_ONE_SETTING \ 111 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 112 VM_ENTRY_LOAD_EFER) 113 114 #define VM_ENTRY_CTLS_ZERO_SETTING \ 115 (VM_ENTRY_INTO_SMM | \ 116 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 117 118 #define HANDLED 1 119 #define UNHANDLED 0 120 121 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 122 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); 123 124 SYSCTL_DECL(_hw_vmm); 125 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 126 127 int vmxon_enabled[MAXCPU]; 128 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 129 130 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 131 static uint32_t exit_ctls, entry_ctls; 132 133 static uint64_t cr0_ones_mask, cr0_zeros_mask; 134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 135 &cr0_ones_mask, 0, NULL); 136 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 137 &cr0_zeros_mask, 0, NULL); 138 139 static uint64_t cr4_ones_mask, cr4_zeros_mask; 140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 141 &cr4_ones_mask, 0, NULL); 142 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 143 &cr4_zeros_mask, 0, NULL); 144 145 static int vmx_initialized; 146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 147 &vmx_initialized, 0, "Intel VMX initialized"); 148 149 /* 150 * Optional capabilities 151 */ 152 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); 153 154 static int cap_halt_exit; 155 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, 156 "HLT triggers a VM-exit"); 157 158 static int cap_pause_exit; 159 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 160 0, "PAUSE triggers a VM-exit"); 161 162 static int cap_unrestricted_guest; 163 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, 164 &cap_unrestricted_guest, 0, "Unrestricted guests"); 165 166 static int cap_monitor_trap; 167 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, 168 &cap_monitor_trap, 0, "Monitor trap flag"); 169 170 static int cap_invpcid; 171 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 172 0, "Guests are allowed to use INVPCID"); 173 174 static int virtual_interrupt_delivery; 175 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, 176 &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); 177 178 static int posted_interrupts; 179 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, 180 &posted_interrupts, 0, "APICv posted interrupt support"); 181 182 static int pirvec = -1; 183 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, 184 &pirvec, 0, "APICv posted interrupt vector"); 185 186 static struct unrhdr *vpid_unr; 187 static u_int vpid_alloc_failed; 188 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 189 &vpid_alloc_failed, 0, NULL); 190 191 static int guest_l1d_flush; 192 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, 193 &guest_l1d_flush, 0, NULL); 194 195 uint64_t vmx_msr_flush_cmd; 196 197 /* 198 * The definitions of SDT probes for VMX. 
199 */ 200 201 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, 202 "struct vmx *", "int", "struct vm_exit *"); 203 204 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, 205 "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); 206 207 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, 208 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 209 210 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, 211 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 212 213 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, 214 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); 215 216 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, 217 "struct vmx *", "int", "struct vm_exit *"); 218 219 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, 220 "struct vmx *", "int", "struct vm_exit *"); 221 222 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, 223 "struct vmx *", "int", "struct vm_exit *"); 224 225 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, 226 "struct vmx *", "int", "struct vm_exit *"); 227 228 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, 229 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 230 231 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, 232 "struct vmx *", "int", "struct vm_exit *"); 233 234 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, 235 "struct vmx *", "int", "struct vm_exit *"); 236 237 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, 238 "struct vmx *", "int", "struct vm_exit *"); 239 240 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, 241 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); 242 243 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, 244 "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); 245 246 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, 247 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 248 249 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, 250 "struct vmx *", "int", "struct vm_exit *"); 251 252 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, 253 "struct vmx *", "int", "struct vm_exit *"); 254 255 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, 256 "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); 257 258 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, 259 "struct vmx *", "int", "struct vm_exit *"); 260 261 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, 262 "struct vmx *", "int", "struct vm_exit *"); 263 264 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, 265 "struct vmx *", "int", "struct vm_exit *"); 266 267 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, 268 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 269 270 SDT_PROBE_DEFINE4(vmm, vmx, exit, return, 271 "struct vmx *", "int", "struct vm_exit *", "int"); 272 273 /* 274 * Use the last page below 4GB as the APIC access address. This address is 275 * occupied by the boot firmware so it is guaranteed that it will not conflict 276 * with a page in system memory. 
277 */ 278 #define APIC_ACCESS_ADDRESS 0xFFFFF000 279 280 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); 281 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); 282 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); 283 static void vmx_inject_pir(struct vlapic *vlapic); 284 285 #ifdef KTR 286 static const char * 287 exit_reason_to_str(int reason) 288 { 289 static char reasonbuf[32]; 290 291 switch (reason) { 292 case EXIT_REASON_EXCEPTION: 293 return "exception"; 294 case EXIT_REASON_EXT_INTR: 295 return "extint"; 296 case EXIT_REASON_TRIPLE_FAULT: 297 return "triplefault"; 298 case EXIT_REASON_INIT: 299 return "init"; 300 case EXIT_REASON_SIPI: 301 return "sipi"; 302 case EXIT_REASON_IO_SMI: 303 return "iosmi"; 304 case EXIT_REASON_SMI: 305 return "smi"; 306 case EXIT_REASON_INTR_WINDOW: 307 return "intrwindow"; 308 case EXIT_REASON_NMI_WINDOW: 309 return "nmiwindow"; 310 case EXIT_REASON_TASK_SWITCH: 311 return "taskswitch"; 312 case EXIT_REASON_CPUID: 313 return "cpuid"; 314 case EXIT_REASON_GETSEC: 315 return "getsec"; 316 case EXIT_REASON_HLT: 317 return "hlt"; 318 case EXIT_REASON_INVD: 319 return "invd"; 320 case EXIT_REASON_INVLPG: 321 return "invlpg"; 322 case EXIT_REASON_RDPMC: 323 return "rdpmc"; 324 case EXIT_REASON_RDTSC: 325 return "rdtsc"; 326 case EXIT_REASON_RSM: 327 return "rsm"; 328 case EXIT_REASON_VMCALL: 329 return "vmcall"; 330 case EXIT_REASON_VMCLEAR: 331 return "vmclear"; 332 case EXIT_REASON_VMLAUNCH: 333 return "vmlaunch"; 334 case EXIT_REASON_VMPTRLD: 335 return "vmptrld"; 336 case EXIT_REASON_VMPTRST: 337 return "vmptrst"; 338 case EXIT_REASON_VMREAD: 339 return "vmread"; 340 case EXIT_REASON_VMRESUME: 341 return "vmresume"; 342 case EXIT_REASON_VMWRITE: 343 return "vmwrite"; 344 case EXIT_REASON_VMXOFF: 345 return "vmxoff"; 346 case EXIT_REASON_VMXON: 347 return "vmxon"; 348 case EXIT_REASON_CR_ACCESS: 349 return "craccess"; 350 case EXIT_REASON_DR_ACCESS: 351 return "draccess"; 352 case EXIT_REASON_INOUT: 353 return "inout"; 354 case EXIT_REASON_RDMSR: 355 return "rdmsr"; 356 case EXIT_REASON_WRMSR: 357 return "wrmsr"; 358 case EXIT_REASON_INVAL_VMCS: 359 return "invalvmcs"; 360 case EXIT_REASON_INVAL_MSR: 361 return "invalmsr"; 362 case EXIT_REASON_MWAIT: 363 return "mwait"; 364 case EXIT_REASON_MTF: 365 return "mtf"; 366 case EXIT_REASON_MONITOR: 367 return "monitor"; 368 case EXIT_REASON_PAUSE: 369 return "pause"; 370 case EXIT_REASON_MCE_DURING_ENTRY: 371 return "mce-during-entry"; 372 case EXIT_REASON_TPR: 373 return "tpr"; 374 case EXIT_REASON_APIC_ACCESS: 375 return "apic-access"; 376 case EXIT_REASON_GDTR_IDTR: 377 return "gdtridtr"; 378 case EXIT_REASON_LDTR_TR: 379 return "ldtrtr"; 380 case EXIT_REASON_EPT_FAULT: 381 return "eptfault"; 382 case EXIT_REASON_EPT_MISCONFIG: 383 return "eptmisconfig"; 384 case EXIT_REASON_INVEPT: 385 return "invept"; 386 case EXIT_REASON_RDTSCP: 387 return "rdtscp"; 388 case EXIT_REASON_VMX_PREEMPT: 389 return "vmxpreempt"; 390 case EXIT_REASON_INVVPID: 391 return "invvpid"; 392 case EXIT_REASON_WBINVD: 393 return "wbinvd"; 394 case EXIT_REASON_XSETBV: 395 return "xsetbv"; 396 case EXIT_REASON_APIC_WRITE: 397 return "apic-write"; 398 default: 399 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 400 return (reasonbuf); 401 } 402 } 403 #endif /* KTR */ 404 405 static int 406 vmx_allow_x2apic_msrs(struct vmx *vmx) 407 { 408 int i, error; 409 410 error = 0; 411 412 /* 413 * Allow readonly access to the following x2APIC MSRs from the guest. 
 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the
	 * guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number
	 * allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU]
		 * range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}
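/*
 * Illustrative sketch only (informational, not part of the driver): this is
 * roughly how vmx_vminit() below consumes the VPID allocator.
 *
 *	uint16_t vpid[VM_MAXCPU];
 *
 *	vpid_alloc(vpid, VM_MAXCPU);	(unique VPIDs, or the overflow
 *					 range [1,VM_MAXCPU] on failure)
 *	...
 *	vpid_free(vpid[i]);		(a no-op for the reserved range
 *					 [0,VM_MAXCPU])
 */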
static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON and VMXOFF are not required to invalidate any TLB
		 * caching structures, so invalidate them explicitly here to
		 * prevent stale entries from being retained in the TLB
		 * across distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	if (pirvec >= 0)
		lapic_ipi_free(pirvec);

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;
	uint64_t feature_control;

	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		wrmsr(MSR_IA32_FEATURE_CONTROL,
		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
		    IA32_FEATURE_CONTROL_LOCK);
	}

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static void
vmx_restore(void)
{

	if (vmxon_enabled[curcpu])
		vmxon(vmxon_region[curcpu]);
}

static int
vmx_init(int ipinum)
{
	int error, use_tpr_shadow;
	uint64_t basic, fixed0, fixed1, feature_control;
	uint32_t tmp, procbased2_vid_bits;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
633 */ 634 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 635 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && 636 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 637 printf("vmx_init: VMX operation disabled by BIOS\n"); 638 return (ENXIO); 639 } 640 641 /* 642 * Verify capabilities MSR_VMX_BASIC: 643 * - bit 54 indicates support for INS/OUTS decoding 644 */ 645 basic = rdmsr(MSR_VMX_BASIC); 646 if ((basic & (1UL << 54)) == 0) { 647 printf("vmx_init: processor does not support desired basic " 648 "capabilities\n"); 649 return (EINVAL); 650 } 651 652 /* Check support for primary processor-based VM-execution controls */ 653 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 654 MSR_VMX_TRUE_PROCBASED_CTLS, 655 PROCBASED_CTLS_ONE_SETTING, 656 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 657 if (error) { 658 printf("vmx_init: processor does not support desired primary " 659 "processor-based controls\n"); 660 return (error); 661 } 662 663 /* Clear the processor-based ctl bits that are set on demand */ 664 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 665 666 /* Check support for secondary processor-based VM-execution controls */ 667 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 668 MSR_VMX_PROCBASED_CTLS2, 669 PROCBASED_CTLS2_ONE_SETTING, 670 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 671 if (error) { 672 printf("vmx_init: processor does not support desired secondary " 673 "processor-based controls\n"); 674 return (error); 675 } 676 677 /* Check support for VPID */ 678 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 679 PROCBASED2_ENABLE_VPID, 0, &tmp); 680 if (error == 0) 681 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 682 683 /* Check support for pin-based VM-execution controls */ 684 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 685 MSR_VMX_TRUE_PINBASED_CTLS, 686 PINBASED_CTLS_ONE_SETTING, 687 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 688 if (error) { 689 printf("vmx_init: processor does not support desired " 690 "pin-based controls\n"); 691 return (error); 692 } 693 694 /* Check support for VM-exit controls */ 695 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 696 VM_EXIT_CTLS_ONE_SETTING, 697 VM_EXIT_CTLS_ZERO_SETTING, 698 &exit_ctls); 699 if (error) { 700 printf("vmx_init: processor does not support desired " 701 "exit controls\n"); 702 return (error); 703 } 704 705 /* Check support for VM-entry controls */ 706 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, 707 VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, 708 &entry_ctls); 709 if (error) { 710 printf("vmx_init: processor does not support desired " 711 "entry controls\n"); 712 return (error); 713 } 714 715 /* 716 * Check support for optional features by testing them 717 * as individual bits 718 */ 719 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 720 MSR_VMX_TRUE_PROCBASED_CTLS, 721 PROCBASED_HLT_EXITING, 0, 722 &tmp) == 0); 723 724 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 725 MSR_VMX_PROCBASED_CTLS, 726 PROCBASED_MTF, 0, 727 &tmp) == 0); 728 729 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 730 MSR_VMX_TRUE_PROCBASED_CTLS, 731 PROCBASED_PAUSE_EXITING, 0, 732 &tmp) == 0); 733 734 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 735 MSR_VMX_PROCBASED_CTLS2, 736 PROCBASED2_UNRESTRICTED_GUEST, 0, 737 &tmp) == 0); 738 739 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 740 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 741 &tmp) == 0); 742 743 /* 744 * Check support 
for virtual interrupt delivery. 745 */ 746 procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | 747 PROCBASED2_VIRTUALIZE_X2APIC_MODE | 748 PROCBASED2_APIC_REGISTER_VIRTUALIZATION | 749 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); 750 751 use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 752 MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, 753 &tmp) == 0); 754 755 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 756 procbased2_vid_bits, 0, &tmp); 757 if (error == 0 && use_tpr_shadow) { 758 virtual_interrupt_delivery = 1; 759 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", 760 &virtual_interrupt_delivery); 761 } 762 763 if (virtual_interrupt_delivery) { 764 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 765 procbased_ctls2 |= procbased2_vid_bits; 766 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; 767 768 /* 769 * No need to emulate accesses to %CR8 if virtual 770 * interrupt delivery is enabled. 771 */ 772 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; 773 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; 774 775 /* 776 * Check for Posted Interrupts only if Virtual Interrupt 777 * Delivery is enabled. 778 */ 779 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 780 MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, 781 &tmp); 782 if (error == 0) { 783 pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : 784 &IDTVEC(justreturn)); 785 if (pirvec < 0) { 786 if (bootverbose) { 787 printf("vmx_init: unable to allocate " 788 "posted interrupt vector\n"); 789 } 790 } else { 791 posted_interrupts = 1; 792 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", 793 &posted_interrupts); 794 } 795 } 796 } 797 798 if (posted_interrupts) 799 pinbased_ctls |= PINBASED_POSTED_INTERRUPT; 800 801 /* Initialize EPT */ 802 error = ept_init(ipinum); 803 if (error) { 804 printf("vmx_init: ept initialization failed (%d)\n", error); 805 return (error); 806 } 807 808 guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0; 809 TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); 810 if (guest_l1d_flush && 811 (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0) 812 vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D; 813 814 /* 815 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 816 */ 817 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 818 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 819 cr0_ones_mask = fixed0 & fixed1; 820 cr0_zeros_mask = ~fixed0 & ~fixed1; 821 822 /* 823 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 824 * if unrestricted guest execution is allowed. 825 */ 826 if (cap_unrestricted_guest) 827 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 828 829 /* 830 * Do not allow the guest to set CR0_NW or CR0_CD. 
831 */ 832 cr0_zeros_mask |= (CR0_NW | CR0_CD); 833 834 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 835 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 836 cr4_ones_mask = fixed0 & fixed1; 837 cr4_zeros_mask = ~fixed0 & ~fixed1; 838 839 vpid_init(); 840 841 vmx_msr_init(); 842 843 /* enable VMX operation */ 844 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 845 846 vmx_initialized = 1; 847 848 return (0); 849 } 850 851 static void 852 vmx_trigger_hostintr(int vector) 853 { 854 uintptr_t func; 855 struct gate_descriptor *gd; 856 857 gd = &idt[vector]; 858 859 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 860 "invalid vector %d", vector)); 861 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 862 vector)); 863 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 864 "has invalid type %d", vector, gd->gd_type)); 865 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 866 "has invalid dpl %d", vector, gd->gd_dpl)); 867 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 868 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 869 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 870 "IST %d", vector, gd->gd_ist)); 871 872 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 873 vmx_call_isr(func); 874 } 875 876 static int 877 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 878 { 879 int error, mask_ident, shadow_ident; 880 uint64_t mask_value; 881 882 if (which != 0 && which != 4) 883 panic("vmx_setup_cr_shadow: unknown cr%d", which); 884 885 if (which == 0) { 886 mask_ident = VMCS_CR0_MASK; 887 mask_value = cr0_ones_mask | cr0_zeros_mask; 888 shadow_ident = VMCS_CR0_SHADOW; 889 } else { 890 mask_ident = VMCS_CR4_MASK; 891 mask_value = cr4_ones_mask | cr4_zeros_mask; 892 shadow_ident = VMCS_CR4_SHADOW; 893 } 894 895 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 896 if (error) 897 return (error); 898 899 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 900 if (error) 901 return (error); 902 903 return (0); 904 } 905 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 906 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 907 908 static void * 909 vmx_vminit(struct vm *vm, pmap_t pmap) 910 { 911 uint16_t vpid[VM_MAXCPU]; 912 int i, error; 913 struct vmx *vmx; 914 struct vmcs *vmcs; 915 uint32_t exc_bitmap; 916 917 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 918 if ((uintptr_t)vmx & PAGE_MASK) { 919 panic("malloc of struct vmx not aligned on %d byte boundary", 920 PAGE_SIZE); 921 } 922 vmx->vm = vm; 923 924 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 925 926 /* 927 * Clean up EPTP-tagged guest physical and combined mappings 928 * 929 * VMX transitions are not required to invalidate any guest physical 930 * mappings. So, it may be possible for stale guest physical mappings 931 * to be present in the processor TLBs. 932 * 933 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 934 */ 935 ept_invalidate_mappings(vmx->eptp); 936 937 msr_bitmap_initialize(vmx->msr_bitmap); 938 939 /* 940 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 941 * The guest FSBASE and GSBASE are saved and restored during 942 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 943 * always restored from the vmcs host state area on vm-exit. 
944 * 945 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 946 * how they are saved/restored so can be directly accessed by the 947 * guest. 948 * 949 * MSR_EFER is saved and restored in the guest VMCS area on a 950 * VM exit and entry respectively. It is also restored from the 951 * host VMCS area on a VM exit. 952 * 953 * The TSC MSR is exposed read-only. Writes are disallowed as 954 * that will impact the host TSC. If the guest does a write 955 * the "use TSC offsetting" execution control is enabled and the 956 * difference between the host TSC and the guest TSC is written 957 * into the TSC offset in the VMCS. 958 */ 959 if (guest_msr_rw(vmx, MSR_GSBASE) || 960 guest_msr_rw(vmx, MSR_FSBASE) || 961 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 962 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 963 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 964 guest_msr_rw(vmx, MSR_EFER) || 965 guest_msr_ro(vmx, MSR_TSC)) 966 panic("vmx_vminit: error setting guest msr access"); 967 968 vpid_alloc(vpid, VM_MAXCPU); 969 970 if (virtual_interrupt_delivery) { 971 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 972 APIC_ACCESS_ADDRESS); 973 /* XXX this should really return an error to the caller */ 974 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 975 } 976 977 for (i = 0; i < VM_MAXCPU; i++) { 978 vmcs = &vmx->vmcs[i]; 979 vmcs->identifier = vmx_revision(); 980 error = vmclear(vmcs); 981 if (error != 0) { 982 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 983 error, i); 984 } 985 986 vmx_msr_guest_init(vmx, i); 987 988 error = vmcs_init(vmcs); 989 KASSERT(error == 0, ("vmcs_init error %d", error)); 990 991 VMPTRLD(vmcs); 992 error = 0; 993 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 994 error += vmwrite(VMCS_EPTP, vmx->eptp); 995 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 996 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 997 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 998 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 999 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1000 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 1001 error += vmwrite(VMCS_VPID, vpid[i]); 1002 1003 /* exception bitmap */ 1004 if (vcpu_trace_exceptions(vm, i)) 1005 exc_bitmap = 0xffffffff; 1006 else 1007 exc_bitmap = 1 << IDT_MC; 1008 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); 1009 1010 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; 1011 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); 1012 1013 if (virtual_interrupt_delivery) { 1014 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 1015 error += vmwrite(VMCS_VIRTUAL_APIC, 1016 vtophys(&vmx->apic_page[i])); 1017 error += vmwrite(VMCS_EOI_EXIT0, 0); 1018 error += vmwrite(VMCS_EOI_EXIT1, 0); 1019 error += vmwrite(VMCS_EOI_EXIT2, 0); 1020 error += vmwrite(VMCS_EOI_EXIT3, 0); 1021 } 1022 if (posted_interrupts) { 1023 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 1024 error += vmwrite(VMCS_PIR_DESC, 1025 vtophys(&vmx->pir_desc[i])); 1026 } 1027 VMCLEAR(vmcs); 1028 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 1029 1030 vmx->cap[i].set = 0; 1031 vmx->cap[i].proc_ctls = procbased_ctls; 1032 vmx->cap[i].proc_ctls2 = procbased_ctls2; 1033 1034 vmx->state[i].nextrip = ~0; 1035 vmx->state[i].lastcpu = NOCPU; 1036 vmx->state[i].vpid = vpid[i]; 1037 1038 /* 1039 * Set up the CR0/4 shadows, and init the read shadow 1040 * to the power-on register value from the Intel Sys Arch. 
1041 * CR0 - 0x60000010 1042 * CR4 - 0 1043 */ 1044 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 1045 if (error != 0) 1046 panic("vmx_setup_cr0_shadow %d", error); 1047 1048 error = vmx_setup_cr4_shadow(vmcs, 0); 1049 if (error != 0) 1050 panic("vmx_setup_cr4_shadow %d", error); 1051 1052 vmx->ctx[i].pmap = pmap; 1053 } 1054 1055 return (vmx); 1056 } 1057 1058 static int 1059 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 1060 { 1061 int handled, func; 1062 1063 func = vmxctx->guest_rax; 1064 1065 handled = x86_emulate_cpuid(vm, vcpu, 1066 (uint32_t*)(&vmxctx->guest_rax), 1067 (uint32_t*)(&vmxctx->guest_rbx), 1068 (uint32_t*)(&vmxctx->guest_rcx), 1069 (uint32_t*)(&vmxctx->guest_rdx)); 1070 return (handled); 1071 } 1072 1073 static __inline void 1074 vmx_run_trace(struct vmx *vmx, int vcpu) 1075 { 1076 #ifdef KTR 1077 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1078 #endif 1079 } 1080 1081 static __inline void 1082 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 1083 int handled) 1084 { 1085 #ifdef KTR 1086 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 1087 handled ? "handled" : "unhandled", 1088 exit_reason_to_str(exit_reason), rip); 1089 #endif 1090 } 1091 1092 static __inline void 1093 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 1094 { 1095 #ifdef KTR 1096 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 1097 #endif 1098 } 1099 1100 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1101 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); 1102 1103 /* 1104 * Invalidate guest mappings identified by its vpid from the TLB. 1105 */ 1106 static __inline void 1107 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) 1108 { 1109 struct vmxstate *vmxstate; 1110 struct invvpid_desc invvpid_desc; 1111 1112 vmxstate = &vmx->state[vcpu]; 1113 if (vmxstate->vpid == 0) 1114 return; 1115 1116 if (!running) { 1117 /* 1118 * Set the 'lastcpu' to an invalid host cpu. 1119 * 1120 * This will invalidate TLB entries tagged with the vcpu's 1121 * vpid the next time it runs via vmx_set_pcpu_defaults(). 1122 */ 1123 vmxstate->lastcpu = NOCPU; 1124 return; 1125 } 1126 1127 KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " 1128 "critical section", __func__, vcpu)); 1129 1130 /* 1131 * Invalidate all mappings tagged with 'vpid' 1132 * 1133 * We do this because this vcpu was executing on a different host 1134 * cpu when it last ran. We do not track whether it invalidated 1135 * mappings associated with its 'vpid' during that run. So we must 1136 * assume that the mappings associated with 'vpid' on 'curcpu' are 1137 * stale and invalidate them. 1138 * 1139 * Note that we incur this penalty only when the scheduler chooses to 1140 * move the thread associated with this vcpu between host cpus. 1141 * 1142 * Note also that this will invalidate mappings tagged with 'vpid' 1143 * for "all" EP4TAs. 1144 */ 1145 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { 1146 invvpid_desc._res1 = 0; 1147 invvpid_desc._res2 = 0; 1148 invvpid_desc.vpid = vmxstate->vpid; 1149 invvpid_desc.linear_addr = 0; 1150 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1151 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); 1152 } else { 1153 /* 1154 * The invvpid can be skipped if an invept is going to 1155 * be performed before entering the guest. The invept 1156 * will invalidate combined mappings tagged with 1157 * 'vmx->eptp' for all vpids. 
1158 */ 1159 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1160 } 1161 } 1162 1163 static void 1164 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) 1165 { 1166 struct vmxstate *vmxstate; 1167 1168 vmxstate = &vmx->state[vcpu]; 1169 if (vmxstate->lastcpu == curcpu) 1170 return; 1171 1172 vmxstate->lastcpu = curcpu; 1173 1174 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 1175 1176 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1177 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1178 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1179 vmx_invvpid(vmx, vcpu, pmap, 1); 1180 } 1181 1182 /* 1183 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1184 */ 1185 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1186 1187 static void __inline 1188 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1189 { 1190 1191 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1192 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1193 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1194 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1195 } 1196 } 1197 1198 static void __inline 1199 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1200 { 1201 1202 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1203 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1204 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1205 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1206 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1207 } 1208 1209 static void __inline 1210 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1211 { 1212 1213 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1214 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1215 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1216 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1217 } 1218 } 1219 1220 static void __inline 1221 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1222 { 1223 1224 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1225 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1226 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1227 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1228 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1229 } 1230 1231 int 1232 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) 1233 { 1234 int error; 1235 1236 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { 1237 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; 1238 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1239 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); 1240 } 1241 1242 error = vmwrite(VMCS_TSC_OFFSET, offset); 1243 1244 return (error); 1245 } 1246 1247 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ 1248 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1249 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1250 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1251 1252 static void 1253 vmx_inject_nmi(struct vmx *vmx, int vcpu) 1254 { 1255 uint32_t gi, info; 1256 1257 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1258 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1259 "interruptibility-state %#x", gi)); 1260 1261 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1262 KASSERT((info & VMCS_INTR_VALID) == 0, 
("vmx_inject_nmi: invalid " 1263 "VM-entry interruption information %#x", info)); 1264 1265 /* 1266 * Inject the virtual NMI. The vector must be the NMI IDT entry 1267 * or the VMCS entry check will fail. 1268 */ 1269 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1270 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1271 1272 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1273 1274 /* Clear the request */ 1275 vm_nmi_clear(vmx->vm, vcpu); 1276 } 1277 1278 static void 1279 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, 1280 uint64_t guestrip) 1281 { 1282 int vector, need_nmi_exiting, extint_pending; 1283 uint64_t rflags, entryinfo; 1284 uint32_t gi, info; 1285 1286 if (vmx->state[vcpu].nextrip != guestrip) { 1287 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1288 if (gi & HWINTR_BLOCKING) { 1289 VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " 1290 "cleared due to rip change: %#lx/%#lx", 1291 vmx->state[vcpu].nextrip, guestrip); 1292 gi &= ~HWINTR_BLOCKING; 1293 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1294 } 1295 } 1296 1297 if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { 1298 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " 1299 "intinfo is not valid: %#lx", __func__, entryinfo)); 1300 1301 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1302 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1303 "pending exception: %#lx/%#x", __func__, entryinfo, info)); 1304 1305 info = entryinfo; 1306 vector = info & 0xff; 1307 if (vector == IDT_BP || vector == IDT_OF) { 1308 /* 1309 * VT-x requires #BP and #OF to be injected as software 1310 * exceptions. 1311 */ 1312 info &= ~VMCS_INTR_T_MASK; 1313 info |= VMCS_INTR_T_SWEXCEPTION; 1314 } 1315 1316 if (info & VMCS_INTR_DEL_ERRCODE) 1317 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); 1318 1319 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1320 } 1321 1322 if (vm_nmi_pending(vmx->vm, vcpu)) { 1323 /* 1324 * If there are no conditions blocking NMI injection then 1325 * inject it directly here otherwise enable "NMI window 1326 * exiting" to inject it as soon as we can. 1327 * 1328 * We also check for STI_BLOCKING because some implementations 1329 * don't allow NMI injection in this case. If we are running 1330 * on a processor that doesn't have this restriction it will 1331 * immediately exit and the NMI will be injected in the 1332 * "NMI window exiting" handler. 1333 */ 1334 need_nmi_exiting = 1; 1335 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1336 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1337 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1338 if ((info & VMCS_INTR_VALID) == 0) { 1339 vmx_inject_nmi(vmx, vcpu); 1340 need_nmi_exiting = 0; 1341 } else { 1342 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1343 "due to VM-entry intr info %#x", info); 1344 } 1345 } else { 1346 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1347 "Guest Interruptibility-state %#x", gi); 1348 } 1349 1350 if (need_nmi_exiting) 1351 vmx_set_nmi_window_exiting(vmx, vcpu); 1352 } 1353 1354 extint_pending = vm_extint_pending(vmx->vm, vcpu); 1355 1356 if (!extint_pending && virtual_interrupt_delivery) { 1357 vmx_inject_pir(vlapic); 1358 return; 1359 } 1360 1361 /* 1362 * If interrupt-window exiting is already in effect then don't bother 1363 * checking for pending interrupts. This is just an optimization and 1364 * not needed for correctness. 
1365 */ 1366 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1367 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1368 "pending int_window_exiting"); 1369 return; 1370 } 1371 1372 if (!extint_pending) { 1373 /* Ask the local apic for a vector to inject */ 1374 if (!vlapic_pending_intr(vlapic, &vector)) 1375 return; 1376 1377 /* 1378 * From the Intel SDM, Volume 3, Section "Maskable 1379 * Hardware Interrupts": 1380 * - maskable interrupt vectors [16,255] can be delivered 1381 * through the local APIC. 1382 */ 1383 KASSERT(vector >= 16 && vector <= 255, 1384 ("invalid vector %d from local APIC", vector)); 1385 } else { 1386 /* Ask the legacy pic for a vector to inject */ 1387 vatpic_pending_intr(vmx->vm, &vector); 1388 1389 /* 1390 * From the Intel SDM, Volume 3, Section "Maskable 1391 * Hardware Interrupts": 1392 * - maskable interrupt vectors [0,255] can be delivered 1393 * through the INTR pin. 1394 */ 1395 KASSERT(vector >= 0 && vector <= 255, 1396 ("invalid vector %d from INTR", vector)); 1397 } 1398 1399 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1400 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1401 if ((rflags & PSL_I) == 0) { 1402 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1403 "rflags %#lx", vector, rflags); 1404 goto cantinject; 1405 } 1406 1407 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1408 if (gi & HWINTR_BLOCKING) { 1409 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1410 "Guest Interruptibility-state %#x", vector, gi); 1411 goto cantinject; 1412 } 1413 1414 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1415 if (info & VMCS_INTR_VALID) { 1416 /* 1417 * This is expected and could happen for multiple reasons: 1418 * - A vectoring VM-entry was aborted due to astpending 1419 * - A VM-exit happened during event injection. 1420 * - An exception was injected above. 1421 * - An NMI was injected above or after "NMI window exiting" 1422 */ 1423 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1424 "VM-entry intr info %#x", vector, info); 1425 goto cantinject; 1426 } 1427 1428 /* Inject the interrupt */ 1429 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1430 info |= vector; 1431 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1432 1433 if (!extint_pending) { 1434 /* Update the Local APIC ISR */ 1435 vlapic_intr_accepted(vlapic, vector); 1436 } else { 1437 vm_extint_clear(vmx->vm, vcpu); 1438 vatpic_intr_accepted(vmx->vm, vector); 1439 1440 /* 1441 * After we accepted the current ExtINT the PIC may 1442 * have posted another one. If that is the case, set 1443 * the Interrupt Window Exiting execution control so 1444 * we can inject that one too. 1445 * 1446 * Also, interrupt window exiting allows us to inject any 1447 * pending APIC vector that was preempted by the ExtINT 1448 * as soon as possible. This applies both for the software 1449 * emulated vlapic and the hardware assisted virtual APIC. 1450 */ 1451 vmx_set_int_window_exiting(vmx, vcpu); 1452 } 1453 1454 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1455 1456 return; 1457 1458 cantinject: 1459 /* 1460 * Set the Interrupt Window Exiting execution control so we can inject 1461 * the interrupt as soon as blocking condition goes away. 1462 */ 1463 vmx_set_int_window_exiting(vmx, vcpu); 1464 } 1465 1466 /* 1467 * If the Virtual NMIs execution control is '1' then the logical processor 1468 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1469 * the VMCS. 
An IRET instruction in VMX non-root operation will remove any 1470 * virtual-NMI blocking. 1471 * 1472 * This unblocking occurs even if the IRET causes a fault. In this case the 1473 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 1474 */ 1475 static void 1476 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) 1477 { 1478 uint32_t gi; 1479 1480 VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); 1481 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1482 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1483 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1484 } 1485 1486 static void 1487 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) 1488 { 1489 uint32_t gi; 1490 1491 VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); 1492 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1493 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1494 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1495 } 1496 1497 static void 1498 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) 1499 { 1500 uint32_t gi; 1501 1502 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1503 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, 1504 ("NMI blocking is not in effect %#x", gi)); 1505 } 1506 1507 static int 1508 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1509 { 1510 struct vmxctx *vmxctx; 1511 uint64_t xcrval; 1512 const struct xsave_limits *limits; 1513 1514 vmxctx = &vmx->ctx[vcpu]; 1515 limits = vmm_get_xsave_limits(); 1516 1517 /* 1518 * Note that the processor raises a GP# fault on its own if 1519 * xsetbv is executed for CPL != 0, so we do not have to 1520 * emulate that fault here. 1521 */ 1522 1523 /* Only xcr0 is supported. */ 1524 if (vmxctx->guest_rcx != 0) { 1525 vm_inject_gp(vmx->vm, vcpu); 1526 return (HANDLED); 1527 } 1528 1529 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1530 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1531 vm_inject_ud(vmx->vm, vcpu); 1532 return (HANDLED); 1533 } 1534 1535 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1536 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1537 vm_inject_gp(vmx->vm, vcpu); 1538 return (HANDLED); 1539 } 1540 1541 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1542 vm_inject_gp(vmx->vm, vcpu); 1543 return (HANDLED); 1544 } 1545 1546 /* AVX (YMM_Hi128) requires SSE. */ 1547 if (xcrval & XFEATURE_ENABLED_AVX && 1548 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1549 vm_inject_gp(vmx->vm, vcpu); 1550 return (HANDLED); 1551 } 1552 1553 /* 1554 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1555 * ZMM_Hi256, and Hi16_ZMM. 1556 */ 1557 if (xcrval & XFEATURE_AVX512 && 1558 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1559 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1560 vm_inject_gp(vmx->vm, vcpu); 1561 return (HANDLED); 1562 } 1563 1564 /* 1565 * Intel MPX requires both bound register state flags to be 1566 * set. 1567 */ 1568 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1569 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1570 vm_inject_gp(vmx->vm, vcpu); 1571 return (HANDLED); 1572 } 1573 1574 /* 1575 * This runs "inside" vmrun() with the guest's FPU state, so 1576 * modifying xcr0 directly modifies the guest's xcr0, not the 1577 * host's. 
1578 */ 1579 load_xcr(0, xcrval); 1580 return (HANDLED); 1581 } 1582 1583 static uint64_t 1584 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) 1585 { 1586 const struct vmxctx *vmxctx; 1587 1588 vmxctx = &vmx->ctx[vcpu]; 1589 1590 switch (ident) { 1591 case 0: 1592 return (vmxctx->guest_rax); 1593 case 1: 1594 return (vmxctx->guest_rcx); 1595 case 2: 1596 return (vmxctx->guest_rdx); 1597 case 3: 1598 return (vmxctx->guest_rbx); 1599 case 4: 1600 return (vmcs_read(VMCS_GUEST_RSP)); 1601 case 5: 1602 return (vmxctx->guest_rbp); 1603 case 6: 1604 return (vmxctx->guest_rsi); 1605 case 7: 1606 return (vmxctx->guest_rdi); 1607 case 8: 1608 return (vmxctx->guest_r8); 1609 case 9: 1610 return (vmxctx->guest_r9); 1611 case 10: 1612 return (vmxctx->guest_r10); 1613 case 11: 1614 return (vmxctx->guest_r11); 1615 case 12: 1616 return (vmxctx->guest_r12); 1617 case 13: 1618 return (vmxctx->guest_r13); 1619 case 14: 1620 return (vmxctx->guest_r14); 1621 case 15: 1622 return (vmxctx->guest_r15); 1623 default: 1624 panic("invalid vmx register %d", ident); 1625 } 1626 } 1627 1628 static void 1629 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) 1630 { 1631 struct vmxctx *vmxctx; 1632 1633 vmxctx = &vmx->ctx[vcpu]; 1634 1635 switch (ident) { 1636 case 0: 1637 vmxctx->guest_rax = regval; 1638 break; 1639 case 1: 1640 vmxctx->guest_rcx = regval; 1641 break; 1642 case 2: 1643 vmxctx->guest_rdx = regval; 1644 break; 1645 case 3: 1646 vmxctx->guest_rbx = regval; 1647 break; 1648 case 4: 1649 vmcs_write(VMCS_GUEST_RSP, regval); 1650 break; 1651 case 5: 1652 vmxctx->guest_rbp = regval; 1653 break; 1654 case 6: 1655 vmxctx->guest_rsi = regval; 1656 break; 1657 case 7: 1658 vmxctx->guest_rdi = regval; 1659 break; 1660 case 8: 1661 vmxctx->guest_r8 = regval; 1662 break; 1663 case 9: 1664 vmxctx->guest_r9 = regval; 1665 break; 1666 case 10: 1667 vmxctx->guest_r10 = regval; 1668 break; 1669 case 11: 1670 vmxctx->guest_r11 = regval; 1671 break; 1672 case 12: 1673 vmxctx->guest_r12 = regval; 1674 break; 1675 case 13: 1676 vmxctx->guest_r13 = regval; 1677 break; 1678 case 14: 1679 vmxctx->guest_r14 = regval; 1680 break; 1681 case 15: 1682 vmxctx->guest_r15 = regval; 1683 break; 1684 default: 1685 panic("invalid vmx register %d", ident); 1686 } 1687 } 1688 1689 static int 1690 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1691 { 1692 uint64_t crval, regval; 1693 1694 /* We only handle mov to %cr0 at this time */ 1695 if ((exitqual & 0xf0) != 0x00) 1696 return (UNHANDLED); 1697 1698 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1699 1700 vmcs_write(VMCS_CR0_SHADOW, regval); 1701 1702 crval = regval | cr0_ones_mask; 1703 crval &= ~cr0_zeros_mask; 1704 vmcs_write(VMCS_GUEST_CR0, crval); 1705 1706 if (regval & CR0_PG) { 1707 uint64_t efer, entry_ctls; 1708 1709 /* 1710 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1711 * the "IA-32e mode guest" bit in VM-entry control must be 1712 * equal. 
		 */
		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
		if (efer & EFER_LME) {
			efer |= EFER_LMA;
			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
			entry_ctls |= VM_ENTRY_GUEST_LMA;
			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
		}
	}

	return (HANDLED);
}

static int
vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	uint64_t crval, regval;

	/* We only handle mov to %cr4 at this time */
	if ((exitqual & 0xf0) != 0x00)
		return (UNHANDLED);

	regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);

	vmcs_write(VMCS_CR4_SHADOW, regval);

	crval = regval | cr4_ones_mask;
	crval &= ~cr4_zeros_mask;
	vmcs_write(VMCS_GUEST_CR4, crval);

	return (HANDLED);
}

static int
vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
	struct vlapic *vlapic;
	uint64_t cr8;
	int regnum;

	/* We only handle mov %cr8 to/from a register at this time. */
	if ((exitqual & 0xe0) != 0x00) {
		return (UNHANDLED);
	}

	vlapic = vm_lapic(vmx->vm, vcpu);
	regnum = (exitqual >> 8) & 0xf;
	if (exitqual & 0x10) {
		cr8 = vlapic_get_cr8(vlapic);
		vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
	} else {
		cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
		vlapic_set_cr8(vlapic, cr8);
	}

	return (HANDLED);
}

/*
 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
 */
static int
vmx_cpl(void)
{
	uint32_t ssar;

	ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
	return ((ssar >> 5) & 0x3);
}

static enum vm_cpu_mode
vmx_cpu_mode(void)
{
	uint32_t csar;

	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
		if (csar & 0x2000)
			return (CPU_MODE_64BIT);	/* CS.L = 1 */
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
vmx_paging_mode(void)
{

	if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
		return (PAGING_MODE_FLAT);
	if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
		return (PAGING_MODE_32);
	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}
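/*
 * Informational summary of the guest mode decoding above, derived from
 * vmx_cpu_mode() and vmx_paging_mode() (not used by the code):
 *
 *	CR0.PG=0                        -> PAGING_MODE_FLAT
 *	CR0.PG=1, CR4.PAE=0             -> PAGING_MODE_32
 *	CR0.PG=1, CR4.PAE=1, EFER.LME=0 -> PAGING_MODE_PAE
 *	CR0.PG=1, CR4.PAE=1, EFER.LME=1 -> PAGING_MODE_64
 */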
static uint64_t
inout_str_index(struct vmx *vmx, int vcpuid, int in)
{
	uint64_t val;
	int error;
	enum vm_reg_name reg;

	reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
	error = vmx_getreg(vmx, vcpuid, reg, &val);
	KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
	return (val);
}

static uint64_t
inout_str_count(struct vmx *vmx, int vcpuid, int rep)
{
	uint64_t val;
	int error;

	if (rep) {
		error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
		KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
	} else {
		val = 1;
	}
	return (val);
}

static int
inout_str_addrsize(uint32_t inst_info)
{
	uint32_t size;

	size = (inst_info >> 7) & 0x7;
	switch (size) {
	case 0:
		return (2);	/* 16 bit */
	case 1:
		return (4);	/* 32 bit */
	case 2:
		return (8);	/* 64 bit */
	default:
		panic("%s: invalid size encoding %d", __func__, size);
	}
}

static void
inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
    struct vm_inout_str *vis)
{
	int error, s;

	if (in) {
		vis->seg_name = VM_REG_GUEST_ES;
	} else {
		s = (inst_info >> 15) & 0x7;
		vis->seg_name = vm_segment_name(s);
	}

	error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
	KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
}

static void
vmx_paging_info(struct vm_guest_paging *paging)
{
	paging->cr3 = vmcs_guest_cr3();
	paging->cpl = vmx_cpl();
	paging->cpu_mode = vmx_cpu_mode();
	paging->paging_mode = vmx_paging_mode();
}

static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
	struct vm_guest_paging *paging;
	uint32_t csar;

	paging = &vmexit->u.inst_emul.paging;

	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	vmexit->inst_length = 0;
	vmexit->u.inst_emul.gpa = gpa;
	vmexit->u.inst_emul.gla = gla;
	vmx_paging_info(paging);
	switch (paging->cpu_mode) {
	case CPU_MODE_REAL:
		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
		vmexit->u.inst_emul.cs_d = 0;
		break;
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
		break;
	default:
		vmexit->u.inst_emul.cs_base = 0;
		vmexit->u.inst_emul.cs_d = 0;
		break;
	}
	vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
}

static int
ept_fault_type(uint64_t ept_qual)
{
	int fault_type;

	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
		fault_type = VM_PROT_WRITE;
	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
		fault_type = VM_PROT_EXECUTE;
	else
		fault_type = VM_PROT_READ;

	return (fault_type);
}

static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
	int read, write;

	/* EPT fault on an instruction fetch doesn't make sense here */
	if (ept_qual & EPT_VIOLATION_INST_FETCH)
		return (FALSE);

	/* EPT fault must be a read fault or a write fault */
	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
	if ((read | write) == 0)
		return (FALSE);

	/*
	 * The EPT violation must have been caused by accessing a
	 * guest-physical address that is a translation of a guest-linear
	 * address.
1954 */ 1955 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1956 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1957 return (FALSE); 1958 } 1959 1960 return (TRUE); 1961 } 1962 1963 static __inline int 1964 apic_access_virtualization(struct vmx *vmx, int vcpuid) 1965 { 1966 uint32_t proc_ctls2; 1967 1968 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1969 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 1970 } 1971 1972 static __inline int 1973 x2apic_virtualization(struct vmx *vmx, int vcpuid) 1974 { 1975 uint32_t proc_ctls2; 1976 1977 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 1978 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 1979 } 1980 1981 static int 1982 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 1983 uint64_t qual) 1984 { 1985 int error, handled, offset; 1986 uint32_t *apic_regs, vector; 1987 bool retu; 1988 1989 handled = HANDLED; 1990 offset = APIC_WRITE_OFFSET(qual); 1991 1992 if (!apic_access_virtualization(vmx, vcpuid)) { 1993 /* 1994 * In general there should not be any APIC write VM-exits 1995 * unless APIC-access virtualization is enabled. 1996 * 1997 * However self-IPI virtualization can legitimately trigger 1998 * an APIC-write VM-exit so treat it specially. 1999 */ 2000 if (x2apic_virtualization(vmx, vcpuid) && 2001 offset == APIC_OFFSET_SELF_IPI) { 2002 apic_regs = (uint32_t *)(vlapic->apic_page); 2003 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2004 vlapic_self_ipi_handler(vlapic, vector); 2005 return (HANDLED); 2006 } else 2007 return (UNHANDLED); 2008 } 2009 2010 switch (offset) { 2011 case APIC_OFFSET_ID: 2012 vlapic_id_write_handler(vlapic); 2013 break; 2014 case APIC_OFFSET_LDR: 2015 vlapic_ldr_write_handler(vlapic); 2016 break; 2017 case APIC_OFFSET_DFR: 2018 vlapic_dfr_write_handler(vlapic); 2019 break; 2020 case APIC_OFFSET_SVR: 2021 vlapic_svr_write_handler(vlapic); 2022 break; 2023 case APIC_OFFSET_ESR: 2024 vlapic_esr_write_handler(vlapic); 2025 break; 2026 case APIC_OFFSET_ICR_LOW: 2027 retu = false; 2028 error = vlapic_icrlo_write_handler(vlapic, &retu); 2029 if (error != 0 || retu) 2030 handled = UNHANDLED; 2031 break; 2032 case APIC_OFFSET_CMCI_LVT: 2033 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2034 vlapic_lvt_write_handler(vlapic, offset); 2035 break; 2036 case APIC_OFFSET_TIMER_ICR: 2037 vlapic_icrtmr_write_handler(vlapic); 2038 break; 2039 case APIC_OFFSET_TIMER_DCR: 2040 vlapic_dcr_write_handler(vlapic); 2041 break; 2042 default: 2043 handled = UNHANDLED; 2044 break; 2045 } 2046 return (handled); 2047 } 2048 2049 static bool 2050 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2051 { 2052 2053 if (apic_access_virtualization(vmx, vcpuid) && 2054 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2055 return (true); 2056 else 2057 return (false); 2058 } 2059 2060 static int 2061 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2062 { 2063 uint64_t qual; 2064 int access_type, offset, allowed; 2065 2066 if (!apic_access_virtualization(vmx, vcpuid)) 2067 return (UNHANDLED); 2068 2069 qual = vmexit->u.vmx.exit_qualification; 2070 access_type = APIC_ACCESS_TYPE(qual); 2071 offset = APIC_ACCESS_OFFSET(qual); 2072 2073 allowed = 0; 2074 if (access_type == 0) { 2075 /* 2076 * Read data access to the following registers is expected. 
2077 */ 2078 switch (offset) { 2079 case APIC_OFFSET_APR: 2080 case APIC_OFFSET_PPR: 2081 case APIC_OFFSET_RRR: 2082 case APIC_OFFSET_CMCI_LVT: 2083 case APIC_OFFSET_TIMER_CCR: 2084 allowed = 1; 2085 break; 2086 default: 2087 break; 2088 } 2089 } else if (access_type == 1) { 2090 /* 2091 * Write data access to the following registers is expected. 2092 */ 2093 switch (offset) { 2094 case APIC_OFFSET_VER: 2095 case APIC_OFFSET_APR: 2096 case APIC_OFFSET_PPR: 2097 case APIC_OFFSET_RRR: 2098 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2099 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2100 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2101 case APIC_OFFSET_CMCI_LVT: 2102 case APIC_OFFSET_TIMER_CCR: 2103 allowed = 1; 2104 break; 2105 default: 2106 break; 2107 } 2108 } 2109 2110 if (allowed) { 2111 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2112 VIE_INVALID_GLA); 2113 } 2114 2115 /* 2116 * Regardless of whether the APIC-access is allowed this handler 2117 * always returns UNHANDLED: 2118 * - if the access is allowed then it is handled by emulating the 2119 * instruction that caused the VM-exit (outside the critical section) 2120 * - if the access is not allowed then it will be converted to an 2121 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2122 */ 2123 return (UNHANDLED); 2124 } 2125 2126 static enum task_switch_reason 2127 vmx_task_switch_reason(uint64_t qual) 2128 { 2129 int reason; 2130 2131 reason = (qual >> 30) & 0x3; 2132 switch (reason) { 2133 case 0: 2134 return (TSR_CALL); 2135 case 1: 2136 return (TSR_IRET); 2137 case 2: 2138 return (TSR_JMP); 2139 case 3: 2140 return (TSR_IDT_GATE); 2141 default: 2142 panic("%s: invalid reason %d", __func__, reason); 2143 } 2144 } 2145 2146 static int 2147 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2148 { 2149 int error; 2150 2151 if (lapic_msr(num)) 2152 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2153 else 2154 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2155 2156 return (error); 2157 } 2158 2159 static int 2160 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2161 { 2162 struct vmxctx *vmxctx; 2163 uint64_t result; 2164 uint32_t eax, edx; 2165 int error; 2166 2167 if (lapic_msr(num)) 2168 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2169 else 2170 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2171 2172 if (error == 0) { 2173 eax = result; 2174 vmxctx = &vmx->ctx[vcpuid]; 2175 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2176 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2177 2178 edx = result >> 32; 2179 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2180 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2181 } 2182 2183 return (error); 2184 } 2185 2186 static int 2187 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2188 { 2189 int error, errcode, errcode_valid, handled, in; 2190 struct vmxctx *vmxctx; 2191 struct vlapic *vlapic; 2192 struct vm_inout_str *vis; 2193 struct vm_task_switch *ts; 2194 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2195 uint32_t intr_type, intr_vec, reason; 2196 uint64_t exitintinfo, qual, gpa; 2197 bool retu; 2198 2199 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2200 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2201 2202 handled = UNHANDLED; 2203 vmxctx = &vmx->ctx[vcpu]; 2204 2205 qual = vmexit->u.vmx.exit_qualification; 2206 reason = vmexit->u.vmx.exit_reason; 2207 
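	/*
	 * The exitcode defaults to 'bogus'; exit handlers below overwrite
	 * it only when the exit must be completed in userland.  vmx_run()
	 * later asserts that a handled exit left the exitcode at
	 * VM_EXITCODE_BOGUS and that an unhandled exit did not.
	 */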
vmexit->exitcode = VM_EXITCODE_BOGUS; 2208 2209 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2210 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2211 2212 /* 2213 * VM-entry failures during or after loading guest state. 2214 * 2215 * These VM-exits are uncommon but must be handled specially 2216 * as most VM-exit fields are not populated as usual. 2217 */ 2218 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2219 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2220 __asm __volatile("int $18"); 2221 return (1); 2222 } 2223 2224 /* 2225 * VM exits that can be triggered during event delivery need to 2226 * be handled specially by re-injecting the event if the IDT 2227 * vectoring information field's valid bit is set. 2228 * 2229 * See "Information for VM Exits During Event Delivery" in Intel SDM 2230 * for details. 2231 */ 2232 idtvec_info = vmcs_idt_vectoring_info(); 2233 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2234 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2235 exitintinfo = idtvec_info; 2236 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2237 idtvec_err = vmcs_idt_vectoring_err(); 2238 exitintinfo |= (uint64_t)idtvec_err << 32; 2239 } 2240 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2241 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2242 __func__, error)); 2243 2244 /* 2245 * If 'virtual NMIs' are being used and the VM-exit 2246 * happened while injecting an NMI during the previous 2247 * VM-entry, then clear "blocking by NMI" in the 2248 * Guest Interruptibility-State so the NMI can be 2249 * reinjected on the subsequent VM-entry. 2250 * 2251 * However, if the NMI was being delivered through a task 2252 * gate, then the new task must start execution with NMIs 2253 * blocked so don't clear NMI blocking in this case. 2254 */ 2255 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2256 if (intr_type == VMCS_INTR_T_NMI) { 2257 if (reason != EXIT_REASON_TASK_SWITCH) 2258 vmx_clear_nmi_blocking(vmx, vcpu); 2259 else 2260 vmx_assert_nmi_blocking(vmx, vcpu); 2261 } 2262 2263 /* 2264 * Update VM-entry instruction length if the event being 2265 * delivered was a software interrupt or software exception. 2266 */ 2267 if (intr_type == VMCS_INTR_T_SWINTR || 2268 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2269 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2270 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2271 } 2272 } 2273 2274 switch (reason) { 2275 case EXIT_REASON_TASK_SWITCH: 2276 ts = &vmexit->u.task_switch; 2277 ts->tsssel = qual & 0xffff; 2278 ts->reason = vmx_task_switch_reason(qual); 2279 ts->ext = 0; 2280 ts->errcode_valid = 0; 2281 vmx_paging_info(&ts->paging); 2282 /* 2283 * If the task switch was due to a CALL, JMP, IRET, software 2284 * interrupt (INT n) or software exception (INT3, INTO), 2285 * then the saved %rip references the instruction that caused 2286 * the task switch. The instruction length field in the VMCS 2287 * is valid in this case. 2288 * 2289 * In all other cases (e.g., NMI, hardware exception) the 2290 * saved %rip is one that would have been saved in the old TSS 2291 * had the task switch completed normally so the instruction 2292 * length field is not needed in this case and is explicitly 2293 * set to 0. 
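 *
 * The TSR_IDT_GATE handling below is an example: when the task gate
 * was reached because of an external event the instruction length is
 * forced to 0.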
2294 */ 2295 if (ts->reason == TSR_IDT_GATE) { 2296 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2297 ("invalid idtvec_info %#x for IDT task switch", 2298 idtvec_info)); 2299 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2300 if (intr_type != VMCS_INTR_T_SWINTR && 2301 intr_type != VMCS_INTR_T_SWEXCEPTION && 2302 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2303 /* Task switch triggered by external event */ 2304 ts->ext = 1; 2305 vmexit->inst_length = 0; 2306 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2307 ts->errcode_valid = 1; 2308 ts->errcode = vmcs_idt_vectoring_err(); 2309 } 2310 } 2311 } 2312 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2313 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2314 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2315 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2316 ts->ext ? "external" : "internal", 2317 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2318 break; 2319 case EXIT_REASON_CR_ACCESS: 2320 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2321 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2322 switch (qual & 0xf) { 2323 case 0: 2324 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2325 break; 2326 case 4: 2327 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2328 break; 2329 case 8: 2330 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2331 break; 2332 } 2333 break; 2334 case EXIT_REASON_RDMSR: 2335 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2336 retu = false; 2337 ecx = vmxctx->guest_rcx; 2338 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2339 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2340 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2341 if (error) { 2342 vmexit->exitcode = VM_EXITCODE_RDMSR; 2343 vmexit->u.msr.code = ecx; 2344 } else if (!retu) { 2345 handled = HANDLED; 2346 } else { 2347 /* Return to userspace with a valid exitcode */ 2348 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2349 ("emulate_rdmsr retu with bogus exitcode")); 2350 } 2351 break; 2352 case EXIT_REASON_WRMSR: 2353 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2354 retu = false; 2355 eax = vmxctx->guest_rax; 2356 ecx = vmxctx->guest_rcx; 2357 edx = vmxctx->guest_rdx; 2358 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2359 ecx, (uint64_t)edx << 32 | eax); 2360 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2361 (uint64_t)edx << 32 | eax); 2362 error = emulate_wrmsr(vmx, vcpu, ecx, 2363 (uint64_t)edx << 32 | eax, &retu); 2364 if (error) { 2365 vmexit->exitcode = VM_EXITCODE_WRMSR; 2366 vmexit->u.msr.code = ecx; 2367 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2368 } else if (!retu) { 2369 handled = HANDLED; 2370 } else { 2371 /* Return to userspace with a valid exitcode */ 2372 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2373 ("emulate_wrmsr retu with bogus exitcode")); 2374 } 2375 break; 2376 case EXIT_REASON_HLT: 2377 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2378 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2379 vmexit->exitcode = VM_EXITCODE_HLT; 2380 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2381 if (virtual_interrupt_delivery) 2382 vmexit->u.hlt.intr_status = 2383 vmcs_read(VMCS_GUEST_INTR_STATUS); 2384 else 2385 vmexit->u.hlt.intr_status = 0; 2386 break; 2387 case EXIT_REASON_MTF: 2388 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2389 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2390 vmexit->exitcode = VM_EXITCODE_MTRAP; 2391 vmexit->inst_length = 0; 2392 break; 2393 case EXIT_REASON_PAUSE: 2394 vmm_stat_incr(vmx->vm, 
vcpu, VMEXIT_PAUSE, 1); 2395 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2396 vmexit->exitcode = VM_EXITCODE_PAUSE; 2397 break; 2398 case EXIT_REASON_INTR_WINDOW: 2399 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2400 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2401 vmx_clear_int_window_exiting(vmx, vcpu); 2402 return (1); 2403 case EXIT_REASON_EXT_INTR: 2404 /* 2405 * External interrupts serve only to cause VM exits and allow 2406 * the host interrupt handler to run. 2407 * 2408 * If this external interrupt triggers a virtual interrupt 2409 * to a VM, then that state will be recorded by the 2410 * host interrupt handler in the VM's softc. We will inject 2411 * this virtual interrupt during the subsequent VM enter. 2412 */ 2413 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2414 SDT_PROBE4(vmm, vmx, exit, interrupt, 2415 vmx, vcpu, vmexit, intr_info); 2416 2417 /* 2418 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2419 * This appears to be a bug in VMware Fusion? 2420 */ 2421 if (!(intr_info & VMCS_INTR_VALID)) 2422 return (1); 2423 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2424 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2425 ("VM exit interruption info invalid: %#x", intr_info)); 2426 vmx_trigger_hostintr(intr_info & 0xff); 2427 2428 /* 2429 * This is special. We want to treat this as a 'handled' 2430 * VM-exit but not increment the instruction pointer. 2431 */ 2432 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2433 return (1); 2434 case EXIT_REASON_NMI_WINDOW: 2435 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2436 /* Exit to allow the pending virtual NMI to be injected */ 2437 if (vm_nmi_pending(vmx->vm, vcpu)) 2438 vmx_inject_nmi(vmx, vcpu); 2439 vmx_clear_nmi_window_exiting(vmx, vcpu); 2440 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2441 return (1); 2442 case EXIT_REASON_INOUT: 2443 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2444 vmexit->exitcode = VM_EXITCODE_INOUT; 2445 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2446 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2447 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2448 vmexit->u.inout.rep = (qual & 0x20) ?
1 : 0; 2449 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2450 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2451 if (vmexit->u.inout.string) { 2452 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2453 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2454 vis = &vmexit->u.inout_str; 2455 vmx_paging_info(&vis->paging); 2456 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2457 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2458 vis->index = inout_str_index(vmx, vcpu, in); 2459 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2460 vis->addrsize = inout_str_addrsize(inst_info); 2461 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2462 } 2463 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2464 break; 2465 case EXIT_REASON_CPUID: 2466 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2467 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2468 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2469 break; 2470 case EXIT_REASON_EXCEPTION: 2471 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2472 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2473 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2474 ("VM exit interruption info invalid: %#x", intr_info)); 2475 2476 intr_vec = intr_info & 0xff; 2477 intr_type = intr_info & VMCS_INTR_T_MASK; 2478 2479 /* 2480 * If Virtual NMIs control is 1 and the VM-exit is due to a 2481 * fault encountered during the execution of IRET then we must 2482 * restore the state of "virtual-NMI blocking" before resuming 2483 * the guest. 2484 * 2485 * See "Resuming Guest Software after Handling an Exception". 2486 * See "Information for VM Exits Due to Vectored Events". 2487 */ 2488 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2489 (intr_vec != IDT_DF) && 2490 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2491 vmx_restore_nmi_blocking(vmx, vcpu); 2492 2493 /* 2494 * The NMI has already been handled in vmx_exit_handle_nmi(). 2495 */ 2496 if (intr_type == VMCS_INTR_T_NMI) 2497 return (1); 2498 2499 /* 2500 * Call the machine check handler by hand. Also don't reflect 2501 * the machine check back into the guest. 2502 */ 2503 if (intr_vec == IDT_MC) { 2504 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2505 __asm __volatile("int $18"); 2506 return (1); 2507 } 2508 2509 if (intr_vec == IDT_PF) { 2510 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2511 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2512 __func__, error)); 2513 } 2514 2515 /* 2516 * Software exceptions exhibit trap-like behavior. This in 2517 * turn requires populating the VM-entry instruction length 2518 * so that the %rip in the trap frame is past the INT3/INTO 2519 * instruction. 
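 * The processor consumes the VM-entry instruction length when it
 * injects the software exception on the next VM-entry and uses it
 * to advance the guest %rip past the instruction before delivery.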
2520 */ 2521 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2522 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2523 2524 /* Reflect all other exceptions back into the guest */ 2525 errcode_valid = errcode = 0; 2526 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2527 errcode_valid = 1; 2528 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2529 } 2530 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2531 "the guest", intr_vec, errcode); 2532 SDT_PROBE5(vmm, vmx, exit, exception, 2533 vmx, vcpu, vmexit, intr_vec, errcode); 2534 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2535 errcode_valid, errcode, 0); 2536 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2537 __func__, error)); 2538 return (1); 2539 2540 case EXIT_REASON_EPT_FAULT: 2541 /* 2542 * If 'gpa' lies within the address space allocated to 2543 * memory then this must be a nested page fault otherwise 2544 * this must be an instruction that accesses MMIO space. 2545 */ 2546 gpa = vmcs_gpa(); 2547 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2548 apic_access_fault(vmx, vcpu, gpa)) { 2549 vmexit->exitcode = VM_EXITCODE_PAGING; 2550 vmexit->inst_length = 0; 2551 vmexit->u.paging.gpa = gpa; 2552 vmexit->u.paging.fault_type = ept_fault_type(qual); 2553 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2554 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2555 vmx, vcpu, vmexit, gpa, qual); 2556 } else if (ept_emulation_fault(qual)) { 2557 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2558 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2559 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2560 vmx, vcpu, vmexit, gpa); 2561 } 2562 /* 2563 * If Virtual NMIs control is 1 and the VM-exit is due to an 2564 * EPT fault during the execution of IRET then we must restore 2565 * the state of "virtual-NMI blocking" before resuming. 2566 * 2567 * See description of "NMI unblocking due to IRET" in 2568 * "Exit Qualification for EPT Violations". 2569 */ 2570 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2571 (qual & EXIT_QUAL_NMIUDTI) != 0) 2572 vmx_restore_nmi_blocking(vmx, vcpu); 2573 break; 2574 case EXIT_REASON_VIRTUALIZED_EOI: 2575 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2576 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2577 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2578 vmexit->inst_length = 0; /* trap-like */ 2579 break; 2580 case EXIT_REASON_APIC_ACCESS: 2581 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2582 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2583 break; 2584 case EXIT_REASON_APIC_WRITE: 2585 /* 2586 * APIC-write VM exit is trap-like so the %rip is already 2587 * pointing to the next instruction. 
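 *
 * Clearing 'inst_length' below keeps the common 'handled' path at
 * the end of this function from advancing %rip over the instruction
 * a second time.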
2588 */ 2589 vmexit->inst_length = 0; 2590 vlapic = vm_lapic(vmx->vm, vcpu); 2591 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2592 vmx, vcpu, vmexit, vlapic); 2593 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2594 break; 2595 case EXIT_REASON_XSETBV: 2596 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2597 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2598 break; 2599 case EXIT_REASON_MONITOR: 2600 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2601 vmexit->exitcode = VM_EXITCODE_MONITOR; 2602 break; 2603 case EXIT_REASON_MWAIT: 2604 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2605 vmexit->exitcode = VM_EXITCODE_MWAIT; 2606 break; 2607 default: 2608 SDT_PROBE4(vmm, vmx, exit, unknown, 2609 vmx, vcpu, vmexit, reason); 2610 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2611 break; 2612 } 2613 2614 if (handled) { 2615 /* 2616 * It is possible that control is returned to userland 2617 * even though we were able to handle the VM exit in the 2618 * kernel. 2619 * 2620 * In such a case we want to make sure that the userland 2621 * restarts guest execution at the instruction *after* 2622 * the one we just processed. Therefore we update the 2623 * guest rip in the VMCS and in 'vmexit'. 2624 */ 2625 vmexit->rip += vmexit->inst_length; 2626 vmexit->inst_length = 0; 2627 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2628 } else { 2629 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2630 /* 2631 * If this VM exit was not claimed by anybody then 2632 * treat it as a generic VMX exit. 2633 */ 2634 vmexit->exitcode = VM_EXITCODE_VMX; 2635 vmexit->u.vmx.status = VM_SUCCESS; 2636 vmexit->u.vmx.inst_type = 0; 2637 vmexit->u.vmx.inst_error = 0; 2638 } else { 2639 /* 2640 * The exitcode and collateral have been populated. 2641 * The VM exit will be processed further in userland. 2642 */ 2643 } 2644 } 2645 2646 SDT_PROBE4(vmm, vmx, exit, return, 2647 vmx, vcpu, vmexit, handled); 2648 return (handled); 2649 } 2650 2651 static __inline void 2652 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2653 { 2654 2655 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2656 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2657 vmxctx->inst_fail_status)); 2658 2659 vmexit->inst_length = 0; 2660 vmexit->exitcode = VM_EXITCODE_VMX; 2661 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2662 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2663 vmexit->u.vmx.exit_reason = ~0; 2664 vmexit->u.vmx.exit_qualification = ~0; 2665 2666 switch (rc) { 2667 case VMX_VMRESUME_ERROR: 2668 case VMX_VMLAUNCH_ERROR: 2669 case VMX_INVEPT_ERROR: 2670 vmexit->u.vmx.inst_type = rc; 2671 break; 2672 default: 2673 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2674 } 2675 } 2676 2677 /* 2678 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2679 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2680 * sufficient to simply vector to the NMI handler via a software interrupt. 2681 * However, this must be done before maskable interrupts are enabled 2682 * otherwise the "iret" issued by an interrupt handler will incorrectly 2683 * clear NMI blocking. 
2684 */ 2685 static __inline void 2686 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2687 { 2688 uint32_t intr_info; 2689 2690 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2691 2692 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2693 return; 2694 2695 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2696 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2697 ("VM exit interruption info invalid: %#x", intr_info)); 2698 2699 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2700 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2701 "to NMI has invalid vector: %#x", intr_info)); 2702 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2703 __asm __volatile("int $2"); 2704 } 2705 } 2706 2707 static __inline void 2708 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2709 { 2710 register_t rflags; 2711 2712 /* Save host control debug registers. */ 2713 vmxctx->host_dr7 = rdr7(); 2714 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2715 2716 /* 2717 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2718 * exceptions in the host based on the guest DRx values. The 2719 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2720 */ 2721 load_dr7(0); 2722 wrmsr(MSR_DEBUGCTLMSR, 0); 2723 2724 /* 2725 * Disable single stepping the kernel to avoid corrupting the 2726 * guest DR6. A debugger might still be able to corrupt the 2727 * guest DR6 by setting a breakpoint after this point and then 2728 * single stepping. 2729 */ 2730 rflags = read_rflags(); 2731 vmxctx->host_tf = rflags & PSL_T; 2732 write_rflags(rflags & ~PSL_T); 2733 2734 /* Save host debug registers. */ 2735 vmxctx->host_dr0 = rdr0(); 2736 vmxctx->host_dr1 = rdr1(); 2737 vmxctx->host_dr2 = rdr2(); 2738 vmxctx->host_dr3 = rdr3(); 2739 vmxctx->host_dr6 = rdr6(); 2740 2741 /* Restore guest debug registers. */ 2742 load_dr0(vmxctx->guest_dr0); 2743 load_dr1(vmxctx->guest_dr1); 2744 load_dr2(vmxctx->guest_dr2); 2745 load_dr3(vmxctx->guest_dr3); 2746 load_dr6(vmxctx->guest_dr6); 2747 } 2748 2749 static __inline void 2750 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2751 { 2752 2753 /* Save guest debug registers. */ 2754 vmxctx->guest_dr0 = rdr0(); 2755 vmxctx->guest_dr1 = rdr1(); 2756 vmxctx->guest_dr2 = rdr2(); 2757 vmxctx->guest_dr3 = rdr3(); 2758 vmxctx->guest_dr6 = rdr6(); 2759 2760 /* 2761 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2762 * PSL_T last. 
2763 */ 2764 load_dr0(vmxctx->host_dr0); 2765 load_dr1(vmxctx->host_dr1); 2766 load_dr2(vmxctx->host_dr2); 2767 load_dr3(vmxctx->host_dr3); 2768 load_dr6(vmxctx->host_dr6); 2769 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2770 load_dr7(vmxctx->host_dr7); 2771 write_rflags(read_rflags() | vmxctx->host_tf); 2772 } 2773 2774 static int 2775 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2776 struct vm_eventinfo *evinfo) 2777 { 2778 int rc, handled, launched; 2779 struct vmx *vmx; 2780 struct vm *vm; 2781 struct vmxctx *vmxctx; 2782 struct vmcs *vmcs; 2783 struct vm_exit *vmexit; 2784 struct vlapic *vlapic; 2785 uint32_t exit_reason; 2786 2787 vmx = arg; 2788 vm = vmx->vm; 2789 vmcs = &vmx->vmcs[vcpu]; 2790 vmxctx = &vmx->ctx[vcpu]; 2791 vlapic = vm_lapic(vm, vcpu); 2792 vmexit = vm_exitinfo(vm, vcpu); 2793 launched = 0; 2794 2795 KASSERT(vmxctx->pmap == pmap, 2796 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2797 2798 vmx_msr_guest_enter(vmx, vcpu); 2799 2800 VMPTRLD(vmcs); 2801 2802 /* 2803 * XXX 2804 * We do this every time because we may setup the virtual machine 2805 * from a different process than the one that actually runs it. 2806 * 2807 * If the life of a virtual machine was spent entirely in the context 2808 * of a single process we could do this once in vmx_vminit(). 2809 */ 2810 vmcs_write(VMCS_HOST_CR3, rcr3()); 2811 2812 vmcs_write(VMCS_GUEST_RIP, rip); 2813 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2814 do { 2815 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 2816 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 2817 2818 handled = UNHANDLED; 2819 /* 2820 * Interrupts are disabled from this point on until the 2821 * guest starts executing. This is done for the following 2822 * reasons: 2823 * 2824 * If an AST is asserted on this thread after the check below, 2825 * then the IPI_AST notification will not be lost, because it 2826 * will cause a VM exit due to external interrupt as soon as 2827 * the guest state is loaded. 2828 * 2829 * A posted interrupt after 'vmx_inject_interrupts()' will 2830 * not be "lost" because it will be held pending in the host 2831 * APIC because interrupts are disabled. The pending interrupt 2832 * will be recognized as soon as the guest state is loaded. 2833 * 2834 * The same reasoning applies to the IPI generated by 2835 * pmap_invalidate_ept(). 2836 */ 2837 disable_intr(); 2838 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 2839 2840 /* 2841 * Check for vcpu suspension after injecting events because 2842 * vmx_inject_interrupts() can suspend the vcpu due to a 2843 * triple fault. 
2844 */ 2845 if (vcpu_suspended(evinfo)) { 2846 enable_intr(); 2847 vm_exit_suspended(vmx->vm, vcpu, rip); 2848 break; 2849 } 2850 2851 if (vcpu_rendezvous_pending(evinfo)) { 2852 enable_intr(); 2853 vm_exit_rendezvous(vmx->vm, vcpu, rip); 2854 break; 2855 } 2856 2857 if (vcpu_reqidle(evinfo)) { 2858 enable_intr(); 2859 vm_exit_reqidle(vmx->vm, vcpu, rip); 2860 break; 2861 } 2862 2863 if (vcpu_should_yield(vm, vcpu)) { 2864 enable_intr(); 2865 vm_exit_astpending(vmx->vm, vcpu, rip); 2866 vmx_astpending_trace(vmx, vcpu, rip); 2867 handled = HANDLED; 2868 break; 2869 } 2870 2871 if (vcpu_debugged(vm, vcpu)) { 2872 enable_intr(); 2873 vm_exit_debug(vmx->vm, vcpu, rip); 2874 break; 2875 } 2876 2877 vmx_run_trace(vmx, vcpu); 2878 vmx_dr_enter_guest(vmxctx); 2879 rc = vmx_enter_guest(vmxctx, vmx, launched); 2880 vmx_dr_leave_guest(vmxctx); 2881 2882 /* Collect some information for VM exit processing */ 2883 vmexit->rip = rip = vmcs_guest_rip(); 2884 vmexit->inst_length = vmexit_instruction_length(); 2885 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2886 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2887 2888 /* Update 'nextrip' */ 2889 vmx->state[vcpu].nextrip = rip; 2890 2891 if (rc == VMX_GUEST_VMEXIT) { 2892 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2893 enable_intr(); 2894 handled = vmx_exit_process(vmx, vcpu, vmexit); 2895 } else { 2896 enable_intr(); 2897 vmx_exit_inst_error(vmxctx, rc, vmexit); 2898 } 2899 launched = 1; 2900 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2901 rip = vmexit->rip; 2902 } while (handled); 2903 2904 /* 2905 * If a VM exit has been handled then the exitcode must be BOGUS 2906 * If a VM exit is not handled then the exitcode must not be BOGUS 2907 */ 2908 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2909 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2910 panic("Mismatch between handled (%d) and exitcode (%d)", 2911 handled, vmexit->exitcode); 2912 } 2913 2914 if (!handled) 2915 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 2916 2917 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 2918 vmexit->exitcode); 2919 2920 VMCLEAR(vmcs); 2921 vmx_msr_guest_exit(vmx, vcpu); 2922 2923 return (0); 2924 } 2925 2926 static void 2927 vmx_vmcleanup(void *arg) 2928 { 2929 int i; 2930 struct vmx *vmx = arg; 2931 2932 if (apic_access_virtualization(vmx, 0)) 2933 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 2934 2935 for (i = 0; i < VM_MAXCPU; i++) 2936 vpid_free(vmx->state[i].vpid); 2937 2938 free(vmx, M_VMX); 2939 2940 return; 2941 } 2942 2943 static register_t * 2944 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 2945 { 2946 2947 switch (reg) { 2948 case VM_REG_GUEST_RAX: 2949 return (&vmxctx->guest_rax); 2950 case VM_REG_GUEST_RBX: 2951 return (&vmxctx->guest_rbx); 2952 case VM_REG_GUEST_RCX: 2953 return (&vmxctx->guest_rcx); 2954 case VM_REG_GUEST_RDX: 2955 return (&vmxctx->guest_rdx); 2956 case VM_REG_GUEST_RSI: 2957 return (&vmxctx->guest_rsi); 2958 case VM_REG_GUEST_RDI: 2959 return (&vmxctx->guest_rdi); 2960 case VM_REG_GUEST_RBP: 2961 return (&vmxctx->guest_rbp); 2962 case VM_REG_GUEST_R8: 2963 return (&vmxctx->guest_r8); 2964 case VM_REG_GUEST_R9: 2965 return (&vmxctx->guest_r9); 2966 case VM_REG_GUEST_R10: 2967 return (&vmxctx->guest_r10); 2968 case VM_REG_GUEST_R11: 2969 return (&vmxctx->guest_r11); 2970 case VM_REG_GUEST_R12: 2971 return (&vmxctx->guest_r12); 2972 case VM_REG_GUEST_R13: 2973 return (&vmxctx->guest_r13); 2974 case VM_REG_GUEST_R14: 2975 return (&vmxctx->guest_r14); 2976 case 
VM_REG_GUEST_R15: 2977 return (&vmxctx->guest_r15); 2978 case VM_REG_GUEST_CR2: 2979 return (&vmxctx->guest_cr2); 2980 case VM_REG_GUEST_DR0: 2981 return (&vmxctx->guest_dr0); 2982 case VM_REG_GUEST_DR1: 2983 return (&vmxctx->guest_dr1); 2984 case VM_REG_GUEST_DR2: 2985 return (&vmxctx->guest_dr2); 2986 case VM_REG_GUEST_DR3: 2987 return (&vmxctx->guest_dr3); 2988 case VM_REG_GUEST_DR6: 2989 return (&vmxctx->guest_dr6); 2990 default: 2991 break; 2992 } 2993 return (NULL); 2994 } 2995 2996 static int 2997 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 2998 { 2999 register_t *regp; 3000 3001 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3002 *retval = *regp; 3003 return (0); 3004 } else 3005 return (EINVAL); 3006 } 3007 3008 static int 3009 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3010 { 3011 register_t *regp; 3012 3013 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3014 *regp = val; 3015 return (0); 3016 } else 3017 return (EINVAL); 3018 } 3019 3020 static int 3021 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3022 { 3023 uint64_t gi; 3024 int error; 3025 3026 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3027 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3028 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3029 return (error); 3030 } 3031 3032 static int 3033 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3034 { 3035 struct vmcs *vmcs; 3036 uint64_t gi; 3037 int error, ident; 3038 3039 /* 3040 * Forcing the vcpu into an interrupt shadow is not supported. 3041 */ 3042 if (val) { 3043 error = EINVAL; 3044 goto done; 3045 } 3046 3047 vmcs = &vmx->vmcs[vcpu]; 3048 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3049 error = vmcs_getreg(vmcs, running, ident, &gi); 3050 if (error == 0) { 3051 gi &= ~HWINTR_BLOCKING; 3052 error = vmcs_setreg(vmcs, running, ident, gi); 3053 } 3054 done: 3055 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3056 error ? 
"failed" : "succeeded"); 3057 return (error); 3058 } 3059 3060 static int 3061 vmx_shadow_reg(int reg) 3062 { 3063 int shreg; 3064 3065 shreg = -1; 3066 3067 switch (reg) { 3068 case VM_REG_GUEST_CR0: 3069 shreg = VMCS_CR0_SHADOW; 3070 break; 3071 case VM_REG_GUEST_CR4: 3072 shreg = VMCS_CR4_SHADOW; 3073 break; 3074 default: 3075 break; 3076 } 3077 3078 return (shreg); 3079 } 3080 3081 static int 3082 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3083 { 3084 int running, hostcpu; 3085 struct vmx *vmx = arg; 3086 3087 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3088 if (running && hostcpu != curcpu) 3089 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3090 3091 if (reg == VM_REG_GUEST_INTR_SHADOW) 3092 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3093 3094 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3095 return (0); 3096 3097 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3098 } 3099 3100 static int 3101 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3102 { 3103 int error, hostcpu, running, shadow; 3104 uint64_t ctls; 3105 pmap_t pmap; 3106 struct vmx *vmx = arg; 3107 3108 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3109 if (running && hostcpu != curcpu) 3110 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3111 3112 if (reg == VM_REG_GUEST_INTR_SHADOW) 3113 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3114 3115 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3116 return (0); 3117 3118 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3119 3120 if (error == 0) { 3121 /* 3122 * If the "load EFER" VM-entry control is 1 then the 3123 * value of EFER.LMA must be identical to "IA-32e mode guest" 3124 * bit in the VM-entry control. 3125 */ 3126 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3127 (reg == VM_REG_GUEST_EFER)) { 3128 vmcs_getreg(&vmx->vmcs[vcpu], running, 3129 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3130 if (val & EFER_LMA) 3131 ctls |= VM_ENTRY_GUEST_LMA; 3132 else 3133 ctls &= ~VM_ENTRY_GUEST_LMA; 3134 vmcs_setreg(&vmx->vmcs[vcpu], running, 3135 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3136 } 3137 3138 shadow = vmx_shadow_reg(reg); 3139 if (shadow > 0) { 3140 /* 3141 * Store the unmodified value in the shadow 3142 */ 3143 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3144 VMCS_IDENT(shadow), val); 3145 } 3146 3147 if (reg == VM_REG_GUEST_CR3) { 3148 /* 3149 * Invalidate the guest vcpu's TLB mappings to emulate 3150 * the behavior of updating %cr3. 3151 * 3152 * XXX the processor retains global mappings when %cr3 3153 * is updated but vmx_invvpid() does not. 
3154 */ 3155 pmap = vmx->ctx[vcpu].pmap; 3156 vmx_invvpid(vmx, vcpu, pmap, running); 3157 } 3158 } 3159 3160 return (error); 3161 } 3162 3163 static int 3164 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3165 { 3166 int hostcpu, running; 3167 struct vmx *vmx = arg; 3168 3169 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3170 if (running && hostcpu != curcpu) 3171 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3172 3173 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3174 } 3175 3176 static int 3177 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3178 { 3179 int hostcpu, running; 3180 struct vmx *vmx = arg; 3181 3182 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3183 if (running && hostcpu != curcpu) 3184 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3185 3186 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3187 } 3188 3189 static int 3190 vmx_getcap(void *arg, int vcpu, int type, int *retval) 3191 { 3192 struct vmx *vmx = arg; 3193 int vcap; 3194 int ret; 3195 3196 ret = ENOENT; 3197 3198 vcap = vmx->cap[vcpu].set; 3199 3200 switch (type) { 3201 case VM_CAP_HALT_EXIT: 3202 if (cap_halt_exit) 3203 ret = 0; 3204 break; 3205 case VM_CAP_PAUSE_EXIT: 3206 if (cap_pause_exit) 3207 ret = 0; 3208 break; 3209 case VM_CAP_MTRAP_EXIT: 3210 if (cap_monitor_trap) 3211 ret = 0; 3212 break; 3213 case VM_CAP_UNRESTRICTED_GUEST: 3214 if (cap_unrestricted_guest) 3215 ret = 0; 3216 break; 3217 case VM_CAP_ENABLE_INVPCID: 3218 if (cap_invpcid) 3219 ret = 0; 3220 break; 3221 default: 3222 break; 3223 } 3224 3225 if (ret == 0) 3226 *retval = (vcap & (1 << type)) ? 1 : 0; 3227 3228 return (ret); 3229 } 3230 3231 static int 3232 vmx_setcap(void *arg, int vcpu, int type, int val) 3233 { 3234 struct vmx *vmx = arg; 3235 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3236 uint32_t baseval; 3237 uint32_t *pptr; 3238 int error; 3239 int flag; 3240 int reg; 3241 int retval; 3242 3243 retval = ENOENT; 3244 pptr = NULL; 3245 3246 switch (type) { 3247 case VM_CAP_HALT_EXIT: 3248 if (cap_halt_exit) { 3249 retval = 0; 3250 pptr = &vmx->cap[vcpu].proc_ctls; 3251 baseval = *pptr; 3252 flag = PROCBASED_HLT_EXITING; 3253 reg = VMCS_PRI_PROC_BASED_CTLS; 3254 } 3255 break; 3256 case VM_CAP_MTRAP_EXIT: 3257 if (cap_monitor_trap) { 3258 retval = 0; 3259 pptr = &vmx->cap[vcpu].proc_ctls; 3260 baseval = *pptr; 3261 flag = PROCBASED_MTF; 3262 reg = VMCS_PRI_PROC_BASED_CTLS; 3263 } 3264 break; 3265 case VM_CAP_PAUSE_EXIT: 3266 if (cap_pause_exit) { 3267 retval = 0; 3268 pptr = &vmx->cap[vcpu].proc_ctls; 3269 baseval = *pptr; 3270 flag = PROCBASED_PAUSE_EXITING; 3271 reg = VMCS_PRI_PROC_BASED_CTLS; 3272 } 3273 break; 3274 case VM_CAP_UNRESTRICTED_GUEST: 3275 if (cap_unrestricted_guest) { 3276 retval = 0; 3277 pptr = &vmx->cap[vcpu].proc_ctls2; 3278 baseval = *pptr; 3279 flag = PROCBASED2_UNRESTRICTED_GUEST; 3280 reg = VMCS_SEC_PROC_BASED_CTLS; 3281 } 3282 break; 3283 case VM_CAP_ENABLE_INVPCID: 3284 if (cap_invpcid) { 3285 retval = 0; 3286 pptr = &vmx->cap[vcpu].proc_ctls2; 3287 baseval = *pptr; 3288 flag = PROCBASED2_ENABLE_INVPCID; 3289 reg = VMCS_SEC_PROC_BASED_CTLS; 3290 } 3291 break; 3292 default: 3293 break; 3294 } 3295 3296 if (retval == 0) { 3297 if (val) { 3298 baseval |= flag; 3299 } else { 3300 baseval &= ~flag; 3301 } 3302 VMPTRLD(vmcs); 3303 error = vmwrite(reg, baseval); 3304 VMCLEAR(vmcs); 3305 3306 if (error) { 3307 retval = error; 3308 } else { 3309 /* 3310 * Update optional stored flags, and record 3311 * setting 3312 */ 
3313 if (pptr != NULL) { 3314 *pptr = baseval; 3315 } 3316 3317 if (val) { 3318 vmx->cap[vcpu].set |= (1 << type); 3319 } else { 3320 vmx->cap[vcpu].set &= ~(1 << type); 3321 } 3322 } 3323 } 3324 3325 return (retval); 3326 } 3327 3328 struct vlapic_vtx { 3329 struct vlapic vlapic; 3330 struct pir_desc *pir_desc; 3331 struct vmx *vmx; 3332 }; 3333 3334 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3335 do { \ 3336 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3337 level ? "level" : "edge", vector); \ 3338 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3339 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3340 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3341 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3342 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 3343 } while (0) 3344 3345 /* 3346 * vlapic->ops handlers that utilize the APICv hardware assist described in 3347 * Chapter 29 of the Intel SDM. 3348 */ 3349 static int 3350 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3351 { 3352 struct vlapic_vtx *vlapic_vtx; 3353 struct pir_desc *pir_desc; 3354 uint64_t mask; 3355 int idx, notify; 3356 3357 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3358 pir_desc = vlapic_vtx->pir_desc; 3359 3360 /* 3361 * Keep track of interrupt requests in the PIR descriptor. This is 3362 * because the virtual APIC page pointed to by the VMCS cannot be 3363 * modified if the vcpu is running. 3364 */ 3365 idx = vector / 64; 3366 mask = 1UL << (vector % 64); 3367 atomic_set_long(&pir_desc->pir[idx], mask); 3368 notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); 3369 3370 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3371 level, "vmx_set_intr_ready"); 3372 return (notify); 3373 } 3374 3375 static int 3376 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3377 { 3378 struct vlapic_vtx *vlapic_vtx; 3379 struct pir_desc *pir_desc; 3380 struct LAPIC *lapic; 3381 uint64_t pending, pirval; 3382 uint32_t ppr, vpr; 3383 int i; 3384 3385 /* 3386 * This function is only expected to be called from the 'HLT' exit 3387 * handler which does not care about the vector that is pending. 3388 */ 3389 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3390 3391 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3392 pir_desc = vlapic_vtx->pir_desc; 3393 3394 pending = atomic_load_acq_long(&pir_desc->pending); 3395 if (!pending) { 3396 /* 3397 * While a virtual interrupt may have already been 3398 * processed the actual delivery maybe pending the 3399 * interruptibility of the guest. Recognize a pending 3400 * interrupt by reevaluating virtual interrupts 3401 * following Section 29.2.1 in the Intel SDM Volume 3. 3402 */ 3403 struct vm_exit *vmexit; 3404 uint8_t rvi, ppr; 3405 3406 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3407 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3408 ("vmx_pending_intr: exitcode not 'HLT'")); 3409 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3410 lapic = vlapic->apic_page; 3411 ppr = lapic->ppr & APIC_TPR_INT; 3412 if (rvi > ppr) { 3413 return (1); 3414 } 3415 3416 return (0); 3417 } 3418 3419 /* 3420 * If there is an interrupt pending then it will be recognized only 3421 * if its priority is greater than the processor priority. 3422 * 3423 * Special case: if the processor priority is zero then any pending 3424 * interrupt will be recognized. 
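 *
 * Otherwise scan the PIR from the highest 64-bit chunk down and
 * compare the priority of the most significant pending vector
 * against the PPR.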
3425 */ 3426 lapic = vlapic->apic_page; 3427 ppr = lapic->ppr & APIC_TPR_INT; 3428 if (ppr == 0) 3429 return (1); 3430 3431 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3432 lapic->ppr); 3433 3434 for (i = 3; i >= 0; i--) { 3435 pirval = pir_desc->pir[i]; 3436 if (pirval != 0) { 3437 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3438 return (vpr > ppr); 3439 } 3440 } 3441 return (0); 3442 } 3443 3444 static void 3445 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3446 { 3447 3448 panic("vmx_intr_accepted: not expected to be called"); 3449 } 3450 3451 static void 3452 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3453 { 3454 struct vlapic_vtx *vlapic_vtx; 3455 struct vmx *vmx; 3456 struct vmcs *vmcs; 3457 uint64_t mask, val; 3458 3459 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3460 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3461 ("vmx_set_tmr: vcpu cannot be running")); 3462 3463 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3464 vmx = vlapic_vtx->vmx; 3465 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3466 mask = 1UL << (vector % 64); 3467 3468 VMPTRLD(vmcs); 3469 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3470 if (level) 3471 val |= mask; 3472 else 3473 val &= ~mask; 3474 vmcs_write(VMCS_EOI_EXIT(vector), val); 3475 VMCLEAR(vmcs); 3476 } 3477 3478 static void 3479 vmx_enable_x2apic_mode(struct vlapic *vlapic) 3480 { 3481 struct vmx *vmx; 3482 struct vmcs *vmcs; 3483 uint32_t proc_ctls2; 3484 int vcpuid, error; 3485 3486 vcpuid = vlapic->vcpuid; 3487 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3488 vmcs = &vmx->vmcs[vcpuid]; 3489 3490 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 3491 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3492 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3493 3494 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3495 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3496 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 3497 3498 VMPTRLD(vmcs); 3499 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3500 VMCLEAR(vmcs); 3501 3502 if (vlapic->vcpuid == 0) { 3503 /* 3504 * The nested page table mappings are shared by all vcpus 3505 * so unmap the APIC access page just once. 3506 */ 3507 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3508 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3509 __func__, error)); 3510 3511 /* 3512 * The MSR bitmap is shared by all vcpus so modify it only 3513 * once in the context of vcpu 0. 3514 */ 3515 error = vmx_allow_x2apic_msrs(vmx); 3516 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3517 __func__, error)); 3518 } 3519 } 3520 3521 static void 3522 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3523 { 3524 3525 ipi_cpu(hostcpu, pirvec); 3526 } 3527 3528 /* 3529 * Transfer the pending interrupts in the PIR descriptor to the IRR 3530 * in the virtual APIC page. 
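 *
 * The PIR bits are read and cleared atomically because other host
 * cpus may still be posting vectors into the descriptor while this
 * vcpu is draining it.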
3531 */ 3532 static void 3533 vmx_inject_pir(struct vlapic *vlapic) 3534 { 3535 struct vlapic_vtx *vlapic_vtx; 3536 struct pir_desc *pir_desc; 3537 struct LAPIC *lapic; 3538 uint64_t val, pirval; 3539 int rvi, pirbase = -1; 3540 uint16_t intr_status_old, intr_status_new; 3541 3542 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3543 pir_desc = vlapic_vtx->pir_desc; 3544 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3545 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3546 "no posted interrupt pending"); 3547 return; 3548 } 3549 3550 pirval = 0; 3551 pirbase = -1; 3552 lapic = vlapic->apic_page; 3553 3554 val = atomic_readandclear_long(&pir_desc->pir[0]); 3555 if (val != 0) { 3556 lapic->irr0 |= val; 3557 lapic->irr1 |= val >> 32; 3558 pirbase = 0; 3559 pirval = val; 3560 } 3561 3562 val = atomic_readandclear_long(&pir_desc->pir[1]); 3563 if (val != 0) { 3564 lapic->irr2 |= val; 3565 lapic->irr3 |= val >> 32; 3566 pirbase = 64; 3567 pirval = val; 3568 } 3569 3570 val = atomic_readandclear_long(&pir_desc->pir[2]); 3571 if (val != 0) { 3572 lapic->irr4 |= val; 3573 lapic->irr5 |= val >> 32; 3574 pirbase = 128; 3575 pirval = val; 3576 } 3577 3578 val = atomic_readandclear_long(&pir_desc->pir[3]); 3579 if (val != 0) { 3580 lapic->irr6 |= val; 3581 lapic->irr7 |= val >> 32; 3582 pirbase = 192; 3583 pirval = val; 3584 } 3585 3586 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 3587 3588 /* 3589 * Update RVI so the processor can evaluate pending virtual 3590 * interrupts on VM-entry. 3591 * 3592 * It is possible for pirval to be 0 here, even though the 3593 * pending bit has been set. The scenario is: 3594 * CPU-Y is sending a posted interrupt to CPU-X, which 3595 * is running a guest and processing posted interrupts in h/w. 3596 * CPU-X will eventually exit and the state seen in s/w is 3597 * the pending bit set, but no PIR bits set. 
3598 * 3599 * CPU-X CPU-Y 3600 * (vm running) (host running) 3601 * rx posted interrupt 3602 * CLEAR pending bit 3603 * SET PIR bit 3604 * READ/CLEAR PIR bits 3605 * SET pending bit 3606 * (vm exit) 3607 * pending bit set, PIR 0 3608 */ 3609 if (pirval != 0) { 3610 rvi = pirbase + flsl(pirval) - 1; 3611 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 3612 intr_status_new = (intr_status_old & 0xFF00) | rvi; 3613 if (intr_status_new > intr_status_old) { 3614 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 3615 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3616 "guest_intr_status changed from 0x%04x to 0x%04x", 3617 intr_status_old, intr_status_new); 3618 } 3619 } 3620 } 3621 3622 static struct vlapic * 3623 vmx_vlapic_init(void *arg, int vcpuid) 3624 { 3625 struct vmx *vmx; 3626 struct vlapic *vlapic; 3627 struct vlapic_vtx *vlapic_vtx; 3628 3629 vmx = arg; 3630 3631 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 3632 vlapic->vm = vmx->vm; 3633 vlapic->vcpuid = vcpuid; 3634 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 3635 3636 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3637 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 3638 vlapic_vtx->vmx = vmx; 3639 3640 if (virtual_interrupt_delivery) { 3641 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 3642 vlapic->ops.pending_intr = vmx_pending_intr; 3643 vlapic->ops.intr_accepted = vmx_intr_accepted; 3644 vlapic->ops.set_tmr = vmx_set_tmr; 3645 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 3646 } 3647 3648 if (posted_interrupts) 3649 vlapic->ops.post_intr = vmx_post_intr; 3650 3651 vlapic_init(vlapic); 3652 3653 return (vlapic); 3654 } 3655 3656 static void 3657 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 3658 { 3659 3660 vlapic_cleanup(vlapic); 3661 free(vlapic, M_VLAPIC); 3662 } 3663 3664 struct vmm_ops vmm_ops_intel = { 3665 vmx_init, 3666 vmx_cleanup, 3667 vmx_restore, 3668 vmx_vminit, 3669 vmx_run, 3670 vmx_vmcleanup, 3671 vmx_getreg, 3672 vmx_setreg, 3673 vmx_getdesc, 3674 vmx_setdesc, 3675 vmx_getcap, 3676 vmx_setcap, 3677 ept_vmspace_alloc, 3678 ept_vmspace_free, 3679 vmx_vlapic_init, 3680 vmx_vlapic_cleanup, 3681 }; 3682