/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"
#include "vmx_controls.h"

#define	PINBASED_CTLS_ONE_SETTING		\
	(PINBASED_EXTINT_EXITING	|	\
	 PINBASED_NMI_EXITING		|	\
	 PINBASED_VIRTUAL_NMI)
#define	PINBASED_CTLS_ZERO_SETTING	0

#define	PROCBASED_CTLS_WINDOW_SETTING		\
	(PROCBASED_INT_WINDOW_EXITING	|	\
	 PROCBASED_NMI_WINDOW_EXITING)

#define	PROCBASED_CTLS_ONE_SETTING		\
	(PROCBASED_SECONDARY_CONTROLS	|	\
	 PROCBASED_MWAIT_EXITING	|	\
	 PROCBASED_MONITOR_EXITING	|	\
	 PROCBASED_IO_EXITING		|	\
	 PROCBASED_MSR_BITMAPS		|	\
	 PROCBASED_CTLS_WINDOW_SETTING	|	\
	 PROCBASED_CR8_LOAD_EXITING	|	\
	 PROCBASED_CR8_STORE_EXITING)
#define	PROCBASED_CTLS_ZERO_SETTING		\
	(PROCBASED_CR3_LOAD_EXITING |		\
	 PROCBASED_CR3_STORE_EXITING |		\
	 PROCBASED_IO_BITMAPS)

#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
#define	PROCBASED_CTLS2_ZERO_SETTING	0

#define	VM_EXIT_CTLS_ONE_SETTING		\
	(VM_EXIT_SAVE_DEBUG_CONTROLS	|	\
	 VM_EXIT_HOST_LMA		|	\
	 VM_EXIT_SAVE_EFER		|	\
	 VM_EXIT_LOAD_EFER		|	\
	 VM_EXIT_ACKNOWLEDGE_INTERRUPT)

#define	VM_EXIT_CTLS_ZERO_SETTING	0

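/*
 * Note: each *_ONE_SETTING/*_ZERO_SETTING pair above and below is handed to
 * vmx_set_ctlreg() from vmx_init(), which consults the corresponding VMX
 * capability MSRs and fails if any "one" bit cannot be set to 1 or any
 * "zero" bit cannot be cleared to 0.  For example, as done in vmx_init()
 * below:
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 *	    MSR_VMX_TRUE_PINBASED_CTLS,
 *	    PINBASED_CTLS_ONE_SETTING,
 *	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 */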
#define	VM_ENTRY_CTLS_ONE_SETTING		\
	(VM_ENTRY_LOAD_DEBUG_CONTROLS	|	\
	 VM_ENTRY_LOAD_EFER)

#define	VM_ENTRY_CTLS_ZERO_SETTING		\
	(VM_ENTRY_INTO_SMM		|	\
	 VM_ENTRY_DEACTIVATE_DUAL_MONITOR)

#define	HANDLED		1
#define	UNHANDLED	0

static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);

int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);

static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;

static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
    &cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
    &cr0_zeros_mask, 0, NULL);

static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
    &cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
    &cr4_zeros_mask, 0, NULL);

static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
    &vmx_initialized, 0, "Intel VMX initialized");

/*
 * Optional capabilities
 */
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);

static int cap_halt_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
    "HLT triggers a VM-exit");

static int cap_pause_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
    0, "PAUSE triggers a VM-exit");

static int cap_unrestricted_guest;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
    &cap_unrestricted_guest, 0, "Unrestricted guests");

static int cap_monitor_trap;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
    &cap_monitor_trap, 0, "Monitor trap flag");

static int cap_invpcid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
    0, "Guests are allowed to use INVPCID");

static int virtual_interrupt_delivery;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
    &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");

static int posted_interrupts;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
    &posted_interrupts, 0, "APICv posted interrupt support");

static int pirvec = -1;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
    &pirvec, 0, "APICv posted interrupt vector");

static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
    &vpid_alloc_failed, 0, NULL);

static int guest_l1d_flush;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
    &guest_l1d_flush, 0, NULL);
static int guest_l1d_flush_sw;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
    &guest_l1d_flush_sw, 0, NULL);

static struct msr_entry msr_load_list[1] __aligned(16);

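/*
 * Descriptive note: msr_load_list above serves as the VM-entry MSR-load area
 * when the hardware IA32_FLUSH_CMD based L1D flush is in use; see the
 * guest_l1d_flush handling in vmx_init() and the VMCS_ENTRY_MSR_LOAD setup
 * in vmx_vminit().
 */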
/*
 * The definitions of SDT probes for VMX.
 */

SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
    "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");

SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
    "struct vmx *", "int", "struct vm_exit *", "uint64_t");

SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
    "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
    "struct vmx *", "int", "struct vm_exit *");

SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
    "struct vmx *", "int", "struct vm_exit *", "uint32_t");

SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
    "struct vmx *", "int", "struct vm_exit *", "int");

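/*
 * The definitions above correspond to DTrace probes named
 * vmm:vmx:exit:<name>.  An (untested) sketch of a one-liner that counts
 * VM-exits by whether they were handled in the kernel (arg3 of the "return"
 * probe):
 *
 *	dtrace -n 'vmm:vmx:exit:return { @handled[arg3] = count(); }'
 */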
/*
 * Use the last page below 4GB as the APIC access address. This address is
 * occupied by the boot firmware so it is guaranteed that it will not conflict
 * with a page in system memory.
 */
#define	APIC_ACCESS_ADDRESS	0xFFFFF000

static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);

#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
	static char reasonbuf[32];

	switch (reason) {
	case EXIT_REASON_EXCEPTION:
		return "exception";
	case EXIT_REASON_EXT_INTR:
		return "extint";
	case EXIT_REASON_TRIPLE_FAULT:
		return "triplefault";
	case EXIT_REASON_INIT:
		return "init";
	case EXIT_REASON_SIPI:
		return "sipi";
	case EXIT_REASON_IO_SMI:
		return "iosmi";
	case EXIT_REASON_SMI:
		return "smi";
	case EXIT_REASON_INTR_WINDOW:
		return "intrwindow";
	case EXIT_REASON_NMI_WINDOW:
		return "nmiwindow";
	case EXIT_REASON_TASK_SWITCH:
		return "taskswitch";
	case EXIT_REASON_CPUID:
		return "cpuid";
	case EXIT_REASON_GETSEC:
		return "getsec";
	case EXIT_REASON_HLT:
		return "hlt";
	case EXIT_REASON_INVD:
		return "invd";
	case EXIT_REASON_INVLPG:
		return "invlpg";
	case EXIT_REASON_RDPMC:
		return "rdpmc";
	case EXIT_REASON_RDTSC:
		return "rdtsc";
	case EXIT_REASON_RSM:
		return "rsm";
	case EXIT_REASON_VMCALL:
		return "vmcall";
	case EXIT_REASON_VMCLEAR:
		return "vmclear";
	case EXIT_REASON_VMLAUNCH:
		return "vmlaunch";
	case EXIT_REASON_VMPTRLD:
		return "vmptrld";
	case EXIT_REASON_VMPTRST:
		return "vmptrst";
	case EXIT_REASON_VMREAD:
		return "vmread";
	case EXIT_REASON_VMRESUME:
		return "vmresume";
	case EXIT_REASON_VMWRITE:
		return "vmwrite";
	case EXIT_REASON_VMXOFF:
		return "vmxoff";
	case EXIT_REASON_VMXON:
		return "vmxon";
	case EXIT_REASON_CR_ACCESS:
		return "craccess";
	case EXIT_REASON_DR_ACCESS:
		return "draccess";
	case EXIT_REASON_INOUT:
		return "inout";
	case EXIT_REASON_RDMSR:
		return "rdmsr";
	case EXIT_REASON_WRMSR:
		return "wrmsr";
	case EXIT_REASON_INVAL_VMCS:
		return "invalvmcs";
	case EXIT_REASON_INVAL_MSR:
		return "invalmsr";
	case EXIT_REASON_MWAIT:
		return "mwait";
	case EXIT_REASON_MTF:
		return "mtf";
	case EXIT_REASON_MONITOR:
		return "monitor";
	case EXIT_REASON_PAUSE:
		return "pause";
	case EXIT_REASON_MCE_DURING_ENTRY:
		return "mce-during-entry";
	case EXIT_REASON_TPR:
		return "tpr";
	case EXIT_REASON_APIC_ACCESS:
		return "apic-access";
	case EXIT_REASON_GDTR_IDTR:
		return "gdtridtr";
	case EXIT_REASON_LDTR_TR:
		return "ldtrtr";
	case EXIT_REASON_EPT_FAULT:
		return "eptfault";
	case EXIT_REASON_EPT_MISCONFIG:
		return "eptmisconfig";
	case EXIT_REASON_INVEPT:
		return "invept";
	case EXIT_REASON_RDTSCP:
		return "rdtscp";
	case EXIT_REASON_VMX_PREEMPT:
		return "vmxpreempt";
	case EXIT_REASON_INVVPID:
		return "invvpid";
	case EXIT_REASON_WBINVD:
		return "wbinvd";
	case EXIT_REASON_XSETBV:
		return "xsetbv";
	case EXIT_REASON_APIC_WRITE:
		return "apic-write";
	default:
		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
		return (reasonbuf);
	}
}
#endif	/* KTR */

static int
vmx_allow_x2apic_msrs(struct vmx *vmx)
{
	int i, error;

	error = 0;

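	/*
	 * Note: guest_msr_ro() and guest_msr_rw() adjust this VM's MSR
	 * bitmap; "ro" lets the guest read the MSR directly while writes
	 * still cause a VM-exit, and "rw" allows both reads and writes
	 * through without exiting.
	 */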
	/*
	 * Allow readonly access to the following x2APIC MSRs from the guest.
	 */
	error += guest_msr_ro(vmx, MSR_APIC_ID);
	error += guest_msr_ro(vmx, MSR_APIC_VERSION);
	error += guest_msr_ro(vmx, MSR_APIC_LDR);
	error += guest_msr_ro(vmx, MSR_APIC_SVR);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);

	for (i = 0; i < 8; i++)
		error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);

	error += guest_msr_ro(vmx, MSR_APIC_ESR);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
	error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
	error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
	error += guest_msr_ro(vmx, MSR_APIC_ICR);

	/*
	 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
	 *
	 * These registers get special treatment described in the section
	 * "Virtualizing MSR-Based APIC Accesses".
	 */
	error += guest_msr_rw(vmx, MSR_APIC_TPR);
	error += guest_msr_rw(vmx, MSR_APIC_EOI);
	error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);

	return (error);
}

u_long
vmx_fix_cr0(u_long cr0)
{

	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}

u_long
vmx_fix_cr4(u_long cr4)
{

	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}

static void
vpid_free(int vpid)
{
	if (vpid < 0 || vpid > 0xffff)
		panic("vpid_free: invalid vpid %d", vpid);

	/*
	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
	 * the unit number allocator.
	 */

	if (vpid > VM_MAXCPU)
		free_unr(vpid_unr, vpid);
}

static void
vpid_alloc(uint16_t *vpid, int num)
{
	int i, x;

	if (num <= 0 || num > VM_MAXCPU)
		panic("invalid number of vpids requested: %d", num);

	/*
	 * If the "enable vpid" execution control is not enabled then the
	 * VPID is required to be 0 for all vcpus.
	 */
	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
		for (i = 0; i < num; i++)
			vpid[i] = 0;
		return;
	}

	/*
	 * Allocate a unique VPID for each vcpu from the unit number allocator.
	 */
	for (i = 0; i < num; i++) {
		x = alloc_unr(vpid_unr);
		if (x == -1)
			break;
		else
			vpid[i] = x;
	}

	if (i < num) {
		atomic_add_int(&vpid_alloc_failed, 1);

		/*
		 * If the unit number allocator does not have enough unique
		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
		 *
		 * These VPIDs are not unique across VMs but this does not
		 * affect correctness because the combined mappings are also
		 * tagged with the EP4TA which is unique for each VM.
		 *
		 * It is still sub-optimal because the invvpid will invalidate
		 * combined mappings for a particular VPID across all EP4TAs.
		 */
		while (i-- > 0)
			vpid_free(vpid[i]);

		for (i = 0; i < num; i++)
			vpid[i] = i + 1;
	}
}

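/*
 * Illustrative example with hypothetical numbers: if VM_MAXCPU were 16, the
 * unit number allocator would hand out VPIDs in [17, 0xffff]; once it is
 * exhausted, vpid_alloc() falls back to the shared overflow range [1, 16],
 * trading some invvpid efficiency for correctness as described above.
 */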
static void
vpid_init(void)
{
	/*
	 * VPID 0 is required when the "enable VPID" execution control is
	 * disabled.
	 *
	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
	 * unit number allocator does not have sufficient unique VPIDs to
	 * satisfy the allocation.
	 *
	 * The remaining VPIDs are managed by the unit number allocator.
	 */
	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}

static void
vmx_disable(void *arg __unused)
{
	struct invvpid_desc invvpid_desc = { 0 };
	struct invept_desc invept_desc = { 0 };

	if (vmxon_enabled[curcpu]) {
		/*
		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
		 *
		 * VMXON or VMXOFF are not required to invalidate any TLB
		 * caching structures, so invalidate them explicitly here to
		 * prevent potential retention of cached information in the
		 * TLB between distinct VMX episodes.
		 */
		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
		vmxoff();
	}
	load_cr4(rcr4() & ~CR4_VMXE);
}

static int
vmx_cleanup(void)
{

	if (pirvec >= 0)
		lapic_ipi_free(pirvec);

	if (vpid_unr != NULL) {
		delete_unrhdr(vpid_unr);
		vpid_unr = NULL;
	}

	if (nmi_flush_l1d_sw == 1)
		nmi_flush_l1d_sw = 0;

	smp_rendezvous(NULL, vmx_disable, NULL, NULL);

	return (0);
}

static void
vmx_enable(void *arg __unused)
{
	int error;
	uint64_t feature_control;

	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		wrmsr(MSR_IA32_FEATURE_CONTROL,
		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
		    IA32_FEATURE_CONTROL_LOCK);
	}

	load_cr4(rcr4() | CR4_VMXE);

	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
	error = vmxon(vmxon_region[curcpu]);
	if (error == 0)
		vmxon_enabled[curcpu] = 1;
}

static void
vmx_restore(void)
{

	if (vmxon_enabled[curcpu])
		vmxon(vmxon_region[curcpu]);
}

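/*
 * VMX root operation is turned on and off on every host CPU via
 * smp_rendezvous(): vmx_init() runs vmx_enable(), which writes the VMCS
 * revision identifier into that CPU's vmxon_region before executing VMXON,
 * and vmx_cleanup() runs vmx_disable() to undo this with VMXOFF.
 */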
static int
vmx_init(int ipinum)
{
	int error, use_tpr_shadow;
	uint64_t basic, fixed0, fixed1, feature_control;
	uint32_t tmp, procbased2_vid_bits;

	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
	if (!(cpu_feature2 & CPUID2_VMX)) {
		printf("vmx_init: processor does not support VMX operation\n");
		return (ENXIO);
	}

	/*
	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
	 * are set (bits 0 and 2 respectively).
	 */
	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
		printf("vmx_init: VMX operation disabled by BIOS\n");
		return (ENXIO);
	}

	/*
	 * Verify capabilities MSR_VMX_BASIC:
	 * - bit 54 indicates support for INS/OUTS decoding
	 */
	basic = rdmsr(MSR_VMX_BASIC);
	if ((basic & (1UL << 54)) == 0) {
		printf("vmx_init: processor does not support desired basic "
		    "capabilities\n");
		return (EINVAL);
	}

	/* Check support for primary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_CTLS_ONE_SETTING,
	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired primary "
		    "processor-based controls\n");
		return (error);
	}

	/* Clear the processor-based ctl bits that are set on demand */
	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;

	/* Check support for secondary processor-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED_CTLS2_ONE_SETTING,
	    PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
	if (error) {
		printf("vmx_init: processor does not support desired secondary "
		    "processor-based controls\n");
		return (error);
	}

	/* Check support for VPID */
	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_ENABLE_VPID, 0, &tmp);
	if (error == 0)
		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;

	/* Check support for pin-based VM-execution controls */
	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
	    MSR_VMX_TRUE_PINBASED_CTLS,
	    PINBASED_CTLS_ONE_SETTING,
	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "pin-based controls\n");
		return (error);
	}

	/* Check support for VM-exit controls */
	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
	    VM_EXIT_CTLS_ONE_SETTING,
	    VM_EXIT_CTLS_ZERO_SETTING,
	    &exit_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "exit controls\n");
		return (error);
	}

	/* Check support for VM-entry controls */
	error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
	    VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
	    &entry_ctls);
	if (error) {
		printf("vmx_init: processor does not support desired "
		    "entry controls\n");
		return (error);
	}

	/*
	 * Check support for optional features by testing them
	 * as individual bits
	 */
	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_HLT_EXITING, 0,
	    &tmp) == 0);

	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_PROCBASED_CTLS,
	    PROCBASED_MTF, 0,
	    &tmp) == 0);

	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_PAUSE_EXITING, 0,
	    &tmp) == 0);

	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2,
	    PROCBASED2_UNRESTRICTED_GUEST, 0,
	    &tmp) == 0);

	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
	    &tmp) == 0);

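	/*
	 * Note: the APICv features probed below can also be disabled
	 * administratively through the loader tunables
	 * hw.vmm.vmx.use_apic_vid and hw.vmm.vmx.use_apic_pir (see the
	 * TUNABLE_INT_FETCH calls below).
	 */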
	/*
	 * Check support for virtual interrupt delivery.
	 */
	procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
	    PROCBASED2_VIRTUALIZE_X2APIC_MODE |
	    PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
	    PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);

	use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
	    &tmp) == 0);

	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
	    procbased2_vid_bits, 0, &tmp);
	if (error == 0 && use_tpr_shadow) {
		virtual_interrupt_delivery = 1;
		TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
		    &virtual_interrupt_delivery);
	}

	if (virtual_interrupt_delivery) {
		procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
		procbased_ctls2 |= procbased2_vid_bits;
		procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;

		/*
		 * No need to emulate accesses to %CR8 if virtual
		 * interrupt delivery is enabled.
		 */
		procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
		procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;

		/*
		 * Check for Posted Interrupts only if Virtual Interrupt
		 * Delivery is enabled.
		 */
		error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
		    &tmp);
		if (error == 0) {
			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
			    &IDTVEC(justreturn));
			if (pirvec < 0) {
				if (bootverbose) {
					printf("vmx_init: unable to allocate "
					    "posted interrupt vector\n");
				}
			} else {
				posted_interrupts = 1;
				TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
				    &posted_interrupts);
			}
		}
	}

	if (posted_interrupts)
		pinbased_ctls |= PINBASED_POSTED_INTERRUPT;

	/* Initialize EPT */
	error = ept_init(ipinum);
	if (error) {
		printf("vmx_init: ept initialization failed (%d)\n", error);
		return (error);
	}

	guest_l1d_flush = (cpu_ia32_arch_caps &
	    IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);

	/*
	 * If L1D cache flushing is enabled, use the IA32_FLUSH_CMD MSR when
	 * available.  Otherwise fall back to the software flush method which
	 * loads enough data from the kernel text to flush existing L1D
	 * content, both on VMX entry and on NMI return.
	 */
	if (guest_l1d_flush) {
		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
			guest_l1d_flush_sw = 1;
			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
			    &guest_l1d_flush_sw);
		}
		if (guest_l1d_flush_sw) {
			if (nmi_flush_l1d_sw <= 1)
				nmi_flush_l1d_sw = 1;
		} else {
			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
		}
	}

	/*
	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
	 */
	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
	cr0_ones_mask = fixed0 & fixed1;
	cr0_zeros_mask = ~fixed0 & ~fixed1;

	/*
	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
	 * if unrestricted guest execution is allowed.
	 */
	if (cap_unrestricted_guest)
		cr0_ones_mask &= ~(CR0_PG | CR0_PE);

	/*
	 * Do not allow the guest to set CR0_NW or CR0_CD.
860 */ 861 cr0_zeros_mask |= (CR0_NW | CR0_CD); 862 863 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 864 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 865 cr4_ones_mask = fixed0 & fixed1; 866 cr4_zeros_mask = ~fixed0 & ~fixed1; 867 868 vpid_init(); 869 870 vmx_msr_init(); 871 872 /* enable VMX operation */ 873 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 874 875 vmx_initialized = 1; 876 877 return (0); 878 } 879 880 static void 881 vmx_trigger_hostintr(int vector) 882 { 883 uintptr_t func; 884 struct gate_descriptor *gd; 885 886 gd = &idt[vector]; 887 888 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 889 "invalid vector %d", vector)); 890 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 891 vector)); 892 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 893 "has invalid type %d", vector, gd->gd_type)); 894 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 895 "has invalid dpl %d", vector, gd->gd_dpl)); 896 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 897 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 898 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 899 "IST %d", vector, gd->gd_ist)); 900 901 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 902 vmx_call_isr(func); 903 } 904 905 static int 906 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 907 { 908 int error, mask_ident, shadow_ident; 909 uint64_t mask_value; 910 911 if (which != 0 && which != 4) 912 panic("vmx_setup_cr_shadow: unknown cr%d", which); 913 914 if (which == 0) { 915 mask_ident = VMCS_CR0_MASK; 916 mask_value = cr0_ones_mask | cr0_zeros_mask; 917 shadow_ident = VMCS_CR0_SHADOW; 918 } else { 919 mask_ident = VMCS_CR4_MASK; 920 mask_value = cr4_ones_mask | cr4_zeros_mask; 921 shadow_ident = VMCS_CR4_SHADOW; 922 } 923 924 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 925 if (error) 926 return (error); 927 928 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 929 if (error) 930 return (error); 931 932 return (0); 933 } 934 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 935 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 936 937 static void * 938 vmx_vminit(struct vm *vm, pmap_t pmap) 939 { 940 uint16_t vpid[VM_MAXCPU]; 941 int i, error; 942 struct vmx *vmx; 943 struct vmcs *vmcs; 944 uint32_t exc_bitmap; 945 946 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 947 if ((uintptr_t)vmx & PAGE_MASK) { 948 panic("malloc of struct vmx not aligned on %d byte boundary", 949 PAGE_SIZE); 950 } 951 vmx->vm = vm; 952 953 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 954 955 /* 956 * Clean up EPTP-tagged guest physical and combined mappings 957 * 958 * VMX transitions are not required to invalidate any guest physical 959 * mappings. So, it may be possible for stale guest physical mappings 960 * to be present in the processor TLBs. 961 * 962 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 963 */ 964 ept_invalidate_mappings(vmx->eptp); 965 966 msr_bitmap_initialize(vmx->msr_bitmap); 967 968 /* 969 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 970 * The guest FSBASE and GSBASE are saved and restored during 971 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 972 * always restored from the vmcs host state area on vm-exit. 
973 * 974 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 975 * how they are saved/restored so can be directly accessed by the 976 * guest. 977 * 978 * MSR_EFER is saved and restored in the guest VMCS area on a 979 * VM exit and entry respectively. It is also restored from the 980 * host VMCS area on a VM exit. 981 * 982 * The TSC MSR is exposed read-only. Writes are disallowed as 983 * that will impact the host TSC. If the guest does a write 984 * the "use TSC offsetting" execution control is enabled and the 985 * difference between the host TSC and the guest TSC is written 986 * into the TSC offset in the VMCS. 987 */ 988 if (guest_msr_rw(vmx, MSR_GSBASE) || 989 guest_msr_rw(vmx, MSR_FSBASE) || 990 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 991 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 992 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 993 guest_msr_rw(vmx, MSR_EFER) || 994 guest_msr_ro(vmx, MSR_TSC)) 995 panic("vmx_vminit: error setting guest msr access"); 996 997 vpid_alloc(vpid, VM_MAXCPU); 998 999 if (virtual_interrupt_delivery) { 1000 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 1001 APIC_ACCESS_ADDRESS); 1002 /* XXX this should really return an error to the caller */ 1003 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 1004 } 1005 1006 for (i = 0; i < VM_MAXCPU; i++) { 1007 vmcs = &vmx->vmcs[i]; 1008 vmcs->identifier = vmx_revision(); 1009 error = vmclear(vmcs); 1010 if (error != 0) { 1011 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 1012 error, i); 1013 } 1014 1015 vmx_msr_guest_init(vmx, i); 1016 1017 error = vmcs_init(vmcs); 1018 KASSERT(error == 0, ("vmcs_init error %d", error)); 1019 1020 VMPTRLD(vmcs); 1021 error = 0; 1022 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 1023 error += vmwrite(VMCS_EPTP, vmx->eptp); 1024 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 1025 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 1026 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 1027 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 1028 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1029 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 1030 error += vmwrite(VMCS_VPID, vpid[i]); 1031 1032 if (guest_l1d_flush && !guest_l1d_flush_sw) { 1033 vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( 1034 (vm_offset_t)&msr_load_list[0])); 1035 vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, 1036 nitems(msr_load_list)); 1037 vmcs_write(VMCS_EXIT_MSR_STORE, 0); 1038 vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); 1039 } 1040 1041 /* exception bitmap */ 1042 if (vcpu_trace_exceptions(vm, i)) 1043 exc_bitmap = 0xffffffff; 1044 else 1045 exc_bitmap = 1 << IDT_MC; 1046 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); 1047 1048 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; 1049 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); 1050 1051 if (virtual_interrupt_delivery) { 1052 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 1053 error += vmwrite(VMCS_VIRTUAL_APIC, 1054 vtophys(&vmx->apic_page[i])); 1055 error += vmwrite(VMCS_EOI_EXIT0, 0); 1056 error += vmwrite(VMCS_EOI_EXIT1, 0); 1057 error += vmwrite(VMCS_EOI_EXIT2, 0); 1058 error += vmwrite(VMCS_EOI_EXIT3, 0); 1059 } 1060 if (posted_interrupts) { 1061 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 1062 error += vmwrite(VMCS_PIR_DESC, 1063 vtophys(&vmx->pir_desc[i])); 1064 } 1065 VMCLEAR(vmcs); 1066 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 1067 1068 vmx->cap[i].set = 0; 1069 vmx->cap[i].proc_ctls = procbased_ctls; 1070 vmx->cap[i].proc_ctls2 = 
procbased_ctls2; 1071 1072 vmx->state[i].nextrip = ~0; 1073 vmx->state[i].lastcpu = NOCPU; 1074 vmx->state[i].vpid = vpid[i]; 1075 1076 /* 1077 * Set up the CR0/4 shadows, and init the read shadow 1078 * to the power-on register value from the Intel Sys Arch. 1079 * CR0 - 0x60000010 1080 * CR4 - 0 1081 */ 1082 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 1083 if (error != 0) 1084 panic("vmx_setup_cr0_shadow %d", error); 1085 1086 error = vmx_setup_cr4_shadow(vmcs, 0); 1087 if (error != 0) 1088 panic("vmx_setup_cr4_shadow %d", error); 1089 1090 vmx->ctx[i].pmap = pmap; 1091 } 1092 1093 return (vmx); 1094 } 1095 1096 static int 1097 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 1098 { 1099 int handled, func; 1100 1101 func = vmxctx->guest_rax; 1102 1103 handled = x86_emulate_cpuid(vm, vcpu, 1104 (uint32_t*)(&vmxctx->guest_rax), 1105 (uint32_t*)(&vmxctx->guest_rbx), 1106 (uint32_t*)(&vmxctx->guest_rcx), 1107 (uint32_t*)(&vmxctx->guest_rdx)); 1108 return (handled); 1109 } 1110 1111 static __inline void 1112 vmx_run_trace(struct vmx *vmx, int vcpu) 1113 { 1114 #ifdef KTR 1115 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1116 #endif 1117 } 1118 1119 static __inline void 1120 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 1121 int handled) 1122 { 1123 #ifdef KTR 1124 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 1125 handled ? "handled" : "unhandled", 1126 exit_reason_to_str(exit_reason), rip); 1127 #endif 1128 } 1129 1130 static __inline void 1131 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 1132 { 1133 #ifdef KTR 1134 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 1135 #endif 1136 } 1137 1138 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1139 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); 1140 1141 /* 1142 * Invalidate guest mappings identified by its vpid from the TLB. 1143 */ 1144 static __inline void 1145 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) 1146 { 1147 struct vmxstate *vmxstate; 1148 struct invvpid_desc invvpid_desc; 1149 1150 vmxstate = &vmx->state[vcpu]; 1151 if (vmxstate->vpid == 0) 1152 return; 1153 1154 if (!running) { 1155 /* 1156 * Set the 'lastcpu' to an invalid host cpu. 1157 * 1158 * This will invalidate TLB entries tagged with the vcpu's 1159 * vpid the next time it runs via vmx_set_pcpu_defaults(). 1160 */ 1161 vmxstate->lastcpu = NOCPU; 1162 return; 1163 } 1164 1165 KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " 1166 "critical section", __func__, vcpu)); 1167 1168 /* 1169 * Invalidate all mappings tagged with 'vpid' 1170 * 1171 * We do this because this vcpu was executing on a different host 1172 * cpu when it last ran. We do not track whether it invalidated 1173 * mappings associated with its 'vpid' during that run. So we must 1174 * assume that the mappings associated with 'vpid' on 'curcpu' are 1175 * stale and invalidate them. 1176 * 1177 * Note that we incur this penalty only when the scheduler chooses to 1178 * move the thread associated with this vcpu between host cpus. 1179 * 1180 * Note also that this will invalidate mappings tagged with 'vpid' 1181 * for "all" EP4TAs. 
1182 */ 1183 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { 1184 invvpid_desc._res1 = 0; 1185 invvpid_desc._res2 = 0; 1186 invvpid_desc.vpid = vmxstate->vpid; 1187 invvpid_desc.linear_addr = 0; 1188 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1189 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); 1190 } else { 1191 /* 1192 * The invvpid can be skipped if an invept is going to 1193 * be performed before entering the guest. The invept 1194 * will invalidate combined mappings tagged with 1195 * 'vmx->eptp' for all vpids. 1196 */ 1197 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1198 } 1199 } 1200 1201 static void 1202 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) 1203 { 1204 struct vmxstate *vmxstate; 1205 1206 vmxstate = &vmx->state[vcpu]; 1207 if (vmxstate->lastcpu == curcpu) 1208 return; 1209 1210 vmxstate->lastcpu = curcpu; 1211 1212 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 1213 1214 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1215 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1216 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1217 vmx_invvpid(vmx, vcpu, pmap, 1); 1218 } 1219 1220 /* 1221 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1222 */ 1223 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1224 1225 static void __inline 1226 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1227 { 1228 1229 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1230 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1231 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1232 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1233 } 1234 } 1235 1236 static void __inline 1237 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1238 { 1239 1240 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1241 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1242 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1243 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1244 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1245 } 1246 1247 static void __inline 1248 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1249 { 1250 1251 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1252 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1253 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1254 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1255 } 1256 } 1257 1258 static void __inline 1259 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1260 { 1261 1262 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1263 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1264 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1265 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1266 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1267 } 1268 1269 int 1270 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) 1271 { 1272 int error; 1273 1274 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { 1275 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; 1276 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1277 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); 1278 } 1279 1280 error = vmwrite(VMCS_TSC_OFFSET, offset); 1281 1282 return (error); 1283 } 1284 1285 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 
\ 1286 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1287 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1288 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1289 1290 static void 1291 vmx_inject_nmi(struct vmx *vmx, int vcpu) 1292 { 1293 uint32_t gi, info; 1294 1295 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1296 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1297 "interruptibility-state %#x", gi)); 1298 1299 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1300 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " 1301 "VM-entry interruption information %#x", info)); 1302 1303 /* 1304 * Inject the virtual NMI. The vector must be the NMI IDT entry 1305 * or the VMCS entry check will fail. 1306 */ 1307 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1308 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1309 1310 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1311 1312 /* Clear the request */ 1313 vm_nmi_clear(vmx->vm, vcpu); 1314 } 1315 1316 static void 1317 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, 1318 uint64_t guestrip) 1319 { 1320 int vector, need_nmi_exiting, extint_pending; 1321 uint64_t rflags, entryinfo; 1322 uint32_t gi, info; 1323 1324 if (vmx->state[vcpu].nextrip != guestrip) { 1325 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1326 if (gi & HWINTR_BLOCKING) { 1327 VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " 1328 "cleared due to rip change: %#lx/%#lx", 1329 vmx->state[vcpu].nextrip, guestrip); 1330 gi &= ~HWINTR_BLOCKING; 1331 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1332 } 1333 } 1334 1335 if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { 1336 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " 1337 "intinfo is not valid: %#lx", __func__, entryinfo)); 1338 1339 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1340 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1341 "pending exception: %#lx/%#x", __func__, entryinfo, info)); 1342 1343 info = entryinfo; 1344 vector = info & 0xff; 1345 if (vector == IDT_BP || vector == IDT_OF) { 1346 /* 1347 * VT-x requires #BP and #OF to be injected as software 1348 * exceptions. 1349 */ 1350 info &= ~VMCS_INTR_T_MASK; 1351 info |= VMCS_INTR_T_SWEXCEPTION; 1352 } 1353 1354 if (info & VMCS_INTR_DEL_ERRCODE) 1355 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); 1356 1357 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1358 } 1359 1360 if (vm_nmi_pending(vmx->vm, vcpu)) { 1361 /* 1362 * If there are no conditions blocking NMI injection then 1363 * inject it directly here otherwise enable "NMI window 1364 * exiting" to inject it as soon as we can. 1365 * 1366 * We also check for STI_BLOCKING because some implementations 1367 * don't allow NMI injection in this case. If we are running 1368 * on a processor that doesn't have this restriction it will 1369 * immediately exit and the NMI will be injected in the 1370 * "NMI window exiting" handler. 
1371 */ 1372 need_nmi_exiting = 1; 1373 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1374 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1375 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1376 if ((info & VMCS_INTR_VALID) == 0) { 1377 vmx_inject_nmi(vmx, vcpu); 1378 need_nmi_exiting = 0; 1379 } else { 1380 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1381 "due to VM-entry intr info %#x", info); 1382 } 1383 } else { 1384 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1385 "Guest Interruptibility-state %#x", gi); 1386 } 1387 1388 if (need_nmi_exiting) 1389 vmx_set_nmi_window_exiting(vmx, vcpu); 1390 } 1391 1392 extint_pending = vm_extint_pending(vmx->vm, vcpu); 1393 1394 if (!extint_pending && virtual_interrupt_delivery) { 1395 vmx_inject_pir(vlapic); 1396 return; 1397 } 1398 1399 /* 1400 * If interrupt-window exiting is already in effect then don't bother 1401 * checking for pending interrupts. This is just an optimization and 1402 * not needed for correctness. 1403 */ 1404 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1405 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1406 "pending int_window_exiting"); 1407 return; 1408 } 1409 1410 if (!extint_pending) { 1411 /* Ask the local apic for a vector to inject */ 1412 if (!vlapic_pending_intr(vlapic, &vector)) 1413 return; 1414 1415 /* 1416 * From the Intel SDM, Volume 3, Section "Maskable 1417 * Hardware Interrupts": 1418 * - maskable interrupt vectors [16,255] can be delivered 1419 * through the local APIC. 1420 */ 1421 KASSERT(vector >= 16 && vector <= 255, 1422 ("invalid vector %d from local APIC", vector)); 1423 } else { 1424 /* Ask the legacy pic for a vector to inject */ 1425 vatpic_pending_intr(vmx->vm, &vector); 1426 1427 /* 1428 * From the Intel SDM, Volume 3, Section "Maskable 1429 * Hardware Interrupts": 1430 * - maskable interrupt vectors [0,255] can be delivered 1431 * through the INTR pin. 1432 */ 1433 KASSERT(vector >= 0 && vector <= 255, 1434 ("invalid vector %d from INTR", vector)); 1435 } 1436 1437 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1438 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1439 if ((rflags & PSL_I) == 0) { 1440 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1441 "rflags %#lx", vector, rflags); 1442 goto cantinject; 1443 } 1444 1445 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1446 if (gi & HWINTR_BLOCKING) { 1447 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1448 "Guest Interruptibility-state %#x", vector, gi); 1449 goto cantinject; 1450 } 1451 1452 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1453 if (info & VMCS_INTR_VALID) { 1454 /* 1455 * This is expected and could happen for multiple reasons: 1456 * - A vectoring VM-entry was aborted due to astpending 1457 * - A VM-exit happened during event injection. 1458 * - An exception was injected above. 1459 * - An NMI was injected above or after "NMI window exiting" 1460 */ 1461 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1462 "VM-entry intr info %#x", vector, info); 1463 goto cantinject; 1464 } 1465 1466 /* Inject the interrupt */ 1467 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1468 info |= vector; 1469 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1470 1471 if (!extint_pending) { 1472 /* Update the Local APIC ISR */ 1473 vlapic_intr_accepted(vlapic, vector); 1474 } else { 1475 vm_extint_clear(vmx->vm, vcpu); 1476 vatpic_intr_accepted(vmx->vm, vector); 1477 1478 /* 1479 * After we accepted the current ExtINT the PIC may 1480 * have posted another one. 
If that is the case, set 1481 * the Interrupt Window Exiting execution control so 1482 * we can inject that one too. 1483 * 1484 * Also, interrupt window exiting allows us to inject any 1485 * pending APIC vector that was preempted by the ExtINT 1486 * as soon as possible. This applies both for the software 1487 * emulated vlapic and the hardware assisted virtual APIC. 1488 */ 1489 vmx_set_int_window_exiting(vmx, vcpu); 1490 } 1491 1492 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1493 1494 return; 1495 1496 cantinject: 1497 /* 1498 * Set the Interrupt Window Exiting execution control so we can inject 1499 * the interrupt as soon as blocking condition goes away. 1500 */ 1501 vmx_set_int_window_exiting(vmx, vcpu); 1502 } 1503 1504 /* 1505 * If the Virtual NMIs execution control is '1' then the logical processor 1506 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1507 * the VMCS. An IRET instruction in VMX non-root operation will remove any 1508 * virtual-NMI blocking. 1509 * 1510 * This unblocking occurs even if the IRET causes a fault. In this case the 1511 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 1512 */ 1513 static void 1514 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) 1515 { 1516 uint32_t gi; 1517 1518 VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); 1519 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1520 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1521 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1522 } 1523 1524 static void 1525 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) 1526 { 1527 uint32_t gi; 1528 1529 VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); 1530 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1531 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1532 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1533 } 1534 1535 static void 1536 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) 1537 { 1538 uint32_t gi; 1539 1540 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1541 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, 1542 ("NMI blocking is not in effect %#x", gi)); 1543 } 1544 1545 static int 1546 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1547 { 1548 struct vmxctx *vmxctx; 1549 uint64_t xcrval; 1550 const struct xsave_limits *limits; 1551 1552 vmxctx = &vmx->ctx[vcpu]; 1553 limits = vmm_get_xsave_limits(); 1554 1555 /* 1556 * Note that the processor raises a GP# fault on its own if 1557 * xsetbv is executed for CPL != 0, so we do not have to 1558 * emulate that fault here. 1559 */ 1560 1561 /* Only xcr0 is supported. */ 1562 if (vmxctx->guest_rcx != 0) { 1563 vm_inject_gp(vmx->vm, vcpu); 1564 return (HANDLED); 1565 } 1566 1567 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1568 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1569 vm_inject_ud(vmx->vm, vcpu); 1570 return (HANDLED); 1571 } 1572 1573 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1574 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1575 vm_inject_gp(vmx->vm, vcpu); 1576 return (HANDLED); 1577 } 1578 1579 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1580 vm_inject_gp(vmx->vm, vcpu); 1581 return (HANDLED); 1582 } 1583 1584 /* AVX (YMM_Hi128) requires SSE. 
*/ 1585 if (xcrval & XFEATURE_ENABLED_AVX && 1586 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1587 vm_inject_gp(vmx->vm, vcpu); 1588 return (HANDLED); 1589 } 1590 1591 /* 1592 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1593 * ZMM_Hi256, and Hi16_ZMM. 1594 */ 1595 if (xcrval & XFEATURE_AVX512 && 1596 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1597 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1598 vm_inject_gp(vmx->vm, vcpu); 1599 return (HANDLED); 1600 } 1601 1602 /* 1603 * Intel MPX requires both bound register state flags to be 1604 * set. 1605 */ 1606 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1607 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1608 vm_inject_gp(vmx->vm, vcpu); 1609 return (HANDLED); 1610 } 1611 1612 /* 1613 * This runs "inside" vmrun() with the guest's FPU state, so 1614 * modifying xcr0 directly modifies the guest's xcr0, not the 1615 * host's. 1616 */ 1617 load_xcr(0, xcrval); 1618 return (HANDLED); 1619 } 1620 1621 static uint64_t 1622 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) 1623 { 1624 const struct vmxctx *vmxctx; 1625 1626 vmxctx = &vmx->ctx[vcpu]; 1627 1628 switch (ident) { 1629 case 0: 1630 return (vmxctx->guest_rax); 1631 case 1: 1632 return (vmxctx->guest_rcx); 1633 case 2: 1634 return (vmxctx->guest_rdx); 1635 case 3: 1636 return (vmxctx->guest_rbx); 1637 case 4: 1638 return (vmcs_read(VMCS_GUEST_RSP)); 1639 case 5: 1640 return (vmxctx->guest_rbp); 1641 case 6: 1642 return (vmxctx->guest_rsi); 1643 case 7: 1644 return (vmxctx->guest_rdi); 1645 case 8: 1646 return (vmxctx->guest_r8); 1647 case 9: 1648 return (vmxctx->guest_r9); 1649 case 10: 1650 return (vmxctx->guest_r10); 1651 case 11: 1652 return (vmxctx->guest_r11); 1653 case 12: 1654 return (vmxctx->guest_r12); 1655 case 13: 1656 return (vmxctx->guest_r13); 1657 case 14: 1658 return (vmxctx->guest_r14); 1659 case 15: 1660 return (vmxctx->guest_r15); 1661 default: 1662 panic("invalid vmx register %d", ident); 1663 } 1664 } 1665 1666 static void 1667 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) 1668 { 1669 struct vmxctx *vmxctx; 1670 1671 vmxctx = &vmx->ctx[vcpu]; 1672 1673 switch (ident) { 1674 case 0: 1675 vmxctx->guest_rax = regval; 1676 break; 1677 case 1: 1678 vmxctx->guest_rcx = regval; 1679 break; 1680 case 2: 1681 vmxctx->guest_rdx = regval; 1682 break; 1683 case 3: 1684 vmxctx->guest_rbx = regval; 1685 break; 1686 case 4: 1687 vmcs_write(VMCS_GUEST_RSP, regval); 1688 break; 1689 case 5: 1690 vmxctx->guest_rbp = regval; 1691 break; 1692 case 6: 1693 vmxctx->guest_rsi = regval; 1694 break; 1695 case 7: 1696 vmxctx->guest_rdi = regval; 1697 break; 1698 case 8: 1699 vmxctx->guest_r8 = regval; 1700 break; 1701 case 9: 1702 vmxctx->guest_r9 = regval; 1703 break; 1704 case 10: 1705 vmxctx->guest_r10 = regval; 1706 break; 1707 case 11: 1708 vmxctx->guest_r11 = regval; 1709 break; 1710 case 12: 1711 vmxctx->guest_r12 = regval; 1712 break; 1713 case 13: 1714 vmxctx->guest_r13 = regval; 1715 break; 1716 case 14: 1717 vmxctx->guest_r14 = regval; 1718 break; 1719 case 15: 1720 vmxctx->guest_r15 = regval; 1721 break; 1722 default: 1723 panic("invalid vmx register %d", ident); 1724 } 1725 } 1726 1727 static int 1728 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1729 { 1730 uint64_t crval, regval; 1731 1732 /* We only handle mov to %cr0 at this time */ 1733 if ((exitqual & 0xf0) != 0x00) 1734 return (UNHANDLED); 1735 1736 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1737 1738 vmcs_write(VMCS_CR0_SHADOW, regval); 1739 
1740 crval = regval | cr0_ones_mask; 1741 crval &= ~cr0_zeros_mask; 1742 vmcs_write(VMCS_GUEST_CR0, crval); 1743 1744 if (regval & CR0_PG) { 1745 uint64_t efer, entry_ctls; 1746 1747 /* 1748 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1749 * the "IA-32e mode guest" bit in VM-entry control must be 1750 * equal. 1751 */ 1752 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1753 if (efer & EFER_LME) { 1754 efer |= EFER_LMA; 1755 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1756 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1757 entry_ctls |= VM_ENTRY_GUEST_LMA; 1758 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1759 } 1760 } 1761 1762 return (HANDLED); 1763 } 1764 1765 static int 1766 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1767 { 1768 uint64_t crval, regval; 1769 1770 /* We only handle mov to %cr4 at this time */ 1771 if ((exitqual & 0xf0) != 0x00) 1772 return (UNHANDLED); 1773 1774 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1775 1776 vmcs_write(VMCS_CR4_SHADOW, regval); 1777 1778 crval = regval | cr4_ones_mask; 1779 crval &= ~cr4_zeros_mask; 1780 vmcs_write(VMCS_GUEST_CR4, crval); 1781 1782 return (HANDLED); 1783 } 1784 1785 static int 1786 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1787 { 1788 struct vlapic *vlapic; 1789 uint64_t cr8; 1790 int regnum; 1791 1792 /* We only handle mov %cr8 to/from a register at this time. */ 1793 if ((exitqual & 0xe0) != 0x00) { 1794 return (UNHANDLED); 1795 } 1796 1797 vlapic = vm_lapic(vmx->vm, vcpu); 1798 regnum = (exitqual >> 8) & 0xf; 1799 if (exitqual & 0x10) { 1800 cr8 = vlapic_get_cr8(vlapic); 1801 vmx_set_guest_reg(vmx, vcpu, regnum, cr8); 1802 } else { 1803 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); 1804 vlapic_set_cr8(vlapic, cr8); 1805 } 1806 1807 return (HANDLED); 1808 } 1809 1810 /* 1811 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL 1812 */ 1813 static int 1814 vmx_cpl(void) 1815 { 1816 uint32_t ssar; 1817 1818 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); 1819 return ((ssar >> 5) & 0x3); 1820 } 1821 1822 static enum vm_cpu_mode 1823 vmx_cpu_mode(void) 1824 { 1825 uint32_t csar; 1826 1827 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { 1828 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1829 if (csar & 0x2000) 1830 return (CPU_MODE_64BIT); /* CS.L = 1 */ 1831 else 1832 return (CPU_MODE_COMPATIBILITY); 1833 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { 1834 return (CPU_MODE_PROTECTED); 1835 } else { 1836 return (CPU_MODE_REAL); 1837 } 1838 } 1839 1840 static enum vm_paging_mode 1841 vmx_paging_mode(void) 1842 { 1843 1844 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1845 return (PAGING_MODE_FLAT); 1846 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) 1847 return (PAGING_MODE_32); 1848 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) 1849 return (PAGING_MODE_64); 1850 else 1851 return (PAGING_MODE_PAE); 1852 } 1853 1854 static uint64_t 1855 inout_str_index(struct vmx *vmx, int vcpuid, int in) 1856 { 1857 uint64_t val; 1858 int error; 1859 enum vm_reg_name reg; 1860 1861 reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 1862 error = vmx_getreg(vmx, vcpuid, reg, &val); 1863 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 1864 return (val); 1865 } 1866 1867 static uint64_t 1868 inout_str_count(struct vmx *vmx, int vcpuid, int rep) 1869 { 1870 uint64_t val; 1871 int error; 1872 1873 if (rep) { 1874 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); 1875 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 1876 } else { 1877 val = 1; 1878 } 1879 return (val); 1880 } 1881 1882 static int 1883 inout_str_addrsize(uint32_t inst_info) 1884 { 1885 uint32_t size; 1886 1887 size = (inst_info >> 7) & 0x7; 1888 switch (size) { 1889 case 0: 1890 return (2); /* 16 bit */ 1891 case 1: 1892 return (4); /* 32 bit */ 1893 case 2: 1894 return (8); /* 64 bit */ 1895 default: 1896 panic("%s: invalid size encoding %d", __func__, size); 1897 } 1898 } 1899 1900 static void 1901 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, 1902 struct vm_inout_str *vis) 1903 { 1904 int error, s; 1905 1906 if (in) { 1907 vis->seg_name = VM_REG_GUEST_ES; 1908 } else { 1909 s = (inst_info >> 15) & 0x7; 1910 vis->seg_name = vm_segment_name(s); 1911 } 1912 1913 error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); 1914 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 1915 } 1916 1917 static void 1918 vmx_paging_info(struct vm_guest_paging *paging) 1919 { 1920 paging->cr3 = vmcs_guest_cr3(); 1921 paging->cpl = vmx_cpl(); 1922 paging->cpu_mode = vmx_cpu_mode(); 1923 paging->paging_mode = vmx_paging_mode(); 1924 } 1925 1926 static void 1927 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 1928 { 1929 struct vm_guest_paging *paging; 1930 uint32_t csar; 1931 1932 paging = &vmexit->u.inst_emul.paging; 1933 1934 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1935 vmexit->inst_length = 0; 1936 vmexit->u.inst_emul.gpa = gpa; 1937 vmexit->u.inst_emul.gla = gla; 1938 vmx_paging_info(paging); 1939 switch (paging->cpu_mode) { 1940 case CPU_MODE_REAL: 1941 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1942 vmexit->u.inst_emul.cs_d = 0; 1943 break; 1944 case CPU_MODE_PROTECTED: 1945 case CPU_MODE_COMPATIBILITY: 1946 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1947 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1948 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 1949 break; 1950 default: 1951 vmexit->u.inst_emul.cs_base = 0; 1952 vmexit->u.inst_emul.cs_d = 0; 1953 break; 1954 } 1955 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 1956 } 1957 1958 static int 1959 ept_fault_type(uint64_t ept_qual) 1960 { 1961 int fault_type; 1962 1963 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1964 fault_type = VM_PROT_WRITE; 1965 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1966 fault_type = VM_PROT_EXECUTE; 1967 else 1968 fault_type= VM_PROT_READ; 1969 1970 return (fault_type); 1971 } 1972 1973 static boolean_t 1974 ept_emulation_fault(uint64_t ept_qual) 1975 { 1976 int read, write; 1977 1978 /* EPT fault on an instruction fetch doesn't make sense here */ 1979 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1980 return (FALSE); 1981 1982 /* EPT fault must be a read fault or a write fault */ 1983 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1984 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1985 if ((read | write) == 0) 1986 return (FALSE); 1987 1988 /* 1989 * The EPT violation must have been caused by accessing a 1990 * guest-physical address that is a translation of a guest-linear 1991 * address. 
1992 */ 1993 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1994 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1995 return (FALSE); 1996 } 1997 1998 return (TRUE); 1999 } 2000 2001 static __inline int 2002 apic_access_virtualization(struct vmx *vmx, int vcpuid) 2003 { 2004 uint32_t proc_ctls2; 2005 2006 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2007 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 2008 } 2009 2010 static __inline int 2011 x2apic_virtualization(struct vmx *vmx, int vcpuid) 2012 { 2013 uint32_t proc_ctls2; 2014 2015 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2016 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 2017 } 2018 2019 static int 2020 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 2021 uint64_t qual) 2022 { 2023 int error, handled, offset; 2024 uint32_t *apic_regs, vector; 2025 bool retu; 2026 2027 handled = HANDLED; 2028 offset = APIC_WRITE_OFFSET(qual); 2029 2030 if (!apic_access_virtualization(vmx, vcpuid)) { 2031 /* 2032 * In general there should not be any APIC write VM-exits 2033 * unless APIC-access virtualization is enabled. 2034 * 2035 * However self-IPI virtualization can legitimately trigger 2036 * an APIC-write VM-exit so treat it specially. 2037 */ 2038 if (x2apic_virtualization(vmx, vcpuid) && 2039 offset == APIC_OFFSET_SELF_IPI) { 2040 apic_regs = (uint32_t *)(vlapic->apic_page); 2041 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2042 vlapic_self_ipi_handler(vlapic, vector); 2043 return (HANDLED); 2044 } else 2045 return (UNHANDLED); 2046 } 2047 2048 switch (offset) { 2049 case APIC_OFFSET_ID: 2050 vlapic_id_write_handler(vlapic); 2051 break; 2052 case APIC_OFFSET_LDR: 2053 vlapic_ldr_write_handler(vlapic); 2054 break; 2055 case APIC_OFFSET_DFR: 2056 vlapic_dfr_write_handler(vlapic); 2057 break; 2058 case APIC_OFFSET_SVR: 2059 vlapic_svr_write_handler(vlapic); 2060 break; 2061 case APIC_OFFSET_ESR: 2062 vlapic_esr_write_handler(vlapic); 2063 break; 2064 case APIC_OFFSET_ICR_LOW: 2065 retu = false; 2066 error = vlapic_icrlo_write_handler(vlapic, &retu); 2067 if (error != 0 || retu) 2068 handled = UNHANDLED; 2069 break; 2070 case APIC_OFFSET_CMCI_LVT: 2071 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2072 vlapic_lvt_write_handler(vlapic, offset); 2073 break; 2074 case APIC_OFFSET_TIMER_ICR: 2075 vlapic_icrtmr_write_handler(vlapic); 2076 break; 2077 case APIC_OFFSET_TIMER_DCR: 2078 vlapic_dcr_write_handler(vlapic); 2079 break; 2080 default: 2081 handled = UNHANDLED; 2082 break; 2083 } 2084 return (handled); 2085 } 2086 2087 static bool 2088 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2089 { 2090 2091 if (apic_access_virtualization(vmx, vcpuid) && 2092 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2093 return (true); 2094 else 2095 return (false); 2096 } 2097 2098 static int 2099 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2100 { 2101 uint64_t qual; 2102 int access_type, offset, allowed; 2103 2104 if (!apic_access_virtualization(vmx, vcpuid)) 2105 return (UNHANDLED); 2106 2107 qual = vmexit->u.vmx.exit_qualification; 2108 access_type = APIC_ACCESS_TYPE(qual); 2109 offset = APIC_ACCESS_OFFSET(qual); 2110 2111 allowed = 0; 2112 if (access_type == 0) { 2113 /* 2114 * Read data access to the following registers is expected. 
2115 */ 2116 switch (offset) { 2117 case APIC_OFFSET_APR: 2118 case APIC_OFFSET_PPR: 2119 case APIC_OFFSET_RRR: 2120 case APIC_OFFSET_CMCI_LVT: 2121 case APIC_OFFSET_TIMER_CCR: 2122 allowed = 1; 2123 break; 2124 default: 2125 break; 2126 } 2127 } else if (access_type == 1) { 2128 /* 2129 * Write data access to the following registers is expected. 2130 */ 2131 switch (offset) { 2132 case APIC_OFFSET_VER: 2133 case APIC_OFFSET_APR: 2134 case APIC_OFFSET_PPR: 2135 case APIC_OFFSET_RRR: 2136 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2137 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2138 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2139 case APIC_OFFSET_CMCI_LVT: 2140 case APIC_OFFSET_TIMER_CCR: 2141 allowed = 1; 2142 break; 2143 default: 2144 break; 2145 } 2146 } 2147 2148 if (allowed) { 2149 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2150 VIE_INVALID_GLA); 2151 } 2152 2153 /* 2154 * Regardless of whether the APIC-access is allowed this handler 2155 * always returns UNHANDLED: 2156 * - if the access is allowed then it is handled by emulating the 2157 * instruction that caused the VM-exit (outside the critical section) 2158 * - if the access is not allowed then it will be converted to an 2159 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2160 */ 2161 return (UNHANDLED); 2162 } 2163 2164 static enum task_switch_reason 2165 vmx_task_switch_reason(uint64_t qual) 2166 { 2167 int reason; 2168 2169 reason = (qual >> 30) & 0x3; 2170 switch (reason) { 2171 case 0: 2172 return (TSR_CALL); 2173 case 1: 2174 return (TSR_IRET); 2175 case 2: 2176 return (TSR_JMP); 2177 case 3: 2178 return (TSR_IDT_GATE); 2179 default: 2180 panic("%s: invalid reason %d", __func__, reason); 2181 } 2182 } 2183 2184 static int 2185 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2186 { 2187 int error; 2188 2189 if (lapic_msr(num)) 2190 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2191 else 2192 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2193 2194 return (error); 2195 } 2196 2197 static int 2198 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2199 { 2200 struct vmxctx *vmxctx; 2201 uint64_t result; 2202 uint32_t eax, edx; 2203 int error; 2204 2205 if (lapic_msr(num)) 2206 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2207 else 2208 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2209 2210 if (error == 0) { 2211 eax = result; 2212 vmxctx = &vmx->ctx[vcpuid]; 2213 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2214 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2215 2216 edx = result >> 32; 2217 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2218 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2219 } 2220 2221 return (error); 2222 } 2223 2224 static int 2225 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2226 { 2227 int error, errcode, errcode_valid, handled, in; 2228 struct vmxctx *vmxctx; 2229 struct vlapic *vlapic; 2230 struct vm_inout_str *vis; 2231 struct vm_task_switch *ts; 2232 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2233 uint32_t intr_type, intr_vec, reason; 2234 uint64_t exitintinfo, qual, gpa; 2235 bool retu; 2236 2237 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2238 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2239 2240 handled = UNHANDLED; 2241 vmxctx = &vmx->ctx[vcpu]; 2242 2243 qual = vmexit->u.vmx.exit_qualification; 2244 reason = vmexit->u.vmx.exit_reason; 2245 
vmexit->exitcode = VM_EXITCODE_BOGUS; 2246 2247 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2248 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2249 2250 /* 2251 * VM-entry failures during or after loading guest state. 2252 * 2253 * These VM-exits are uncommon but must be handled specially 2254 * as most VM-exit fields are not populated as usual. 2255 */ 2256 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2257 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2258 __asm __volatile("int $18"); 2259 return (1); 2260 } 2261 2262 /* 2263 * VM exits that can be triggered during event delivery need to 2264 * be handled specially by re-injecting the event if the IDT 2265 * vectoring information field's valid bit is set. 2266 * 2267 * See "Information for VM Exits During Event Delivery" in Intel SDM 2268 * for details. 2269 */ 2270 idtvec_info = vmcs_idt_vectoring_info(); 2271 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2272 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2273 exitintinfo = idtvec_info; 2274 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2275 idtvec_err = vmcs_idt_vectoring_err(); 2276 exitintinfo |= (uint64_t)idtvec_err << 32; 2277 } 2278 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2279 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2280 __func__, error)); 2281 2282 /* 2283 * If 'virtual NMIs' are being used and the VM-exit 2284 * happened while injecting an NMI during the previous 2285 * VM-entry, then clear "blocking by NMI" in the 2286 * Guest Interruptibility-State so the NMI can be 2287 * reinjected on the subsequent VM-entry. 2288 * 2289 * However, if the NMI was being delivered through a task 2290 * gate, then the new task must start execution with NMIs 2291 * blocked so don't clear NMI blocking in this case. 2292 */ 2293 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2294 if (intr_type == VMCS_INTR_T_NMI) { 2295 if (reason != EXIT_REASON_TASK_SWITCH) 2296 vmx_clear_nmi_blocking(vmx, vcpu); 2297 else 2298 vmx_assert_nmi_blocking(vmx, vcpu); 2299 } 2300 2301 /* 2302 * Update VM-entry instruction length if the event being 2303 * delivered was a software interrupt or software exception. 2304 */ 2305 if (intr_type == VMCS_INTR_T_SWINTR || 2306 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2307 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2308 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2309 } 2310 } 2311 2312 switch (reason) { 2313 case EXIT_REASON_TASK_SWITCH: 2314 ts = &vmexit->u.task_switch; 2315 ts->tsssel = qual & 0xffff; 2316 ts->reason = vmx_task_switch_reason(qual); 2317 ts->ext = 0; 2318 ts->errcode_valid = 0; 2319 vmx_paging_info(&ts->paging); 2320 /* 2321 * If the task switch was due to a CALL, JMP, IRET, software 2322 * interrupt (INT n) or software exception (INT3, INTO), 2323 * then the saved %rip references the instruction that caused 2324 * the task switch. The instruction length field in the VMCS 2325 * is valid in this case. 2326 * 2327 * In all other cases (e.g., NMI, hardware exception) the 2328 * saved %rip is one that would have been saved in the old TSS 2329 * had the task switch completed normally so the instruction 2330 * length field is not needed in this case and is explicitly 2331 * set to 0. 
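* The TSR_IDT_GATE case below makes exactly this distinction: a gate
* reached via INT n, INT3 or INTO keeps the instruction length, while an
* external event (hardware interrupt, NMI or hardware exception) clears
* it and records the pending error code, if any.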
2332 */ 2333 if (ts->reason == TSR_IDT_GATE) { 2334 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2335 ("invalid idtvec_info %#x for IDT task switch", 2336 idtvec_info)); 2337 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2338 if (intr_type != VMCS_INTR_T_SWINTR && 2339 intr_type != VMCS_INTR_T_SWEXCEPTION && 2340 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2341 /* Task switch triggered by external event */ 2342 ts->ext = 1; 2343 vmexit->inst_length = 0; 2344 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2345 ts->errcode_valid = 1; 2346 ts->errcode = vmcs_idt_vectoring_err(); 2347 } 2348 } 2349 } 2350 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2351 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2352 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2353 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2354 ts->ext ? "external" : "internal", 2355 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2356 break; 2357 case EXIT_REASON_CR_ACCESS: 2358 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2359 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2360 switch (qual & 0xf) { 2361 case 0: 2362 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2363 break; 2364 case 4: 2365 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2366 break; 2367 case 8: 2368 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2369 break; 2370 } 2371 break; 2372 case EXIT_REASON_RDMSR: 2373 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2374 retu = false; 2375 ecx = vmxctx->guest_rcx; 2376 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2377 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2378 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2379 if (error) { 2380 vmexit->exitcode = VM_EXITCODE_RDMSR; 2381 vmexit->u.msr.code = ecx; 2382 } else if (!retu) { 2383 handled = HANDLED; 2384 } else { 2385 /* Return to userspace with a valid exitcode */ 2386 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2387 ("emulate_rdmsr retu with bogus exitcode")); 2388 } 2389 break; 2390 case EXIT_REASON_WRMSR: 2391 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2392 retu = false; 2393 eax = vmxctx->guest_rax; 2394 ecx = vmxctx->guest_rcx; 2395 edx = vmxctx->guest_rdx; 2396 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2397 ecx, (uint64_t)edx << 32 | eax); 2398 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2399 (uint64_t)edx << 32 | eax); 2400 error = emulate_wrmsr(vmx, vcpu, ecx, 2401 (uint64_t)edx << 32 | eax, &retu); 2402 if (error) { 2403 vmexit->exitcode = VM_EXITCODE_WRMSR; 2404 vmexit->u.msr.code = ecx; 2405 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2406 } else if (!retu) { 2407 handled = HANDLED; 2408 } else { 2409 /* Return to userspace with a valid exitcode */ 2410 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2411 ("emulate_wrmsr retu with bogus exitcode")); 2412 } 2413 break; 2414 case EXIT_REASON_HLT: 2415 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2416 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2417 vmexit->exitcode = VM_EXITCODE_HLT; 2418 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2419 if (virtual_interrupt_delivery) 2420 vmexit->u.hlt.intr_status = 2421 vmcs_read(VMCS_GUEST_INTR_STATUS); 2422 else 2423 vmexit->u.hlt.intr_status = 0; 2424 break; 2425 case EXIT_REASON_MTF: 2426 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2427 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2428 vmexit->exitcode = VM_EXITCODE_MTRAP; 2429 vmexit->inst_length = 0; 2430 break; 2431 case EXIT_REASON_PAUSE: 2432 vmm_stat_incr(vmx->vm, 
vcpu, VMEXIT_PAUSE, 1); 2433 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2434 vmexit->exitcode = VM_EXITCODE_PAUSE; 2435 break; 2436 case EXIT_REASON_INTR_WINDOW: 2437 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2438 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2439 vmx_clear_int_window_exiting(vmx, vcpu); 2440 return (1); 2441 case EXIT_REASON_EXT_INTR: 2442 /* 2443 * External interrupts serve only to cause VM exits and allow 2444 * the host interrupt handler to run. 2445 * 2446 * If this external interrupt triggers a virtual interrupt 2447 * to a VM, then that state will be recorded by the 2448 * host interrupt handler in the VM's softc. We will inject 2449 * this virtual interrupt during the subsequent VM enter. 2450 */ 2451 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2452 SDT_PROBE4(vmm, vmx, exit, interrupt, 2453 vmx, vcpu, vmexit, intr_info); 2454 2455 /* 2456 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2457 * This appears to be a bug in VMware Fusion? 2458 */ 2459 if (!(intr_info & VMCS_INTR_VALID)) 2460 return (1); 2461 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2462 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2463 ("VM exit interruption info invalid: %#x", intr_info)); 2464 vmx_trigger_hostintr(intr_info & 0xff); 2465 2466 /* 2467 * This is special. We want to treat this as an 'handled' 2468 * VM-exit but not increment the instruction pointer. 2469 */ 2470 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2471 return (1); 2472 case EXIT_REASON_NMI_WINDOW: 2473 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2474 /* Exit to allow the pending virtual NMI to be injected */ 2475 if (vm_nmi_pending(vmx->vm, vcpu)) 2476 vmx_inject_nmi(vmx, vcpu); 2477 vmx_clear_nmi_window_exiting(vmx, vcpu); 2478 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2479 return (1); 2480 case EXIT_REASON_INOUT: 2481 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2482 vmexit->exitcode = VM_EXITCODE_INOUT; 2483 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2484 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2485 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2486 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 2487 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2488 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2489 if (vmexit->u.inout.string) { 2490 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2491 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2492 vis = &vmexit->u.inout_str; 2493 vmx_paging_info(&vis->paging); 2494 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2495 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2496 vis->index = inout_str_index(vmx, vcpu, in); 2497 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2498 vis->addrsize = inout_str_addrsize(inst_info); 2499 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2500 } 2501 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2502 break; 2503 case EXIT_REASON_CPUID: 2504 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2505 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2506 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2507 break; 2508 case EXIT_REASON_EXCEPTION: 2509 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2510 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2511 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2512 ("VM exit interruption info invalid: %#x", intr_info)); 2513 2514 intr_vec = intr_info & 0xff; 2515 intr_type = intr_info & VMCS_INTR_T_MASK; 2516 2517 /* 2518 * If Virtual NMIs control is 1 and the VM-exit is due to a 2519 * fault encountered during the execution of IRET then we must 2520 * restore the state of "virtual-NMI blocking" before resuming 2521 * the guest. 2522 * 2523 * See "Resuming Guest Software after Handling an Exception". 2524 * See "Information for VM Exits Due to Vectored Events". 2525 */ 2526 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2527 (intr_vec != IDT_DF) && 2528 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2529 vmx_restore_nmi_blocking(vmx, vcpu); 2530 2531 /* 2532 * The NMI has already been handled in vmx_exit_handle_nmi(). 2533 */ 2534 if (intr_type == VMCS_INTR_T_NMI) 2535 return (1); 2536 2537 /* 2538 * Call the machine check handler by hand. Also don't reflect 2539 * the machine check back into the guest. 2540 */ 2541 if (intr_vec == IDT_MC) { 2542 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2543 __asm __volatile("int $18"); 2544 return (1); 2545 } 2546 2547 if (intr_vec == IDT_PF) { 2548 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2549 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2550 __func__, error)); 2551 } 2552 2553 /* 2554 * Software exceptions exhibit trap-like behavior. This in 2555 * turn requires populating the VM-entry instruction length 2556 * so that the %rip in the trap frame is past the INT3/INTO 2557 * instruction. 
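* When a software exception is injected on VM-entry the processor adds
* the VM-entry instruction length to the guest %rip while building the
* exception frame, so leaving the length at 0 would cause the guest
* handler to return to, and re-execute, the INT3/INTO.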
2558 */ 2559 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2560 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2561 2562 /* Reflect all other exceptions back into the guest */ 2563 errcode_valid = errcode = 0; 2564 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2565 errcode_valid = 1; 2566 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2567 } 2568 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2569 "the guest", intr_vec, errcode); 2570 SDT_PROBE5(vmm, vmx, exit, exception, 2571 vmx, vcpu, vmexit, intr_vec, errcode); 2572 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2573 errcode_valid, errcode, 0); 2574 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2575 __func__, error)); 2576 return (1); 2577 2578 case EXIT_REASON_EPT_FAULT: 2579 /* 2580 * If 'gpa' lies within the address space allocated to 2581 * memory then this must be a nested page fault otherwise 2582 * this must be an instruction that accesses MMIO space. 2583 */ 2584 gpa = vmcs_gpa(); 2585 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2586 apic_access_fault(vmx, vcpu, gpa)) { 2587 vmexit->exitcode = VM_EXITCODE_PAGING; 2588 vmexit->inst_length = 0; 2589 vmexit->u.paging.gpa = gpa; 2590 vmexit->u.paging.fault_type = ept_fault_type(qual); 2591 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2592 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2593 vmx, vcpu, vmexit, gpa, qual); 2594 } else if (ept_emulation_fault(qual)) { 2595 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2596 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2597 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2598 vmx, vcpu, vmexit, gpa); 2599 } 2600 /* 2601 * If Virtual NMIs control is 1 and the VM-exit is due to an 2602 * EPT fault during the execution of IRET then we must restore 2603 * the state of "virtual-NMI blocking" before resuming. 2604 * 2605 * See description of "NMI unblocking due to IRET" in 2606 * "Exit Qualification for EPT Violations". 2607 */ 2608 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2609 (qual & EXIT_QUAL_NMIUDTI) != 0) 2610 vmx_restore_nmi_blocking(vmx, vcpu); 2611 break; 2612 case EXIT_REASON_VIRTUALIZED_EOI: 2613 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2614 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2615 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2616 vmexit->inst_length = 0; /* trap-like */ 2617 break; 2618 case EXIT_REASON_APIC_ACCESS: 2619 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2620 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2621 break; 2622 case EXIT_REASON_APIC_WRITE: 2623 /* 2624 * APIC-write VM exit is trap-like so the %rip is already 2625 * pointing to the next instruction. 
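* Clearing 'inst_length' keeps the common handled-exit path at the end
* of this function from advancing %rip a second time.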
2626 */ 2627 vmexit->inst_length = 0; 2628 vlapic = vm_lapic(vmx->vm, vcpu); 2629 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2630 vmx, vcpu, vmexit, vlapic); 2631 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2632 break; 2633 case EXIT_REASON_XSETBV: 2634 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2635 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2636 break; 2637 case EXIT_REASON_MONITOR: 2638 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2639 vmexit->exitcode = VM_EXITCODE_MONITOR; 2640 break; 2641 case EXIT_REASON_MWAIT: 2642 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2643 vmexit->exitcode = VM_EXITCODE_MWAIT; 2644 break; 2645 case EXIT_REASON_VMCALL: 2646 case EXIT_REASON_VMCLEAR: 2647 case EXIT_REASON_VMLAUNCH: 2648 case EXIT_REASON_VMPTRLD: 2649 case EXIT_REASON_VMPTRST: 2650 case EXIT_REASON_VMREAD: 2651 case EXIT_REASON_VMRESUME: 2652 case EXIT_REASON_VMWRITE: 2653 case EXIT_REASON_VMXOFF: 2654 case EXIT_REASON_VMXON: 2655 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); 2656 vmexit->exitcode = VM_EXITCODE_VMINSN; 2657 break; 2658 default: 2659 SDT_PROBE4(vmm, vmx, exit, unknown, 2660 vmx, vcpu, vmexit, reason); 2661 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2662 break; 2663 } 2664 2665 if (handled) { 2666 /* 2667 * It is possible that control is returned to userland 2668 * even though we were able to handle the VM exit in the 2669 * kernel. 2670 * 2671 * In such a case we want to make sure that the userland 2672 * restarts guest execution at the instruction *after* 2673 * the one we just processed. Therefore we update the 2674 * guest rip in the VMCS and in 'vmexit'. 2675 */ 2676 vmexit->rip += vmexit->inst_length; 2677 vmexit->inst_length = 0; 2678 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2679 } else { 2680 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2681 /* 2682 * If this VM exit was not claimed by anybody then 2683 * treat it as a generic VMX exit. 2684 */ 2685 vmexit->exitcode = VM_EXITCODE_VMX; 2686 vmexit->u.vmx.status = VM_SUCCESS; 2687 vmexit->u.vmx.inst_type = 0; 2688 vmexit->u.vmx.inst_error = 0; 2689 } else { 2690 /* 2691 * The exitcode and collateral have been populated. 2692 * The VM exit will be processed further in userland. 2693 */ 2694 } 2695 } 2696 2697 SDT_PROBE4(vmm, vmx, exit, return, 2698 vmx, vcpu, vmexit, handled); 2699 return (handled); 2700 } 2701 2702 static __inline void 2703 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2704 { 2705 2706 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2707 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2708 vmxctx->inst_fail_status)); 2709 2710 vmexit->inst_length = 0; 2711 vmexit->exitcode = VM_EXITCODE_VMX; 2712 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2713 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2714 vmexit->u.vmx.exit_reason = ~0; 2715 vmexit->u.vmx.exit_qualification = ~0; 2716 2717 switch (rc) { 2718 case VMX_VMRESUME_ERROR: 2719 case VMX_VMLAUNCH_ERROR: 2720 case VMX_INVEPT_ERROR: 2721 vmexit->u.vmx.inst_type = rc; 2722 break; 2723 default: 2724 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2725 } 2726 } 2727 2728 /* 2729 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2730 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2731 * sufficient to simply vector to the NMI handler via a software interrupt. 
2732 * However, this must be done before maskable interrupts are enabled 2733 * otherwise the "iret" issued by an interrupt handler will incorrectly 2734 * clear NMI blocking. 2735 */ 2736 static __inline void 2737 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2738 { 2739 uint32_t intr_info; 2740 2741 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2742 2743 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2744 return; 2745 2746 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2747 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2748 ("VM exit interruption info invalid: %#x", intr_info)); 2749 2750 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2751 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2752 "to NMI has invalid vector: %#x", intr_info)); 2753 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2754 __asm __volatile("int $2"); 2755 } 2756 } 2757 2758 static __inline void 2759 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2760 { 2761 register_t rflags; 2762 2763 /* Save host control debug registers. */ 2764 vmxctx->host_dr7 = rdr7(); 2765 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2766 2767 /* 2768 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2769 * exceptions in the host based on the guest DRx values. The 2770 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2771 */ 2772 load_dr7(0); 2773 wrmsr(MSR_DEBUGCTLMSR, 0); 2774 2775 /* 2776 * Disable single stepping the kernel to avoid corrupting the 2777 * guest DR6. A debugger might still be able to corrupt the 2778 * guest DR6 by setting a breakpoint after this point and then 2779 * single stepping. 2780 */ 2781 rflags = read_rflags(); 2782 vmxctx->host_tf = rflags & PSL_T; 2783 write_rflags(rflags & ~PSL_T); 2784 2785 /* Save host debug registers. */ 2786 vmxctx->host_dr0 = rdr0(); 2787 vmxctx->host_dr1 = rdr1(); 2788 vmxctx->host_dr2 = rdr2(); 2789 vmxctx->host_dr3 = rdr3(); 2790 vmxctx->host_dr6 = rdr6(); 2791 2792 /* Restore guest debug registers. */ 2793 load_dr0(vmxctx->guest_dr0); 2794 load_dr1(vmxctx->guest_dr1); 2795 load_dr2(vmxctx->guest_dr2); 2796 load_dr3(vmxctx->guest_dr3); 2797 load_dr6(vmxctx->guest_dr6); 2798 } 2799 2800 static __inline void 2801 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2802 { 2803 2804 /* Save guest debug registers. */ 2805 vmxctx->guest_dr0 = rdr0(); 2806 vmxctx->guest_dr1 = rdr1(); 2807 vmxctx->guest_dr2 = rdr2(); 2808 vmxctx->guest_dr3 = rdr3(); 2809 vmxctx->guest_dr6 = rdr6(); 2810 2811 /* 2812 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2813 * PSL_T last. 
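* Restoring them last ensures that hardware breakpoints and single
* stepping are re-armed only after %dr0-%dr6 hold host values again, so
* no spurious debug exceptions trigger on stale guest contents.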
2814 */ 2815 load_dr0(vmxctx->host_dr0); 2816 load_dr1(vmxctx->host_dr1); 2817 load_dr2(vmxctx->host_dr2); 2818 load_dr3(vmxctx->host_dr3); 2819 load_dr6(vmxctx->host_dr6); 2820 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2821 load_dr7(vmxctx->host_dr7); 2822 write_rflags(read_rflags() | vmxctx->host_tf); 2823 } 2824 2825 static int 2826 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2827 struct vm_eventinfo *evinfo) 2828 { 2829 int rc, handled, launched; 2830 struct vmx *vmx; 2831 struct vm *vm; 2832 struct vmxctx *vmxctx; 2833 struct vmcs *vmcs; 2834 struct vm_exit *vmexit; 2835 struct vlapic *vlapic; 2836 uint32_t exit_reason; 2837 struct region_descriptor gdtr, idtr; 2838 uint16_t ldt_sel; 2839 2840 vmx = arg; 2841 vm = vmx->vm; 2842 vmcs = &vmx->vmcs[vcpu]; 2843 vmxctx = &vmx->ctx[vcpu]; 2844 vlapic = vm_lapic(vm, vcpu); 2845 vmexit = vm_exitinfo(vm, vcpu); 2846 launched = 0; 2847 2848 KASSERT(vmxctx->pmap == pmap, 2849 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2850 2851 vmx_msr_guest_enter(vmx, vcpu); 2852 2853 VMPTRLD(vmcs); 2854 2855 /* 2856 * XXX 2857 * We do this every time because we may setup the virtual machine 2858 * from a different process than the one that actually runs it. 2859 * 2860 * If the life of a virtual machine was spent entirely in the context 2861 * of a single process we could do this once in vmx_vminit(). 2862 */ 2863 vmcs_write(VMCS_HOST_CR3, rcr3()); 2864 2865 vmcs_write(VMCS_GUEST_RIP, rip); 2866 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2867 do { 2868 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 2869 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 2870 2871 handled = UNHANDLED; 2872 /* 2873 * Interrupts are disabled from this point on until the 2874 * guest starts executing. This is done for the following 2875 * reasons: 2876 * 2877 * If an AST is asserted on this thread after the check below, 2878 * then the IPI_AST notification will not be lost, because it 2879 * will cause a VM exit due to external interrupt as soon as 2880 * the guest state is loaded. 2881 * 2882 * A posted interrupt after 'vmx_inject_interrupts()' will 2883 * not be "lost" because it will be held pending in the host 2884 * APIC because interrupts are disabled. The pending interrupt 2885 * will be recognized as soon as the guest state is loaded. 2886 * 2887 * The same reasoning applies to the IPI generated by 2888 * pmap_invalidate_ept(). 2889 */ 2890 disable_intr(); 2891 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 2892 2893 /* 2894 * Check for vcpu suspension after injecting events because 2895 * vmx_inject_interrupts() can suspend the vcpu due to a 2896 * triple fault. 2897 */ 2898 if (vcpu_suspended(evinfo)) { 2899 enable_intr(); 2900 vm_exit_suspended(vmx->vm, vcpu, rip); 2901 break; 2902 } 2903 2904 if (vcpu_rendezvous_pending(evinfo)) { 2905 enable_intr(); 2906 vm_exit_rendezvous(vmx->vm, vcpu, rip); 2907 break; 2908 } 2909 2910 if (vcpu_reqidle(evinfo)) { 2911 enable_intr(); 2912 vm_exit_reqidle(vmx->vm, vcpu, rip); 2913 break; 2914 } 2915 2916 if (vcpu_should_yield(vm, vcpu)) { 2917 enable_intr(); 2918 vm_exit_astpending(vmx->vm, vcpu, rip); 2919 vmx_astpending_trace(vmx, vcpu, rip); 2920 handled = HANDLED; 2921 break; 2922 } 2923 2924 if (vcpu_debugged(vm, vcpu)) { 2925 enable_intr(); 2926 vm_exit_debug(vmx->vm, vcpu, rip); 2927 break; 2928 } 2929 2930 /* 2931 * VM exits restore the base address but not the 2932 * limits of GDTR and IDTR. The VMCS only stores the 2933 * base address, so VM exits set the limits to 0xffff. 
2934 * Save and restore the full GDTR and IDTR to restore 2935 * the limits. 2936 * 2937 * The VMCS does not save the LDTR at all, and VM 2938 * exits clear LDTR as if a NULL selector were loaded. 2939 * The userspace hypervisor probably doesn't use a 2940 * LDT, but save and restore it to be safe. 2941 */ 2942 sgdt(&gdtr); 2943 sidt(&idtr); 2944 ldt_sel = sldt(); 2945 2946 vmx_run_trace(vmx, vcpu); 2947 vmx_dr_enter_guest(vmxctx); 2948 rc = vmx_enter_guest(vmxctx, vmx, launched); 2949 vmx_dr_leave_guest(vmxctx); 2950 2951 bare_lgdt(&gdtr); 2952 lidt(&idtr); 2953 lldt(ldt_sel); 2954 2955 /* Collect some information for VM exit processing */ 2956 vmexit->rip = rip = vmcs_guest_rip(); 2957 vmexit->inst_length = vmexit_instruction_length(); 2958 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2959 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2960 2961 /* Update 'nextrip' */ 2962 vmx->state[vcpu].nextrip = rip; 2963 2964 if (rc == VMX_GUEST_VMEXIT) { 2965 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2966 enable_intr(); 2967 handled = vmx_exit_process(vmx, vcpu, vmexit); 2968 } else { 2969 enable_intr(); 2970 vmx_exit_inst_error(vmxctx, rc, vmexit); 2971 } 2972 launched = 1; 2973 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2974 rip = vmexit->rip; 2975 } while (handled); 2976 2977 /* 2978 * If a VM exit has been handled then the exitcode must be BOGUS 2979 * If a VM exit is not handled then the exitcode must not be BOGUS 2980 */ 2981 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2982 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2983 panic("Mismatch between handled (%d) and exitcode (%d)", 2984 handled, vmexit->exitcode); 2985 } 2986 2987 if (!handled) 2988 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 2989 2990 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 2991 vmexit->exitcode); 2992 2993 VMCLEAR(vmcs); 2994 vmx_msr_guest_exit(vmx, vcpu); 2995 2996 return (0); 2997 } 2998 2999 static void 3000 vmx_vmcleanup(void *arg) 3001 { 3002 int i; 3003 struct vmx *vmx = arg; 3004 3005 if (apic_access_virtualization(vmx, 0)) 3006 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3007 3008 for (i = 0; i < VM_MAXCPU; i++) 3009 vpid_free(vmx->state[i].vpid); 3010 3011 free(vmx, M_VMX); 3012 3013 return; 3014 } 3015 3016 static register_t * 3017 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 3018 { 3019 3020 switch (reg) { 3021 case VM_REG_GUEST_RAX: 3022 return (&vmxctx->guest_rax); 3023 case VM_REG_GUEST_RBX: 3024 return (&vmxctx->guest_rbx); 3025 case VM_REG_GUEST_RCX: 3026 return (&vmxctx->guest_rcx); 3027 case VM_REG_GUEST_RDX: 3028 return (&vmxctx->guest_rdx); 3029 case VM_REG_GUEST_RSI: 3030 return (&vmxctx->guest_rsi); 3031 case VM_REG_GUEST_RDI: 3032 return (&vmxctx->guest_rdi); 3033 case VM_REG_GUEST_RBP: 3034 return (&vmxctx->guest_rbp); 3035 case VM_REG_GUEST_R8: 3036 return (&vmxctx->guest_r8); 3037 case VM_REG_GUEST_R9: 3038 return (&vmxctx->guest_r9); 3039 case VM_REG_GUEST_R10: 3040 return (&vmxctx->guest_r10); 3041 case VM_REG_GUEST_R11: 3042 return (&vmxctx->guest_r11); 3043 case VM_REG_GUEST_R12: 3044 return (&vmxctx->guest_r12); 3045 case VM_REG_GUEST_R13: 3046 return (&vmxctx->guest_r13); 3047 case VM_REG_GUEST_R14: 3048 return (&vmxctx->guest_r14); 3049 case VM_REG_GUEST_R15: 3050 return (&vmxctx->guest_r15); 3051 case VM_REG_GUEST_CR2: 3052 return (&vmxctx->guest_cr2); 3053 case VM_REG_GUEST_DR0: 3054 return (&vmxctx->guest_dr0); 3055 case VM_REG_GUEST_DR1: 3056 return (&vmxctx->guest_dr1); 3057 case 
VM_REG_GUEST_DR2: 3058 return (&vmxctx->guest_dr2); 3059 case VM_REG_GUEST_DR3: 3060 return (&vmxctx->guest_dr3); 3061 case VM_REG_GUEST_DR6: 3062 return (&vmxctx->guest_dr6); 3063 default: 3064 break; 3065 } 3066 return (NULL); 3067 } 3068 3069 static int 3070 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 3071 { 3072 register_t *regp; 3073 3074 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3075 *retval = *regp; 3076 return (0); 3077 } else 3078 return (EINVAL); 3079 } 3080 3081 static int 3082 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3083 { 3084 register_t *regp; 3085 3086 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3087 *regp = val; 3088 return (0); 3089 } else 3090 return (EINVAL); 3091 } 3092 3093 static int 3094 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3095 { 3096 uint64_t gi; 3097 int error; 3098 3099 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3100 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3101 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3102 return (error); 3103 } 3104 3105 static int 3106 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3107 { 3108 struct vmcs *vmcs; 3109 uint64_t gi; 3110 int error, ident; 3111 3112 /* 3113 * Forcing the vcpu into an interrupt shadow is not supported. 3114 */ 3115 if (val) { 3116 error = EINVAL; 3117 goto done; 3118 } 3119 3120 vmcs = &vmx->vmcs[vcpu]; 3121 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3122 error = vmcs_getreg(vmcs, running, ident, &gi); 3123 if (error == 0) { 3124 gi &= ~HWINTR_BLOCKING; 3125 error = vmcs_setreg(vmcs, running, ident, gi); 3126 } 3127 done: 3128 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3129 error ? "failed" : "succeeded"); 3130 return (error); 3131 } 3132 3133 static int 3134 vmx_shadow_reg(int reg) 3135 { 3136 int shreg; 3137 3138 shreg = -1; 3139 3140 switch (reg) { 3141 case VM_REG_GUEST_CR0: 3142 shreg = VMCS_CR0_SHADOW; 3143 break; 3144 case VM_REG_GUEST_CR4: 3145 shreg = VMCS_CR4_SHADOW; 3146 break; 3147 default: 3148 break; 3149 } 3150 3151 return (shreg); 3152 } 3153 3154 static int 3155 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3156 { 3157 int running, hostcpu; 3158 struct vmx *vmx = arg; 3159 3160 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3161 if (running && hostcpu != curcpu) 3162 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3163 3164 if (reg == VM_REG_GUEST_INTR_SHADOW) 3165 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3166 3167 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3168 return (0); 3169 3170 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3171 } 3172 3173 static int 3174 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3175 { 3176 int error, hostcpu, running, shadow; 3177 uint64_t ctls; 3178 pmap_t pmap; 3179 struct vmx *vmx = arg; 3180 3181 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3182 if (running && hostcpu != curcpu) 3183 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3184 3185 if (reg == VM_REG_GUEST_INTR_SHADOW) 3186 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3187 3188 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3189 return (0); 3190 3191 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3192 3193 if (error == 0) { 3194 /* 3195 * If the "load EFER" VM-entry control is 1 then the 3196 * value of EFER.LMA must be identical to "IA-32e mode guest" 3197 * bit in the VM-entry control. 
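* Keep the two in sync whenever the guest EFER is written here; the same
* fixup is performed by vmx_emulate_cr0_access() when the guest enables
* CR0.PG with EFER.LME already set.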
3198 */ 3199 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3200 (reg == VM_REG_GUEST_EFER)) { 3201 vmcs_getreg(&vmx->vmcs[vcpu], running, 3202 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3203 if (val & EFER_LMA) 3204 ctls |= VM_ENTRY_GUEST_LMA; 3205 else 3206 ctls &= ~VM_ENTRY_GUEST_LMA; 3207 vmcs_setreg(&vmx->vmcs[vcpu], running, 3208 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3209 } 3210 3211 shadow = vmx_shadow_reg(reg); 3212 if (shadow > 0) { 3213 /* 3214 * Store the unmodified value in the shadow 3215 */ 3216 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3217 VMCS_IDENT(shadow), val); 3218 } 3219 3220 if (reg == VM_REG_GUEST_CR3) { 3221 /* 3222 * Invalidate the guest vcpu's TLB mappings to emulate 3223 * the behavior of updating %cr3. 3224 * 3225 * XXX the processor retains global mappings when %cr3 3226 * is updated but vmx_invvpid() does not. 3227 */ 3228 pmap = vmx->ctx[vcpu].pmap; 3229 vmx_invvpid(vmx, vcpu, pmap, running); 3230 } 3231 } 3232 3233 return (error); 3234 } 3235 3236 static int 3237 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3238 { 3239 int hostcpu, running; 3240 struct vmx *vmx = arg; 3241 3242 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3243 if (running && hostcpu != curcpu) 3244 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3245 3246 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3247 } 3248 3249 static int 3250 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3251 { 3252 int hostcpu, running; 3253 struct vmx *vmx = arg; 3254 3255 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3256 if (running && hostcpu != curcpu) 3257 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3258 3259 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3260 } 3261 3262 static int 3263 vmx_getcap(void *arg, int vcpu, int type, int *retval) 3264 { 3265 struct vmx *vmx = arg; 3266 int vcap; 3267 int ret; 3268 3269 ret = ENOENT; 3270 3271 vcap = vmx->cap[vcpu].set; 3272 3273 switch (type) { 3274 case VM_CAP_HALT_EXIT: 3275 if (cap_halt_exit) 3276 ret = 0; 3277 break; 3278 case VM_CAP_PAUSE_EXIT: 3279 if (cap_pause_exit) 3280 ret = 0; 3281 break; 3282 case VM_CAP_MTRAP_EXIT: 3283 if (cap_monitor_trap) 3284 ret = 0; 3285 break; 3286 case VM_CAP_UNRESTRICTED_GUEST: 3287 if (cap_unrestricted_guest) 3288 ret = 0; 3289 break; 3290 case VM_CAP_ENABLE_INVPCID: 3291 if (cap_invpcid) 3292 ret = 0; 3293 break; 3294 default: 3295 break; 3296 } 3297 3298 if (ret == 0) 3299 *retval = (vcap & (1 << type)) ? 
1 : 0; 3300 3301 return (ret); 3302 } 3303 3304 static int 3305 vmx_setcap(void *arg, int vcpu, int type, int val) 3306 { 3307 struct vmx *vmx = arg; 3308 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3309 uint32_t baseval; 3310 uint32_t *pptr; 3311 int error; 3312 int flag; 3313 int reg; 3314 int retval; 3315 3316 retval = ENOENT; 3317 pptr = NULL; 3318 3319 switch (type) { 3320 case VM_CAP_HALT_EXIT: 3321 if (cap_halt_exit) { 3322 retval = 0; 3323 pptr = &vmx->cap[vcpu].proc_ctls; 3324 baseval = *pptr; 3325 flag = PROCBASED_HLT_EXITING; 3326 reg = VMCS_PRI_PROC_BASED_CTLS; 3327 } 3328 break; 3329 case VM_CAP_MTRAP_EXIT: 3330 if (cap_monitor_trap) { 3331 retval = 0; 3332 pptr = &vmx->cap[vcpu].proc_ctls; 3333 baseval = *pptr; 3334 flag = PROCBASED_MTF; 3335 reg = VMCS_PRI_PROC_BASED_CTLS; 3336 } 3337 break; 3338 case VM_CAP_PAUSE_EXIT: 3339 if (cap_pause_exit) { 3340 retval = 0; 3341 pptr = &vmx->cap[vcpu].proc_ctls; 3342 baseval = *pptr; 3343 flag = PROCBASED_PAUSE_EXITING; 3344 reg = VMCS_PRI_PROC_BASED_CTLS; 3345 } 3346 break; 3347 case VM_CAP_UNRESTRICTED_GUEST: 3348 if (cap_unrestricted_guest) { 3349 retval = 0; 3350 pptr = &vmx->cap[vcpu].proc_ctls2; 3351 baseval = *pptr; 3352 flag = PROCBASED2_UNRESTRICTED_GUEST; 3353 reg = VMCS_SEC_PROC_BASED_CTLS; 3354 } 3355 break; 3356 case VM_CAP_ENABLE_INVPCID: 3357 if (cap_invpcid) { 3358 retval = 0; 3359 pptr = &vmx->cap[vcpu].proc_ctls2; 3360 baseval = *pptr; 3361 flag = PROCBASED2_ENABLE_INVPCID; 3362 reg = VMCS_SEC_PROC_BASED_CTLS; 3363 } 3364 break; 3365 default: 3366 break; 3367 } 3368 3369 if (retval == 0) { 3370 if (val) { 3371 baseval |= flag; 3372 } else { 3373 baseval &= ~flag; 3374 } 3375 VMPTRLD(vmcs); 3376 error = vmwrite(reg, baseval); 3377 VMCLEAR(vmcs); 3378 3379 if (error) { 3380 retval = error; 3381 } else { 3382 /* 3383 * Update optional stored flags, and record 3384 * setting 3385 */ 3386 if (pptr != NULL) { 3387 *pptr = baseval; 3388 } 3389 3390 if (val) { 3391 vmx->cap[vcpu].set |= (1 << type); 3392 } else { 3393 vmx->cap[vcpu].set &= ~(1 << type); 3394 } 3395 } 3396 } 3397 3398 return (retval); 3399 } 3400 3401 struct vlapic_vtx { 3402 struct vlapic vlapic; 3403 struct pir_desc *pir_desc; 3404 struct vmx *vmx; 3405 }; 3406 3407 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3408 do { \ 3409 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3410 level ? "level" : "edge", vector); \ 3411 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3412 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3413 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3414 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3415 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 3416 } while (0) 3417 3418 /* 3419 * vlapic->ops handlers that utilize the APICv hardware assist described in 3420 * Chapter 29 of the Intel SDM. 3421 */ 3422 static int 3423 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3424 { 3425 struct vlapic_vtx *vlapic_vtx; 3426 struct pir_desc *pir_desc; 3427 uint64_t mask; 3428 int idx, notify; 3429 3430 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3431 pir_desc = vlapic_vtx->pir_desc; 3432 3433 /* 3434 * Keep track of interrupt requests in the PIR descriptor. This is 3435 * because the virtual APIC page pointed to by the VMCS cannot be 3436 * modified if the vcpu is running. 
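* The protocol is: set the vector's bit in the PIR (e.g. vector 0x41
* sets bit 1 of pir[1]) and then set the 'pending' flag. The cmpset on
* 'pending' succeeds only for the first caller, so the return value
* tells the caller whether a posted-interrupt notification still needs
* to be sent to the target vcpu; later interrupts piggyback on the
* notification already in flight.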
3437 */ 3438 idx = vector / 64; 3439 mask = 1UL << (vector % 64); 3440 atomic_set_long(&pir_desc->pir[idx], mask); 3441 notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); 3442 3443 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3444 level, "vmx_set_intr_ready"); 3445 return (notify); 3446 } 3447 3448 static int 3449 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3450 { 3451 struct vlapic_vtx *vlapic_vtx; 3452 struct pir_desc *pir_desc; 3453 struct LAPIC *lapic; 3454 uint64_t pending, pirval; 3455 uint32_t ppr, vpr; 3456 int i; 3457 3458 /* 3459 * This function is only expected to be called from the 'HLT' exit 3460 * handler which does not care about the vector that is pending. 3461 */ 3462 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3463 3464 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3465 pir_desc = vlapic_vtx->pir_desc; 3466 3467 pending = atomic_load_acq_long(&pir_desc->pending); 3468 if (!pending) { 3469 /* 3470 * While a virtual interrupt may have already been 3471 * processed the actual delivery maybe pending the 3472 * interruptibility of the guest. Recognize a pending 3473 * interrupt by reevaluating virtual interrupts 3474 * following Section 29.2.1 in the Intel SDM Volume 3. 3475 */ 3476 struct vm_exit *vmexit; 3477 uint8_t rvi, ppr; 3478 3479 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3480 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3481 ("vmx_pending_intr: exitcode not 'HLT'")); 3482 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3483 lapic = vlapic->apic_page; 3484 ppr = lapic->ppr & APIC_TPR_INT; 3485 if (rvi > ppr) { 3486 return (1); 3487 } 3488 3489 return (0); 3490 } 3491 3492 /* 3493 * If there is an interrupt pending then it will be recognized only 3494 * if its priority is greater than the processor priority. 3495 * 3496 * Special case: if the processor priority is zero then any pending 3497 * interrupt will be recognized. 
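* Otherwise the PIR is scanned from the highest 64-bit word down: the
* most significant set bit gives the highest pending vector, and its
* priority class (vector & 0xf0) must exceed the PPR class for the
* interrupt to be recognized.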
3498 */ 3499 lapic = vlapic->apic_page; 3500 ppr = lapic->ppr & APIC_TPR_INT; 3501 if (ppr == 0) 3502 return (1); 3503 3504 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3505 lapic->ppr); 3506 3507 for (i = 3; i >= 0; i--) { 3508 pirval = pir_desc->pir[i]; 3509 if (pirval != 0) { 3510 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3511 return (vpr > ppr); 3512 } 3513 } 3514 return (0); 3515 } 3516 3517 static void 3518 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3519 { 3520 3521 panic("vmx_intr_accepted: not expected to be called"); 3522 } 3523 3524 static void 3525 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3526 { 3527 struct vlapic_vtx *vlapic_vtx; 3528 struct vmx *vmx; 3529 struct vmcs *vmcs; 3530 uint64_t mask, val; 3531 3532 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3533 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3534 ("vmx_set_tmr: vcpu cannot be running")); 3535 3536 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3537 vmx = vlapic_vtx->vmx; 3538 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3539 mask = 1UL << (vector % 64); 3540 3541 VMPTRLD(vmcs); 3542 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3543 if (level) 3544 val |= mask; 3545 else 3546 val &= ~mask; 3547 vmcs_write(VMCS_EOI_EXIT(vector), val); 3548 VMCLEAR(vmcs); 3549 } 3550 3551 static void 3552 vmx_enable_x2apic_mode(struct vlapic *vlapic) 3553 { 3554 struct vmx *vmx; 3555 struct vmcs *vmcs; 3556 uint32_t proc_ctls2; 3557 int vcpuid, error; 3558 3559 vcpuid = vlapic->vcpuid; 3560 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3561 vmcs = &vmx->vmcs[vcpuid]; 3562 3563 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 3564 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3565 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3566 3567 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3568 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3569 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 3570 3571 VMPTRLD(vmcs); 3572 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3573 VMCLEAR(vmcs); 3574 3575 if (vlapic->vcpuid == 0) { 3576 /* 3577 * The nested page table mappings are shared by all vcpus 3578 * so unmap the APIC access page just once. 3579 */ 3580 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3581 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3582 __func__, error)); 3583 3584 /* 3585 * The MSR bitmap is shared by all vcpus so modify it only 3586 * once in the context of vcpu 0. 3587 */ 3588 error = vmx_allow_x2apic_msrs(vmx); 3589 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3590 __func__, error)); 3591 } 3592 } 3593 3594 static void 3595 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3596 { 3597 3598 ipi_cpu(hostcpu, pirvec); 3599 } 3600 3601 /* 3602 * Transfer the pending interrupts in the PIR descriptor to the IRR 3603 * in the virtual APIC page. 
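* The posted-interrupt hardware performs this transfer only while the
* vcpu is running in guest mode, so interrupts posted while the vcpu
* was outside the guest are folded into the vIRR (and RVI updated)
* here, before the next VM-entry.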
3604 */ 3605 static void 3606 vmx_inject_pir(struct vlapic *vlapic) 3607 { 3608 struct vlapic_vtx *vlapic_vtx; 3609 struct pir_desc *pir_desc; 3610 struct LAPIC *lapic; 3611 uint64_t val, pirval; 3612 int rvi, pirbase = -1; 3613 uint16_t intr_status_old, intr_status_new; 3614 3615 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3616 pir_desc = vlapic_vtx->pir_desc; 3617 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3618 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3619 "no posted interrupt pending"); 3620 return; 3621 } 3622 3623 pirval = 0; 3624 pirbase = -1; 3625 lapic = vlapic->apic_page; 3626 3627 val = atomic_readandclear_long(&pir_desc->pir[0]); 3628 if (val != 0) { 3629 lapic->irr0 |= val; 3630 lapic->irr1 |= val >> 32; 3631 pirbase = 0; 3632 pirval = val; 3633 } 3634 3635 val = atomic_readandclear_long(&pir_desc->pir[1]); 3636 if (val != 0) { 3637 lapic->irr2 |= val; 3638 lapic->irr3 |= val >> 32; 3639 pirbase = 64; 3640 pirval = val; 3641 } 3642 3643 val = atomic_readandclear_long(&pir_desc->pir[2]); 3644 if (val != 0) { 3645 lapic->irr4 |= val; 3646 lapic->irr5 |= val >> 32; 3647 pirbase = 128; 3648 pirval = val; 3649 } 3650 3651 val = atomic_readandclear_long(&pir_desc->pir[3]); 3652 if (val != 0) { 3653 lapic->irr6 |= val; 3654 lapic->irr7 |= val >> 32; 3655 pirbase = 192; 3656 pirval = val; 3657 } 3658 3659 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 3660 3661 /* 3662 * Update RVI so the processor can evaluate pending virtual 3663 * interrupts on VM-entry. 3664 * 3665 * It is possible for pirval to be 0 here, even though the 3666 * pending bit has been set. The scenario is: 3667 * CPU-Y is sending a posted interrupt to CPU-X, which 3668 * is running a guest and processing posted interrupts in h/w. 3669 * CPU-X will eventually exit and the state seen in s/w is 3670 * the pending bit set, but no PIR bits set. 
3671 * 3672 * CPU-X CPU-Y 3673 * (vm running) (host running) 3674 * rx posted interrupt 3675 * CLEAR pending bit 3676 * SET PIR bit 3677 * READ/CLEAR PIR bits 3678 * SET pending bit 3679 * (vm exit) 3680 * pending bit set, PIR 0 3681 */ 3682 if (pirval != 0) { 3683 rvi = pirbase + flsl(pirval) - 1; 3684 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 3685 intr_status_new = (intr_status_old & 0xFF00) | rvi; 3686 if (intr_status_new > intr_status_old) { 3687 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 3688 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3689 "guest_intr_status changed from 0x%04x to 0x%04x", 3690 intr_status_old, intr_status_new); 3691 } 3692 } 3693 } 3694 3695 static struct vlapic * 3696 vmx_vlapic_init(void *arg, int vcpuid) 3697 { 3698 struct vmx *vmx; 3699 struct vlapic *vlapic; 3700 struct vlapic_vtx *vlapic_vtx; 3701 3702 vmx = arg; 3703 3704 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 3705 vlapic->vm = vmx->vm; 3706 vlapic->vcpuid = vcpuid; 3707 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 3708 3709 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3710 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 3711 vlapic_vtx->vmx = vmx; 3712 3713 if (virtual_interrupt_delivery) { 3714 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 3715 vlapic->ops.pending_intr = vmx_pending_intr; 3716 vlapic->ops.intr_accepted = vmx_intr_accepted; 3717 vlapic->ops.set_tmr = vmx_set_tmr; 3718 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 3719 } 3720 3721 if (posted_interrupts) 3722 vlapic->ops.post_intr = vmx_post_intr; 3723 3724 vlapic_init(vlapic); 3725 3726 return (vlapic); 3727 } 3728 3729 static void 3730 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 3731 { 3732 3733 vlapic_cleanup(vlapic); 3734 free(vlapic, M_VLAPIC); 3735 } 3736 3737 struct vmm_ops vmm_ops_intel = { 3738 vmx_init, 3739 vmx_cleanup, 3740 vmx_restore, 3741 vmx_vminit, 3742 vmx_run, 3743 vmx_vmcleanup, 3744 vmx_getreg, 3745 vmx_setreg, 3746 vmx_getdesc, 3747 vmx_setdesc, 3748 vmx_getcap, 3749 vmx_setcap, 3750 ept_vmspace_alloc, 3751 ept_vmspace_free, 3752 vmx_vlapic_init, 3753 vmx_vlapic_cleanup, 3754 }; 3755