1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * Copyright (c) 2018 Joyent, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD$ 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/smp.h> 38 #include <sys/kernel.h> 39 #include <sys/malloc.h> 40 #include <sys/pcpu.h> 41 #include <sys/proc.h> 42 #include <sys/sysctl.h> 43 44 #include <vm/vm.h> 45 #include <vm/pmap.h> 46 47 #include <machine/psl.h> 48 #include <machine/cpufunc.h> 49 #include <machine/md_var.h> 50 #include <machine/reg.h> 51 #include <machine/segments.h> 52 #include <machine/smp.h> 53 #include <machine/specialreg.h> 54 #include <machine/vmparam.h> 55 56 #include <machine/vmm.h> 57 #include <machine/vmm_dev.h> 58 #include <machine/vmm_instruction_emul.h> 59 #include "vmm_lapic.h" 60 #include "vmm_host.h" 61 #include "vmm_ioport.h" 62 #include "vmm_ktr.h" 63 #include "vmm_stat.h" 64 #include "vatpic.h" 65 #include "vlapic.h" 66 #include "vlapic_priv.h" 67 68 #include "ept.h" 69 #include "vmx_cpufunc.h" 70 #include "vmx.h" 71 #include "vmx_msr.h" 72 #include "x86.h" 73 #include "vmx_controls.h" 74 75 #define PINBASED_CTLS_ONE_SETTING \ 76 (PINBASED_EXTINT_EXITING | \ 77 PINBASED_NMI_EXITING | \ 78 PINBASED_VIRTUAL_NMI) 79 #define PINBASED_CTLS_ZERO_SETTING 0 80 81 #define PROCBASED_CTLS_WINDOW_SETTING \ 82 (PROCBASED_INT_WINDOW_EXITING | \ 83 PROCBASED_NMI_WINDOW_EXITING) 84 85 #define PROCBASED_CTLS_ONE_SETTING \ 86 (PROCBASED_SECONDARY_CONTROLS | \ 87 PROCBASED_MWAIT_EXITING | \ 88 PROCBASED_MONITOR_EXITING | \ 89 PROCBASED_IO_EXITING | \ 90 PROCBASED_MSR_BITMAPS | \ 91 PROCBASED_CTLS_WINDOW_SETTING | \ 92 PROCBASED_CR8_LOAD_EXITING | \ 93 PROCBASED_CR8_STORE_EXITING) 94 #define PROCBASED_CTLS_ZERO_SETTING \ 95 (PROCBASED_CR3_LOAD_EXITING | \ 96 PROCBASED_CR3_STORE_EXITING | \ 97 PROCBASED_IO_BITMAPS) 98 99 #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 100 #define PROCBASED_CTLS2_ZERO_SETTING 0 101 102 #define VM_EXIT_CTLS_ONE_SETTING \ 103 (VM_EXIT_SAVE_DEBUG_CONTROLS | \ 104 VM_EXIT_HOST_LMA | \ 105 VM_EXIT_SAVE_EFER | \ 106 VM_EXIT_LOAD_EFER | \ 107 VM_EXIT_ACKNOWLEDGE_INTERRUPT) 108 109 #define 
VM_EXIT_CTLS_ZERO_SETTING 0 110 111 #define VM_ENTRY_CTLS_ONE_SETTING \ 112 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 113 VM_ENTRY_LOAD_EFER) 114 115 #define VM_ENTRY_CTLS_ZERO_SETTING \ 116 (VM_ENTRY_INTO_SMM | \ 117 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 118 119 #define HANDLED 1 120 #define UNHANDLED 0 121 122 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 123 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); 124 125 SYSCTL_DECL(_hw_vmm); 126 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 127 128 int vmxon_enabled[MAXCPU]; 129 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 130 131 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 132 static uint32_t exit_ctls, entry_ctls; 133 134 static uint64_t cr0_ones_mask, cr0_zeros_mask; 135 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 136 &cr0_ones_mask, 0, NULL); 137 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 138 &cr0_zeros_mask, 0, NULL); 139 140 static uint64_t cr4_ones_mask, cr4_zeros_mask; 141 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 142 &cr4_ones_mask, 0, NULL); 143 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 144 &cr4_zeros_mask, 0, NULL); 145 146 static int vmx_initialized; 147 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 148 &vmx_initialized, 0, "Intel VMX initialized"); 149 150 /* 151 * Optional capabilities 152 */ 153 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); 154 155 static int cap_halt_exit; 156 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, 157 "HLT triggers a VM-exit"); 158 159 static int cap_pause_exit; 160 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 161 0, "PAUSE triggers a VM-exit"); 162 163 static int cap_unrestricted_guest; 164 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, 165 &cap_unrestricted_guest, 0, "Unrestricted guests"); 166 167 static int cap_monitor_trap; 168 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, 169 &cap_monitor_trap, 0, "Monitor trap flag"); 170 171 static int cap_invpcid; 172 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 173 0, "Guests are allowed to use INVPCID"); 174 175 static int virtual_interrupt_delivery; 176 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, 177 &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); 178 179 static int posted_interrupts; 180 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, 181 &posted_interrupts, 0, "APICv posted interrupt support"); 182 183 static int pirvec = -1; 184 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, 185 &pirvec, 0, "APICv posted interrupt vector"); 186 187 static struct unrhdr *vpid_unr; 188 static u_int vpid_alloc_failed; 189 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 190 &vpid_alloc_failed, 0, NULL); 191 192 int guest_l1d_flush; 193 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, 194 &guest_l1d_flush, 0, NULL); 195 int guest_l1d_flush_sw; 196 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, 197 &guest_l1d_flush_sw, 0, NULL); 198 199 static struct msr_entry msr_load_list[1] __aligned(16); 200 201 /* 202 * The definitions of SDT probes for VMX. 
203 */ 204 205 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, 206 "struct vmx *", "int", "struct vm_exit *"); 207 208 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, 209 "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); 210 211 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, 212 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 213 214 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, 215 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 216 217 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, 218 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); 219 220 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, 221 "struct vmx *", "int", "struct vm_exit *"); 222 223 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, 224 "struct vmx *", "int", "struct vm_exit *"); 225 226 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, 227 "struct vmx *", "int", "struct vm_exit *"); 228 229 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, 230 "struct vmx *", "int", "struct vm_exit *"); 231 232 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, 233 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 234 235 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, 236 "struct vmx *", "int", "struct vm_exit *"); 237 238 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, 239 "struct vmx *", "int", "struct vm_exit *"); 240 241 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, 242 "struct vmx *", "int", "struct vm_exit *"); 243 244 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, 245 "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); 246 247 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, 248 "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); 249 250 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, 251 "struct vmx *", "int", "struct vm_exit *", "uint64_t"); 252 253 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, 254 "struct vmx *", "int", "struct vm_exit *"); 255 256 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, 257 "struct vmx *", "int", "struct vm_exit *"); 258 259 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, 260 "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); 261 262 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, 263 "struct vmx *", "int", "struct vm_exit *"); 264 265 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, 266 "struct vmx *", "int", "struct vm_exit *"); 267 268 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, 269 "struct vmx *", "int", "struct vm_exit *"); 270 271 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, 272 "struct vmx *", "int", "struct vm_exit *"); 273 274 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, 275 "struct vmx *", "int", "struct vm_exit *", "uint32_t"); 276 277 SDT_PROBE_DEFINE4(vmm, vmx, exit, return, 278 "struct vmx *", "int", "struct vm_exit *", "int"); 279 280 /* 281 * Use the last page below 4GB as the APIC access address. This address is 282 * occupied by the boot firmware so it is guaranteed that it will not conflict 283 * with a page in system memory. 
284 */ 285 #define APIC_ACCESS_ADDRESS 0xFFFFF000 286 287 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); 288 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); 289 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); 290 static void vmx_inject_pir(struct vlapic *vlapic); 291 292 #ifdef KTR 293 static const char * 294 exit_reason_to_str(int reason) 295 { 296 static char reasonbuf[32]; 297 298 switch (reason) { 299 case EXIT_REASON_EXCEPTION: 300 return "exception"; 301 case EXIT_REASON_EXT_INTR: 302 return "extint"; 303 case EXIT_REASON_TRIPLE_FAULT: 304 return "triplefault"; 305 case EXIT_REASON_INIT: 306 return "init"; 307 case EXIT_REASON_SIPI: 308 return "sipi"; 309 case EXIT_REASON_IO_SMI: 310 return "iosmi"; 311 case EXIT_REASON_SMI: 312 return "smi"; 313 case EXIT_REASON_INTR_WINDOW: 314 return "intrwindow"; 315 case EXIT_REASON_NMI_WINDOW: 316 return "nmiwindow"; 317 case EXIT_REASON_TASK_SWITCH: 318 return "taskswitch"; 319 case EXIT_REASON_CPUID: 320 return "cpuid"; 321 case EXIT_REASON_GETSEC: 322 return "getsec"; 323 case EXIT_REASON_HLT: 324 return "hlt"; 325 case EXIT_REASON_INVD: 326 return "invd"; 327 case EXIT_REASON_INVLPG: 328 return "invlpg"; 329 case EXIT_REASON_RDPMC: 330 return "rdpmc"; 331 case EXIT_REASON_RDTSC: 332 return "rdtsc"; 333 case EXIT_REASON_RSM: 334 return "rsm"; 335 case EXIT_REASON_VMCALL: 336 return "vmcall"; 337 case EXIT_REASON_VMCLEAR: 338 return "vmclear"; 339 case EXIT_REASON_VMLAUNCH: 340 return "vmlaunch"; 341 case EXIT_REASON_VMPTRLD: 342 return "vmptrld"; 343 case EXIT_REASON_VMPTRST: 344 return "vmptrst"; 345 case EXIT_REASON_VMREAD: 346 return "vmread"; 347 case EXIT_REASON_VMRESUME: 348 return "vmresume"; 349 case EXIT_REASON_VMWRITE: 350 return "vmwrite"; 351 case EXIT_REASON_VMXOFF: 352 return "vmxoff"; 353 case EXIT_REASON_VMXON: 354 return "vmxon"; 355 case EXIT_REASON_CR_ACCESS: 356 return "craccess"; 357 case EXIT_REASON_DR_ACCESS: 358 return "draccess"; 359 case EXIT_REASON_INOUT: 360 return "inout"; 361 case EXIT_REASON_RDMSR: 362 return "rdmsr"; 363 case EXIT_REASON_WRMSR: 364 return "wrmsr"; 365 case EXIT_REASON_INVAL_VMCS: 366 return "invalvmcs"; 367 case EXIT_REASON_INVAL_MSR: 368 return "invalmsr"; 369 case EXIT_REASON_MWAIT: 370 return "mwait"; 371 case EXIT_REASON_MTF: 372 return "mtf"; 373 case EXIT_REASON_MONITOR: 374 return "monitor"; 375 case EXIT_REASON_PAUSE: 376 return "pause"; 377 case EXIT_REASON_MCE_DURING_ENTRY: 378 return "mce-during-entry"; 379 case EXIT_REASON_TPR: 380 return "tpr"; 381 case EXIT_REASON_APIC_ACCESS: 382 return "apic-access"; 383 case EXIT_REASON_GDTR_IDTR: 384 return "gdtridtr"; 385 case EXIT_REASON_LDTR_TR: 386 return "ldtrtr"; 387 case EXIT_REASON_EPT_FAULT: 388 return "eptfault"; 389 case EXIT_REASON_EPT_MISCONFIG: 390 return "eptmisconfig"; 391 case EXIT_REASON_INVEPT: 392 return "invept"; 393 case EXIT_REASON_RDTSCP: 394 return "rdtscp"; 395 case EXIT_REASON_VMX_PREEMPT: 396 return "vmxpreempt"; 397 case EXIT_REASON_INVVPID: 398 return "invvpid"; 399 case EXIT_REASON_WBINVD: 400 return "wbinvd"; 401 case EXIT_REASON_XSETBV: 402 return "xsetbv"; 403 case EXIT_REASON_APIC_WRITE: 404 return "apic-write"; 405 default: 406 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 407 return (reasonbuf); 408 } 409 } 410 #endif /* KTR */ 411 412 static int 413 vmx_allow_x2apic_msrs(struct vmx *vmx) 414 { 415 int i, error; 416 417 error = 0; 418 419 /* 420 * Allow readonly access to the following x2APIC MSRs from the guest. 
421 */ 422 error += guest_msr_ro(vmx, MSR_APIC_ID); 423 error += guest_msr_ro(vmx, MSR_APIC_VERSION); 424 error += guest_msr_ro(vmx, MSR_APIC_LDR); 425 error += guest_msr_ro(vmx, MSR_APIC_SVR); 426 427 for (i = 0; i < 8; i++) 428 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); 429 430 for (i = 0; i < 8; i++) 431 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); 432 433 for (i = 0; i < 8; i++) 434 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); 435 436 error += guest_msr_ro(vmx, MSR_APIC_ESR); 437 error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); 438 error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); 439 error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); 440 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); 441 error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); 442 error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); 443 error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); 444 error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); 445 error += guest_msr_ro(vmx, MSR_APIC_ICR); 446 447 /* 448 * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. 449 * 450 * These registers get special treatment described in the section 451 * "Virtualizing MSR-Based APIC Accesses". 452 */ 453 error += guest_msr_rw(vmx, MSR_APIC_TPR); 454 error += guest_msr_rw(vmx, MSR_APIC_EOI); 455 error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); 456 457 return (error); 458 } 459 460 u_long 461 vmx_fix_cr0(u_long cr0) 462 { 463 464 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 465 } 466 467 u_long 468 vmx_fix_cr4(u_long cr4) 469 { 470 471 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 472 } 473 474 static void 475 vpid_free(int vpid) 476 { 477 if (vpid < 0 || vpid > 0xffff) 478 panic("vpid_free: invalid vpid %d", vpid); 479 480 /* 481 * VPIDs [0,VM_MAXCPU] are special and are not allocated from 482 * the unit number allocator. 483 */ 484 485 if (vpid > VM_MAXCPU) 486 free_unr(vpid_unr, vpid); 487 } 488 489 static void 490 vpid_alloc(uint16_t *vpid, int num) 491 { 492 int i, x; 493 494 if (num <= 0 || num > VM_MAXCPU) 495 panic("invalid number of vpids requested: %d", num); 496 497 /* 498 * If the "enable vpid" execution control is not enabled then the 499 * VPID is required to be 0 for all vcpus. 500 */ 501 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { 502 for (i = 0; i < num; i++) 503 vpid[i] = 0; 504 return; 505 } 506 507 /* 508 * Allocate a unique VPID for each vcpu from the unit number allocator. 509 */ 510 for (i = 0; i < num; i++) { 511 x = alloc_unr(vpid_unr); 512 if (x == -1) 513 break; 514 else 515 vpid[i] = x; 516 } 517 518 if (i < num) { 519 atomic_add_int(&vpid_alloc_failed, 1); 520 521 /* 522 * If the unit number allocator does not have enough unique 523 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. 524 * 525 * These VPIDs are not be unique across VMs but this does not 526 * affect correctness because the combined mappings are also 527 * tagged with the EP4TA which is unique for each VM. 528 * 529 * It is still sub-optimal because the invvpid will invalidate 530 * combined mappings for a particular VPID across all EP4TAs. 531 */ 532 while (i-- > 0) 533 vpid_free(vpid[i]); 534 535 for (i = 0; i < num; i++) 536 vpid[i] = i + 1; 537 } 538 } 539 540 static void 541 vpid_init(void) 542 { 543 /* 544 * VPID 0 is required when the "enable VPID" execution control is 545 * disabled. 546 * 547 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the 548 * unit number allocator does not have sufficient unique VPIDs to 549 * satisfy the allocation. 
550 * 551 * The remaining VPIDs are managed by the unit number allocator. 552 */ 553 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 554 } 555 556 static void 557 vmx_disable(void *arg __unused) 558 { 559 struct invvpid_desc invvpid_desc = { 0 }; 560 struct invept_desc invept_desc = { 0 }; 561 562 if (vmxon_enabled[curcpu]) { 563 /* 564 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 565 * 566 * VMXON or VMXOFF are not required to invalidate any TLB 567 * caching structures. This prevents potential retention of 568 * cached information in the TLB between distinct VMX episodes. 569 */ 570 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 571 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 572 vmxoff(); 573 } 574 load_cr4(rcr4() & ~CR4_VMXE); 575 } 576 577 static int 578 vmx_cleanup(void) 579 { 580 581 if (pirvec >= 0) 582 lapic_ipi_free(pirvec); 583 584 if (vpid_unr != NULL) { 585 delete_unrhdr(vpid_unr); 586 vpid_unr = NULL; 587 } 588 589 if (nmi_flush_l1d_sw == 1) 590 nmi_flush_l1d_sw = 0; 591 592 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 593 594 return (0); 595 } 596 597 static void 598 vmx_enable(void *arg __unused) 599 { 600 int error; 601 uint64_t feature_control; 602 603 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 604 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 605 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 606 wrmsr(MSR_IA32_FEATURE_CONTROL, 607 feature_control | IA32_FEATURE_CONTROL_VMX_EN | 608 IA32_FEATURE_CONTROL_LOCK); 609 } 610 611 load_cr4(rcr4() | CR4_VMXE); 612 613 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 614 error = vmxon(vmxon_region[curcpu]); 615 if (error == 0) 616 vmxon_enabled[curcpu] = 1; 617 } 618 619 static void 620 vmx_restore(void) 621 { 622 623 if (vmxon_enabled[curcpu]) 624 vmxon(vmxon_region[curcpu]); 625 } 626 627 static int 628 vmx_init(int ipinum) 629 { 630 int error, use_tpr_shadow; 631 uint64_t basic, fixed0, fixed1, feature_control; 632 uint32_t tmp, procbased2_vid_bits; 633 634 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 635 if (!(cpu_feature2 & CPUID2_VMX)) { 636 printf("vmx_init: processor does not support VMX operation\n"); 637 return (ENXIO); 638 } 639 640 /* 641 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 642 * are set (bits 0 and 2 respectively). 
643 */ 644 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 645 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && 646 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 647 printf("vmx_init: VMX operation disabled by BIOS\n"); 648 return (ENXIO); 649 } 650 651 /* 652 * Verify capabilities MSR_VMX_BASIC: 653 * - bit 54 indicates support for INS/OUTS decoding 654 */ 655 basic = rdmsr(MSR_VMX_BASIC); 656 if ((basic & (1UL << 54)) == 0) { 657 printf("vmx_init: processor does not support desired basic " 658 "capabilities\n"); 659 return (EINVAL); 660 } 661 662 /* Check support for primary processor-based VM-execution controls */ 663 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 664 MSR_VMX_TRUE_PROCBASED_CTLS, 665 PROCBASED_CTLS_ONE_SETTING, 666 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 667 if (error) { 668 printf("vmx_init: processor does not support desired primary " 669 "processor-based controls\n"); 670 return (error); 671 } 672 673 /* Clear the processor-based ctl bits that are set on demand */ 674 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 675 676 /* Check support for secondary processor-based VM-execution controls */ 677 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 678 MSR_VMX_PROCBASED_CTLS2, 679 PROCBASED_CTLS2_ONE_SETTING, 680 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 681 if (error) { 682 printf("vmx_init: processor does not support desired secondary " 683 "processor-based controls\n"); 684 return (error); 685 } 686 687 /* Check support for VPID */ 688 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 689 PROCBASED2_ENABLE_VPID, 0, &tmp); 690 if (error == 0) 691 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 692 693 /* Check support for pin-based VM-execution controls */ 694 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 695 MSR_VMX_TRUE_PINBASED_CTLS, 696 PINBASED_CTLS_ONE_SETTING, 697 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 698 if (error) { 699 printf("vmx_init: processor does not support desired " 700 "pin-based controls\n"); 701 return (error); 702 } 703 704 /* Check support for VM-exit controls */ 705 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 706 VM_EXIT_CTLS_ONE_SETTING, 707 VM_EXIT_CTLS_ZERO_SETTING, 708 &exit_ctls); 709 if (error) { 710 printf("vmx_init: processor does not support desired " 711 "exit controls\n"); 712 return (error); 713 } 714 715 /* Check support for VM-entry controls */ 716 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, 717 VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, 718 &entry_ctls); 719 if (error) { 720 printf("vmx_init: processor does not support desired " 721 "entry controls\n"); 722 return (error); 723 } 724 725 /* 726 * Check support for optional features by testing them 727 * as individual bits 728 */ 729 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 730 MSR_VMX_TRUE_PROCBASED_CTLS, 731 PROCBASED_HLT_EXITING, 0, 732 &tmp) == 0); 733 734 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 735 MSR_VMX_PROCBASED_CTLS, 736 PROCBASED_MTF, 0, 737 &tmp) == 0); 738 739 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 740 MSR_VMX_TRUE_PROCBASED_CTLS, 741 PROCBASED_PAUSE_EXITING, 0, 742 &tmp) == 0); 743 744 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 745 MSR_VMX_PROCBASED_CTLS2, 746 PROCBASED2_UNRESTRICTED_GUEST, 0, 747 &tmp) == 0); 748 749 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 750 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 751 &tmp) == 0); 752 753 /* 754 * Check support 
for virtual interrupt delivery. 755 */ 756 procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | 757 PROCBASED2_VIRTUALIZE_X2APIC_MODE | 758 PROCBASED2_APIC_REGISTER_VIRTUALIZATION | 759 PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); 760 761 use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 762 MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, 763 &tmp) == 0); 764 765 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 766 procbased2_vid_bits, 0, &tmp); 767 if (error == 0 && use_tpr_shadow) { 768 virtual_interrupt_delivery = 1; 769 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", 770 &virtual_interrupt_delivery); 771 } 772 773 if (virtual_interrupt_delivery) { 774 procbased_ctls |= PROCBASED_USE_TPR_SHADOW; 775 procbased_ctls2 |= procbased2_vid_bits; 776 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; 777 778 /* 779 * No need to emulate accesses to %CR8 if virtual 780 * interrupt delivery is enabled. 781 */ 782 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; 783 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; 784 785 /* 786 * Check for Posted Interrupts only if Virtual Interrupt 787 * Delivery is enabled. 788 */ 789 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 790 MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, 791 &tmp); 792 if (error == 0) { 793 pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : 794 &IDTVEC(justreturn)); 795 if (pirvec < 0) { 796 if (bootverbose) { 797 printf("vmx_init: unable to allocate " 798 "posted interrupt vector\n"); 799 } 800 } else { 801 posted_interrupts = 1; 802 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", 803 &posted_interrupts); 804 } 805 } 806 } 807 808 if (posted_interrupts) 809 pinbased_ctls |= PINBASED_POSTED_INTERRUPT; 810 811 /* Initialize EPT */ 812 error = ept_init(ipinum); 813 if (error) { 814 printf("vmx_init: ept initialization failed (%d)\n", error); 815 return (error); 816 } 817 818 guest_l1d_flush = (cpu_ia32_arch_caps & 819 IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; 820 TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); 821 822 /* 823 * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when 824 * available. Otherwise fall back to the software flush 825 * method which loads enough data from the kernel text to 826 * flush existing L1D content, both on VMX entry and on NMI 827 * return. 828 */ 829 if (guest_l1d_flush) { 830 if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { 831 guest_l1d_flush_sw = 1; 832 TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", 833 &guest_l1d_flush_sw); 834 } 835 if (guest_l1d_flush_sw) { 836 if (nmi_flush_l1d_sw <= 1) 837 nmi_flush_l1d_sw = 1; 838 } else { 839 msr_load_list[0].index = MSR_IA32_FLUSH_CMD; 840 msr_load_list[0].val = IA32_FLUSH_CMD_L1D; 841 } 842 } 843 844 /* 845 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 846 */ 847 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 848 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 849 cr0_ones_mask = fixed0 & fixed1; 850 cr0_zeros_mask = ~fixed0 & ~fixed1; 851 852 /* 853 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 854 * if unrestricted guest execution is allowed. 855 */ 856 if (cap_unrestricted_guest) 857 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 858 859 /* 860 * Do not allow the guest to set CR0_NW or CR0_CD. 
861 */ 862 cr0_zeros_mask |= (CR0_NW | CR0_CD); 863 864 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 865 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 866 cr4_ones_mask = fixed0 & fixed1; 867 cr4_zeros_mask = ~fixed0 & ~fixed1; 868 869 vpid_init(); 870 871 vmx_msr_init(); 872 873 /* enable VMX operation */ 874 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 875 876 vmx_initialized = 1; 877 878 return (0); 879 } 880 881 static void 882 vmx_trigger_hostintr(int vector) 883 { 884 uintptr_t func; 885 struct gate_descriptor *gd; 886 887 gd = &idt[vector]; 888 889 KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " 890 "invalid vector %d", vector)); 891 KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", 892 vector)); 893 KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " 894 "has invalid type %d", vector, gd->gd_type)); 895 KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " 896 "has invalid dpl %d", vector, gd->gd_dpl)); 897 KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " 898 "for vector %d has invalid selector %d", vector, gd->gd_selector)); 899 KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " 900 "IST %d", vector, gd->gd_ist)); 901 902 func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); 903 vmx_call_isr(func); 904 } 905 906 static int 907 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 908 { 909 int error, mask_ident, shadow_ident; 910 uint64_t mask_value; 911 912 if (which != 0 && which != 4) 913 panic("vmx_setup_cr_shadow: unknown cr%d", which); 914 915 if (which == 0) { 916 mask_ident = VMCS_CR0_MASK; 917 mask_value = cr0_ones_mask | cr0_zeros_mask; 918 shadow_ident = VMCS_CR0_SHADOW; 919 } else { 920 mask_ident = VMCS_CR4_MASK; 921 mask_value = cr4_ones_mask | cr4_zeros_mask; 922 shadow_ident = VMCS_CR4_SHADOW; 923 } 924 925 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 926 if (error) 927 return (error); 928 929 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 930 if (error) 931 return (error); 932 933 return (0); 934 } 935 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 936 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 937 938 static void * 939 vmx_vminit(struct vm *vm, pmap_t pmap) 940 { 941 uint16_t vpid[VM_MAXCPU]; 942 int i, error; 943 struct vmx *vmx; 944 struct vmcs *vmcs; 945 uint32_t exc_bitmap; 946 uint16_t maxcpus; 947 948 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 949 if ((uintptr_t)vmx & PAGE_MASK) { 950 panic("malloc of struct vmx not aligned on %d byte boundary", 951 PAGE_SIZE); 952 } 953 vmx->vm = vm; 954 955 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 956 957 /* 958 * Clean up EPTP-tagged guest physical and combined mappings 959 * 960 * VMX transitions are not required to invalidate any guest physical 961 * mappings. So, it may be possible for stale guest physical mappings 962 * to be present in the processor TLBs. 963 * 964 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 965 */ 966 ept_invalidate_mappings(vmx->eptp); 967 968 msr_bitmap_initialize(vmx->msr_bitmap); 969 970 /* 971 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 972 * The guest FSBASE and GSBASE are saved and restored during 973 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 974 * always restored from the vmcs host state area on vm-exit. 
975 * 976 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 977 * how they are saved/restored so can be directly accessed by the 978 * guest. 979 * 980 * MSR_EFER is saved and restored in the guest VMCS area on a 981 * VM exit and entry respectively. It is also restored from the 982 * host VMCS area on a VM exit. 983 * 984 * The TSC MSR is exposed read-only. Writes are disallowed as 985 * that will impact the host TSC. If the guest does a write 986 * the "use TSC offsetting" execution control is enabled and the 987 * difference between the host TSC and the guest TSC is written 988 * into the TSC offset in the VMCS. 989 */ 990 if (guest_msr_rw(vmx, MSR_GSBASE) || 991 guest_msr_rw(vmx, MSR_FSBASE) || 992 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 993 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 994 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 995 guest_msr_rw(vmx, MSR_EFER) || 996 guest_msr_ro(vmx, MSR_TSC)) 997 panic("vmx_vminit: error setting guest msr access"); 998 999 vpid_alloc(vpid, VM_MAXCPU); 1000 1001 if (virtual_interrupt_delivery) { 1002 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, 1003 APIC_ACCESS_ADDRESS); 1004 /* XXX this should really return an error to the caller */ 1005 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); 1006 } 1007 1008 maxcpus = vm_get_maxcpus(vm); 1009 for (i = 0; i < maxcpus; i++) { 1010 vmcs = &vmx->vmcs[i]; 1011 vmcs->identifier = vmx_revision(); 1012 error = vmclear(vmcs); 1013 if (error != 0) { 1014 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 1015 error, i); 1016 } 1017 1018 vmx_msr_guest_init(vmx, i); 1019 1020 error = vmcs_init(vmcs); 1021 KASSERT(error == 0, ("vmcs_init error %d", error)); 1022 1023 VMPTRLD(vmcs); 1024 error = 0; 1025 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); 1026 error += vmwrite(VMCS_EPTP, vmx->eptp); 1027 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); 1028 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); 1029 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); 1030 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); 1031 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1032 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); 1033 error += vmwrite(VMCS_VPID, vpid[i]); 1034 1035 if (guest_l1d_flush && !guest_l1d_flush_sw) { 1036 vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( 1037 (vm_offset_t)&msr_load_list[0])); 1038 vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, 1039 nitems(msr_load_list)); 1040 vmcs_write(VMCS_EXIT_MSR_STORE, 0); 1041 vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); 1042 } 1043 1044 /* exception bitmap */ 1045 if (vcpu_trace_exceptions(vm, i)) 1046 exc_bitmap = 0xffffffff; 1047 else 1048 exc_bitmap = 1 << IDT_MC; 1049 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); 1050 1051 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; 1052 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); 1053 1054 if (virtual_interrupt_delivery) { 1055 error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); 1056 error += vmwrite(VMCS_VIRTUAL_APIC, 1057 vtophys(&vmx->apic_page[i])); 1058 error += vmwrite(VMCS_EOI_EXIT0, 0); 1059 error += vmwrite(VMCS_EOI_EXIT1, 0); 1060 error += vmwrite(VMCS_EOI_EXIT2, 0); 1061 error += vmwrite(VMCS_EOI_EXIT3, 0); 1062 } 1063 if (posted_interrupts) { 1064 error += vmwrite(VMCS_PIR_VECTOR, pirvec); 1065 error += vmwrite(VMCS_PIR_DESC, 1066 vtophys(&vmx->pir_desc[i])); 1067 } 1068 VMCLEAR(vmcs); 1069 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); 1070 1071 vmx->cap[i].set = 0; 1072 vmx->cap[i].proc_ctls = procbased_ctls; 1073 
vmx->cap[i].proc_ctls2 = procbased_ctls2; 1074 vmx->cap[i].exc_bitmap = exc_bitmap; 1075 1076 vmx->state[i].nextrip = ~0; 1077 vmx->state[i].lastcpu = NOCPU; 1078 vmx->state[i].vpid = vpid[i]; 1079 1080 /* 1081 * Set up the CR0/4 shadows, and init the read shadow 1082 * to the power-on register value from the Intel Sys Arch. 1083 * CR0 - 0x60000010 1084 * CR4 - 0 1085 */ 1086 error = vmx_setup_cr0_shadow(vmcs, 0x60000010); 1087 if (error != 0) 1088 panic("vmx_setup_cr0_shadow %d", error); 1089 1090 error = vmx_setup_cr4_shadow(vmcs, 0); 1091 if (error != 0) 1092 panic("vmx_setup_cr4_shadow %d", error); 1093 1094 vmx->ctx[i].pmap = pmap; 1095 } 1096 1097 return (vmx); 1098 } 1099 1100 static int 1101 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 1102 { 1103 int handled, func; 1104 1105 func = vmxctx->guest_rax; 1106 1107 handled = x86_emulate_cpuid(vm, vcpu, 1108 (uint32_t*)(&vmxctx->guest_rax), 1109 (uint32_t*)(&vmxctx->guest_rbx), 1110 (uint32_t*)(&vmxctx->guest_rcx), 1111 (uint32_t*)(&vmxctx->guest_rdx)); 1112 return (handled); 1113 } 1114 1115 static __inline void 1116 vmx_run_trace(struct vmx *vmx, int vcpu) 1117 { 1118 #ifdef KTR 1119 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 1120 #endif 1121 } 1122 1123 static __inline void 1124 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 1125 int handled) 1126 { 1127 #ifdef KTR 1128 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 1129 handled ? "handled" : "unhandled", 1130 exit_reason_to_str(exit_reason), rip); 1131 #endif 1132 } 1133 1134 static __inline void 1135 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 1136 { 1137 #ifdef KTR 1138 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 1139 #endif 1140 } 1141 1142 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); 1143 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); 1144 1145 /* 1146 * Invalidate guest mappings identified by its vpid from the TLB. 1147 */ 1148 static __inline void 1149 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) 1150 { 1151 struct vmxstate *vmxstate; 1152 struct invvpid_desc invvpid_desc; 1153 1154 vmxstate = &vmx->state[vcpu]; 1155 if (vmxstate->vpid == 0) 1156 return; 1157 1158 if (!running) { 1159 /* 1160 * Set the 'lastcpu' to an invalid host cpu. 1161 * 1162 * This will invalidate TLB entries tagged with the vcpu's 1163 * vpid the next time it runs via vmx_set_pcpu_defaults(). 1164 */ 1165 vmxstate->lastcpu = NOCPU; 1166 return; 1167 } 1168 1169 KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " 1170 "critical section", __func__, vcpu)); 1171 1172 /* 1173 * Invalidate all mappings tagged with 'vpid' 1174 * 1175 * We do this because this vcpu was executing on a different host 1176 * cpu when it last ran. We do not track whether it invalidated 1177 * mappings associated with its 'vpid' during that run. So we must 1178 * assume that the mappings associated with 'vpid' on 'curcpu' are 1179 * stale and invalidate them. 1180 * 1181 * Note that we incur this penalty only when the scheduler chooses to 1182 * move the thread associated with this vcpu between host cpus. 1183 * 1184 * Note also that this will invalidate mappings tagged with 'vpid' 1185 * for "all" EP4TAs. 
1186 */ 1187 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { 1188 invvpid_desc._res1 = 0; 1189 invvpid_desc._res2 = 0; 1190 invvpid_desc.vpid = vmxstate->vpid; 1191 invvpid_desc.linear_addr = 0; 1192 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 1193 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); 1194 } else { 1195 /* 1196 * The invvpid can be skipped if an invept is going to 1197 * be performed before entering the guest. The invept 1198 * will invalidate combined mappings tagged with 1199 * 'vmx->eptp' for all vpids. 1200 */ 1201 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); 1202 } 1203 } 1204 1205 static void 1206 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) 1207 { 1208 struct vmxstate *vmxstate; 1209 1210 vmxstate = &vmx->state[vcpu]; 1211 if (vmxstate->lastcpu == curcpu) 1212 return; 1213 1214 vmxstate->lastcpu = curcpu; 1215 1216 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 1217 1218 vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 1219 vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 1220 vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 1221 vmx_invvpid(vmx, vcpu, pmap, 1); 1222 } 1223 1224 /* 1225 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 1226 */ 1227 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 1228 1229 static void __inline 1230 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 1231 { 1232 1233 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { 1234 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 1235 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1236 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1237 } 1238 } 1239 1240 static void __inline 1241 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1242 { 1243 1244 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, 1245 ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); 1246 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1247 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1248 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1249 } 1250 1251 static void __inline 1252 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1253 { 1254 1255 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { 1256 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1257 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1258 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1259 } 1260 } 1261 1262 static void __inline 1263 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1264 { 1265 1266 KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, 1267 ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); 1268 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1269 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1270 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1271 } 1272 1273 int 1274 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) 1275 { 1276 int error; 1277 1278 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { 1279 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; 1280 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1281 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); 1282 } 1283 1284 error = vmwrite(VMCS_TSC_OFFSET, offset); 1285 1286 return (error); 1287 } 1288 1289 #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 
\ 1290 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1291 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ 1292 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 1293 1294 static void 1295 vmx_inject_nmi(struct vmx *vmx, int vcpu) 1296 { 1297 uint32_t gi, info; 1298 1299 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1300 KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " 1301 "interruptibility-state %#x", gi)); 1302 1303 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1304 KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " 1305 "VM-entry interruption information %#x", info)); 1306 1307 /* 1308 * Inject the virtual NMI. The vector must be the NMI IDT entry 1309 * or the VMCS entry check will fail. 1310 */ 1311 info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; 1312 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1313 1314 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1315 1316 /* Clear the request */ 1317 vm_nmi_clear(vmx->vm, vcpu); 1318 } 1319 1320 static void 1321 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, 1322 uint64_t guestrip) 1323 { 1324 int vector, need_nmi_exiting, extint_pending; 1325 uint64_t rflags, entryinfo; 1326 uint32_t gi, info; 1327 1328 if (vmx->state[vcpu].nextrip != guestrip) { 1329 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1330 if (gi & HWINTR_BLOCKING) { 1331 VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " 1332 "cleared due to rip change: %#lx/%#lx", 1333 vmx->state[vcpu].nextrip, guestrip); 1334 gi &= ~HWINTR_BLOCKING; 1335 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1336 } 1337 } 1338 1339 if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { 1340 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " 1341 "intinfo is not valid: %#lx", __func__, entryinfo)); 1342 1343 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1344 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " 1345 "pending exception: %#lx/%#x", __func__, entryinfo, info)); 1346 1347 info = entryinfo; 1348 vector = info & 0xff; 1349 if (vector == IDT_BP || vector == IDT_OF) { 1350 /* 1351 * VT-x requires #BP and #OF to be injected as software 1352 * exceptions. 1353 */ 1354 info &= ~VMCS_INTR_T_MASK; 1355 info |= VMCS_INTR_T_SWEXCEPTION; 1356 } 1357 1358 if (info & VMCS_INTR_DEL_ERRCODE) 1359 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); 1360 1361 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1362 } 1363 1364 if (vm_nmi_pending(vmx->vm, vcpu)) { 1365 /* 1366 * If there are no conditions blocking NMI injection then 1367 * inject it directly here otherwise enable "NMI window 1368 * exiting" to inject it as soon as we can. 1369 * 1370 * We also check for STI_BLOCKING because some implementations 1371 * don't allow NMI injection in this case. If we are running 1372 * on a processor that doesn't have this restriction it will 1373 * immediately exit and the NMI will be injected in the 1374 * "NMI window exiting" handler. 
1375 */ 1376 need_nmi_exiting = 1; 1377 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1378 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { 1379 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1380 if ((info & VMCS_INTR_VALID) == 0) { 1381 vmx_inject_nmi(vmx, vcpu); 1382 need_nmi_exiting = 0; 1383 } else { 1384 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " 1385 "due to VM-entry intr info %#x", info); 1386 } 1387 } else { 1388 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " 1389 "Guest Interruptibility-state %#x", gi); 1390 } 1391 1392 if (need_nmi_exiting) 1393 vmx_set_nmi_window_exiting(vmx, vcpu); 1394 } 1395 1396 extint_pending = vm_extint_pending(vmx->vm, vcpu); 1397 1398 if (!extint_pending && virtual_interrupt_delivery) { 1399 vmx_inject_pir(vlapic); 1400 return; 1401 } 1402 1403 /* 1404 * If interrupt-window exiting is already in effect then don't bother 1405 * checking for pending interrupts. This is just an optimization and 1406 * not needed for correctness. 1407 */ 1408 if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { 1409 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " 1410 "pending int_window_exiting"); 1411 return; 1412 } 1413 1414 if (!extint_pending) { 1415 /* Ask the local apic for a vector to inject */ 1416 if (!vlapic_pending_intr(vlapic, &vector)) 1417 return; 1418 1419 /* 1420 * From the Intel SDM, Volume 3, Section "Maskable 1421 * Hardware Interrupts": 1422 * - maskable interrupt vectors [16,255] can be delivered 1423 * through the local APIC. 1424 */ 1425 KASSERT(vector >= 16 && vector <= 255, 1426 ("invalid vector %d from local APIC", vector)); 1427 } else { 1428 /* Ask the legacy pic for a vector to inject */ 1429 vatpic_pending_intr(vmx->vm, &vector); 1430 1431 /* 1432 * From the Intel SDM, Volume 3, Section "Maskable 1433 * Hardware Interrupts": 1434 * - maskable interrupt vectors [0,255] can be delivered 1435 * through the INTR pin. 1436 */ 1437 KASSERT(vector >= 0 && vector <= 255, 1438 ("invalid vector %d from INTR", vector)); 1439 } 1440 1441 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1442 rflags = vmcs_read(VMCS_GUEST_RFLAGS); 1443 if ((rflags & PSL_I) == 0) { 1444 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1445 "rflags %#lx", vector, rflags); 1446 goto cantinject; 1447 } 1448 1449 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1450 if (gi & HWINTR_BLOCKING) { 1451 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1452 "Guest Interruptibility-state %#x", vector, gi); 1453 goto cantinject; 1454 } 1455 1456 info = vmcs_read(VMCS_ENTRY_INTR_INFO); 1457 if (info & VMCS_INTR_VALID) { 1458 /* 1459 * This is expected and could happen for multiple reasons: 1460 * - A vectoring VM-entry was aborted due to astpending 1461 * - A VM-exit happened during event injection. 1462 * - An exception was injected above. 1463 * - An NMI was injected above or after "NMI window exiting" 1464 */ 1465 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " 1466 "VM-entry intr info %#x", vector, info); 1467 goto cantinject; 1468 } 1469 1470 /* Inject the interrupt */ 1471 info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; 1472 info |= vector; 1473 vmcs_write(VMCS_ENTRY_INTR_INFO, info); 1474 1475 if (!extint_pending) { 1476 /* Update the Local APIC ISR */ 1477 vlapic_intr_accepted(vlapic, vector); 1478 } else { 1479 vm_extint_clear(vmx->vm, vcpu); 1480 vatpic_intr_accepted(vmx->vm, vector); 1481 1482 /* 1483 * After we accepted the current ExtINT the PIC may 1484 * have posted another one. 
If that is the case, set 1485 * the Interrupt Window Exiting execution control so 1486 * we can inject that one too. 1487 * 1488 * Also, interrupt window exiting allows us to inject any 1489 * pending APIC vector that was preempted by the ExtINT 1490 * as soon as possible. This applies both for the software 1491 * emulated vlapic and the hardware assisted virtual APIC. 1492 */ 1493 vmx_set_int_window_exiting(vmx, vcpu); 1494 } 1495 1496 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1497 1498 return; 1499 1500 cantinject: 1501 /* 1502 * Set the Interrupt Window Exiting execution control so we can inject 1503 * the interrupt as soon as blocking condition goes away. 1504 */ 1505 vmx_set_int_window_exiting(vmx, vcpu); 1506 } 1507 1508 /* 1509 * If the Virtual NMIs execution control is '1' then the logical processor 1510 * tracks virtual-NMI blocking in the Guest Interruptibility-state field of 1511 * the VMCS. An IRET instruction in VMX non-root operation will remove any 1512 * virtual-NMI blocking. 1513 * 1514 * This unblocking occurs even if the IRET causes a fault. In this case the 1515 * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 1516 */ 1517 static void 1518 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) 1519 { 1520 uint32_t gi; 1521 1522 VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); 1523 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1524 gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1525 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1526 } 1527 1528 static void 1529 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) 1530 { 1531 uint32_t gi; 1532 1533 VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); 1534 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1535 gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; 1536 vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); 1537 } 1538 1539 static void 1540 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) 1541 { 1542 uint32_t gi; 1543 1544 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); 1545 KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, 1546 ("NMI blocking is not in effect %#x", gi)); 1547 } 1548 1549 static int 1550 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1551 { 1552 struct vmxctx *vmxctx; 1553 uint64_t xcrval; 1554 const struct xsave_limits *limits; 1555 1556 vmxctx = &vmx->ctx[vcpu]; 1557 limits = vmm_get_xsave_limits(); 1558 1559 /* 1560 * Note that the processor raises a GP# fault on its own if 1561 * xsetbv is executed for CPL != 0, so we do not have to 1562 * emulate that fault here. 1563 */ 1564 1565 /* Only xcr0 is supported. */ 1566 if (vmxctx->guest_rcx != 0) { 1567 vm_inject_gp(vmx->vm, vcpu); 1568 return (HANDLED); 1569 } 1570 1571 /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ 1572 if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { 1573 vm_inject_ud(vmx->vm, vcpu); 1574 return (HANDLED); 1575 } 1576 1577 xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); 1578 if ((xcrval & ~limits->xcr0_allowed) != 0) { 1579 vm_inject_gp(vmx->vm, vcpu); 1580 return (HANDLED); 1581 } 1582 1583 if (!(xcrval & XFEATURE_ENABLED_X87)) { 1584 vm_inject_gp(vmx->vm, vcpu); 1585 return (HANDLED); 1586 } 1587 1588 /* AVX (YMM_Hi128) requires SSE. 
*/ 1589 if (xcrval & XFEATURE_ENABLED_AVX && 1590 (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { 1591 vm_inject_gp(vmx->vm, vcpu); 1592 return (HANDLED); 1593 } 1594 1595 /* 1596 * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, 1597 * ZMM_Hi256, and Hi16_ZMM. 1598 */ 1599 if (xcrval & XFEATURE_AVX512 && 1600 (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != 1601 (XFEATURE_AVX512 | XFEATURE_AVX)) { 1602 vm_inject_gp(vmx->vm, vcpu); 1603 return (HANDLED); 1604 } 1605 1606 /* 1607 * Intel MPX requires both bound register state flags to be 1608 * set. 1609 */ 1610 if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != 1611 ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { 1612 vm_inject_gp(vmx->vm, vcpu); 1613 return (HANDLED); 1614 } 1615 1616 /* 1617 * This runs "inside" vmrun() with the guest's FPU state, so 1618 * modifying xcr0 directly modifies the guest's xcr0, not the 1619 * host's. 1620 */ 1621 load_xcr(0, xcrval); 1622 return (HANDLED); 1623 } 1624 1625 static uint64_t 1626 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) 1627 { 1628 const struct vmxctx *vmxctx; 1629 1630 vmxctx = &vmx->ctx[vcpu]; 1631 1632 switch (ident) { 1633 case 0: 1634 return (vmxctx->guest_rax); 1635 case 1: 1636 return (vmxctx->guest_rcx); 1637 case 2: 1638 return (vmxctx->guest_rdx); 1639 case 3: 1640 return (vmxctx->guest_rbx); 1641 case 4: 1642 return (vmcs_read(VMCS_GUEST_RSP)); 1643 case 5: 1644 return (vmxctx->guest_rbp); 1645 case 6: 1646 return (vmxctx->guest_rsi); 1647 case 7: 1648 return (vmxctx->guest_rdi); 1649 case 8: 1650 return (vmxctx->guest_r8); 1651 case 9: 1652 return (vmxctx->guest_r9); 1653 case 10: 1654 return (vmxctx->guest_r10); 1655 case 11: 1656 return (vmxctx->guest_r11); 1657 case 12: 1658 return (vmxctx->guest_r12); 1659 case 13: 1660 return (vmxctx->guest_r13); 1661 case 14: 1662 return (vmxctx->guest_r14); 1663 case 15: 1664 return (vmxctx->guest_r15); 1665 default: 1666 panic("invalid vmx register %d", ident); 1667 } 1668 } 1669 1670 static void 1671 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) 1672 { 1673 struct vmxctx *vmxctx; 1674 1675 vmxctx = &vmx->ctx[vcpu]; 1676 1677 switch (ident) { 1678 case 0: 1679 vmxctx->guest_rax = regval; 1680 break; 1681 case 1: 1682 vmxctx->guest_rcx = regval; 1683 break; 1684 case 2: 1685 vmxctx->guest_rdx = regval; 1686 break; 1687 case 3: 1688 vmxctx->guest_rbx = regval; 1689 break; 1690 case 4: 1691 vmcs_write(VMCS_GUEST_RSP, regval); 1692 break; 1693 case 5: 1694 vmxctx->guest_rbp = regval; 1695 break; 1696 case 6: 1697 vmxctx->guest_rsi = regval; 1698 break; 1699 case 7: 1700 vmxctx->guest_rdi = regval; 1701 break; 1702 case 8: 1703 vmxctx->guest_r8 = regval; 1704 break; 1705 case 9: 1706 vmxctx->guest_r9 = regval; 1707 break; 1708 case 10: 1709 vmxctx->guest_r10 = regval; 1710 break; 1711 case 11: 1712 vmxctx->guest_r11 = regval; 1713 break; 1714 case 12: 1715 vmxctx->guest_r12 = regval; 1716 break; 1717 case 13: 1718 vmxctx->guest_r13 = regval; 1719 break; 1720 case 14: 1721 vmxctx->guest_r14 = regval; 1722 break; 1723 case 15: 1724 vmxctx->guest_r15 = regval; 1725 break; 1726 default: 1727 panic("invalid vmx register %d", ident); 1728 } 1729 } 1730 1731 static int 1732 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1733 { 1734 uint64_t crval, regval; 1735 1736 /* We only handle mov to %cr0 at this time */ 1737 if ((exitqual & 0xf0) != 0x00) 1738 return (UNHANDLED); 1739 1740 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1741 1742 vmcs_write(VMCS_CR0_SHADOW, regval); 1743 
1744 crval = regval | cr0_ones_mask; 1745 crval &= ~cr0_zeros_mask; 1746 vmcs_write(VMCS_GUEST_CR0, crval); 1747 1748 if (regval & CR0_PG) { 1749 uint64_t efer, entry_ctls; 1750 1751 /* 1752 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1753 * the "IA-32e mode guest" bit in VM-entry control must be 1754 * equal. 1755 */ 1756 efer = vmcs_read(VMCS_GUEST_IA32_EFER); 1757 if (efer & EFER_LME) { 1758 efer |= EFER_LMA; 1759 vmcs_write(VMCS_GUEST_IA32_EFER, efer); 1760 entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); 1761 entry_ctls |= VM_ENTRY_GUEST_LMA; 1762 vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); 1763 } 1764 } 1765 1766 return (HANDLED); 1767 } 1768 1769 static int 1770 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1771 { 1772 uint64_t crval, regval; 1773 1774 /* We only handle mov to %cr4 at this time */ 1775 if ((exitqual & 0xf0) != 0x00) 1776 return (UNHANDLED); 1777 1778 regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); 1779 1780 vmcs_write(VMCS_CR4_SHADOW, regval); 1781 1782 crval = regval | cr4_ones_mask; 1783 crval &= ~cr4_zeros_mask; 1784 vmcs_write(VMCS_GUEST_CR4, crval); 1785 1786 return (HANDLED); 1787 } 1788 1789 static int 1790 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1791 { 1792 struct vlapic *vlapic; 1793 uint64_t cr8; 1794 int regnum; 1795 1796 /* We only handle mov %cr8 to/from a register at this time. */ 1797 if ((exitqual & 0xe0) != 0x00) { 1798 return (UNHANDLED); 1799 } 1800 1801 vlapic = vm_lapic(vmx->vm, vcpu); 1802 regnum = (exitqual >> 8) & 0xf; 1803 if (exitqual & 0x10) { 1804 cr8 = vlapic_get_cr8(vlapic); 1805 vmx_set_guest_reg(vmx, vcpu, regnum, cr8); 1806 } else { 1807 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); 1808 vlapic_set_cr8(vlapic, cr8); 1809 } 1810 1811 return (HANDLED); 1812 } 1813 1814 /* 1815 * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL 1816 */ 1817 static int 1818 vmx_cpl(void) 1819 { 1820 uint32_t ssar; 1821 1822 ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); 1823 return ((ssar >> 5) & 0x3); 1824 } 1825 1826 static enum vm_cpu_mode 1827 vmx_cpu_mode(void) 1828 { 1829 uint32_t csar; 1830 1831 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { 1832 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1833 if (csar & 0x2000) 1834 return (CPU_MODE_64BIT); /* CS.L = 1 */ 1835 else 1836 return (CPU_MODE_COMPATIBILITY); 1837 } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { 1838 return (CPU_MODE_PROTECTED); 1839 } else { 1840 return (CPU_MODE_REAL); 1841 } 1842 } 1843 1844 static enum vm_paging_mode 1845 vmx_paging_mode(void) 1846 { 1847 1848 if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) 1849 return (PAGING_MODE_FLAT); 1850 if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) 1851 return (PAGING_MODE_32); 1852 if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) 1853 return (PAGING_MODE_64); 1854 else 1855 return (PAGING_MODE_PAE); 1856 } 1857 1858 static uint64_t 1859 inout_str_index(struct vmx *vmx, int vcpuid, int in) 1860 { 1861 uint64_t val; 1862 int error; 1863 enum vm_reg_name reg; 1864 1865 reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; 1866 error = vmx_getreg(vmx, vcpuid, reg, &val); 1867 KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); 1868 return (val); 1869 } 1870 1871 static uint64_t 1872 inout_str_count(struct vmx *vmx, int vcpuid, int rep) 1873 { 1874 uint64_t val; 1875 int error; 1876 1877 if (rep) { 1878 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); 1879 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); 1880 } else { 1881 val = 1; 1882 } 1883 return (val); 1884 } 1885 1886 static int 1887 inout_str_addrsize(uint32_t inst_info) 1888 { 1889 uint32_t size; 1890 1891 size = (inst_info >> 7) & 0x7; 1892 switch (size) { 1893 case 0: 1894 return (2); /* 16 bit */ 1895 case 1: 1896 return (4); /* 32 bit */ 1897 case 2: 1898 return (8); /* 64 bit */ 1899 default: 1900 panic("%s: invalid size encoding %d", __func__, size); 1901 } 1902 } 1903 1904 static void 1905 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, 1906 struct vm_inout_str *vis) 1907 { 1908 int error, s; 1909 1910 if (in) { 1911 vis->seg_name = VM_REG_GUEST_ES; 1912 } else { 1913 s = (inst_info >> 15) & 0x7; 1914 vis->seg_name = vm_segment_name(s); 1915 } 1916 1917 error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); 1918 KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); 1919 } 1920 1921 static void 1922 vmx_paging_info(struct vm_guest_paging *paging) 1923 { 1924 paging->cr3 = vmcs_guest_cr3(); 1925 paging->cpl = vmx_cpl(); 1926 paging->cpu_mode = vmx_cpu_mode(); 1927 paging->paging_mode = vmx_paging_mode(); 1928 } 1929 1930 static void 1931 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) 1932 { 1933 struct vm_guest_paging *paging; 1934 uint32_t csar; 1935 1936 paging = &vmexit->u.inst_emul.paging; 1937 1938 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1939 vmexit->inst_length = 0; 1940 vmexit->u.inst_emul.gpa = gpa; 1941 vmexit->u.inst_emul.gla = gla; 1942 vmx_paging_info(paging); 1943 switch (paging->cpu_mode) { 1944 case CPU_MODE_REAL: 1945 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1946 vmexit->u.inst_emul.cs_d = 0; 1947 break; 1948 case CPU_MODE_PROTECTED: 1949 case CPU_MODE_COMPATIBILITY: 1950 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); 1951 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); 1952 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); 1953 break; 1954 default: 1955 vmexit->u.inst_emul.cs_base = 0; 1956 vmexit->u.inst_emul.cs_d = 0; 1957 break; 1958 } 1959 vie_init(&vmexit->u.inst_emul.vie, NULL, 0); 1960 } 1961 1962 static int 1963 ept_fault_type(uint64_t ept_qual) 1964 { 1965 int fault_type; 1966 1967 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1968 fault_type = VM_PROT_WRITE; 1969 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1970 fault_type = VM_PROT_EXECUTE; 1971 else 1972 fault_type= VM_PROT_READ; 1973 1974 return (fault_type); 1975 } 1976 1977 static bool 1978 ept_emulation_fault(uint64_t ept_qual) 1979 { 1980 int read, write; 1981 1982 /* EPT fault on an instruction fetch doesn't make sense here */ 1983 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1984 return (false); 1985 1986 /* EPT fault must be a read fault or a write fault */ 1987 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1988 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1989 if ((read | write) == 0) 1990 return (false); 1991 1992 /* 1993 * The EPT violation must have been caused by accessing a 1994 * guest-physical address that is a translation of a guest-linear 1995 * address. 
1996 */ 1997 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1998 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1999 return (false); 2000 } 2001 2002 return (true); 2003 } 2004 2005 static __inline int 2006 apic_access_virtualization(struct vmx *vmx, int vcpuid) 2007 { 2008 uint32_t proc_ctls2; 2009 2010 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2011 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); 2012 } 2013 2014 static __inline int 2015 x2apic_virtualization(struct vmx *vmx, int vcpuid) 2016 { 2017 uint32_t proc_ctls2; 2018 2019 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 2020 return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); 2021 } 2022 2023 static int 2024 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, 2025 uint64_t qual) 2026 { 2027 int error, handled, offset; 2028 uint32_t *apic_regs, vector; 2029 bool retu; 2030 2031 handled = HANDLED; 2032 offset = APIC_WRITE_OFFSET(qual); 2033 2034 if (!apic_access_virtualization(vmx, vcpuid)) { 2035 /* 2036 * In general there should not be any APIC write VM-exits 2037 * unless APIC-access virtualization is enabled. 2038 * 2039 * However self-IPI virtualization can legitimately trigger 2040 * an APIC-write VM-exit so treat it specially. 2041 */ 2042 if (x2apic_virtualization(vmx, vcpuid) && 2043 offset == APIC_OFFSET_SELF_IPI) { 2044 apic_regs = (uint32_t *)(vlapic->apic_page); 2045 vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; 2046 vlapic_self_ipi_handler(vlapic, vector); 2047 return (HANDLED); 2048 } else 2049 return (UNHANDLED); 2050 } 2051 2052 switch (offset) { 2053 case APIC_OFFSET_ID: 2054 vlapic_id_write_handler(vlapic); 2055 break; 2056 case APIC_OFFSET_LDR: 2057 vlapic_ldr_write_handler(vlapic); 2058 break; 2059 case APIC_OFFSET_DFR: 2060 vlapic_dfr_write_handler(vlapic); 2061 break; 2062 case APIC_OFFSET_SVR: 2063 vlapic_svr_write_handler(vlapic); 2064 break; 2065 case APIC_OFFSET_ESR: 2066 vlapic_esr_write_handler(vlapic); 2067 break; 2068 case APIC_OFFSET_ICR_LOW: 2069 retu = false; 2070 error = vlapic_icrlo_write_handler(vlapic, &retu); 2071 if (error != 0 || retu) 2072 handled = UNHANDLED; 2073 break; 2074 case APIC_OFFSET_CMCI_LVT: 2075 case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: 2076 vlapic_lvt_write_handler(vlapic, offset); 2077 break; 2078 case APIC_OFFSET_TIMER_ICR: 2079 vlapic_icrtmr_write_handler(vlapic); 2080 break; 2081 case APIC_OFFSET_TIMER_DCR: 2082 vlapic_dcr_write_handler(vlapic); 2083 break; 2084 default: 2085 handled = UNHANDLED; 2086 break; 2087 } 2088 return (handled); 2089 } 2090 2091 static bool 2092 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) 2093 { 2094 2095 if (apic_access_virtualization(vmx, vcpuid) && 2096 (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) 2097 return (true); 2098 else 2099 return (false); 2100 } 2101 2102 static int 2103 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2104 { 2105 uint64_t qual; 2106 int access_type, offset, allowed; 2107 2108 if (!apic_access_virtualization(vmx, vcpuid)) 2109 return (UNHANDLED); 2110 2111 qual = vmexit->u.vmx.exit_qualification; 2112 access_type = APIC_ACCESS_TYPE(qual); 2113 offset = APIC_ACCESS_OFFSET(qual); 2114 2115 allowed = 0; 2116 if (access_type == 0) { 2117 /* 2118 * Read data access to the following registers is expected. 
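 * These reads are forwarded below via vmexit_inst_emul() so that the
 * load is satisfied by the instruction emulator (ultimately against the
 * vLAPIC state) instead of by the APIC-access page itself.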
2119 */ 2120 switch (offset) { 2121 case APIC_OFFSET_APR: 2122 case APIC_OFFSET_PPR: 2123 case APIC_OFFSET_RRR: 2124 case APIC_OFFSET_CMCI_LVT: 2125 case APIC_OFFSET_TIMER_CCR: 2126 allowed = 1; 2127 break; 2128 default: 2129 break; 2130 } 2131 } else if (access_type == 1) { 2132 /* 2133 * Write data access to the following registers is expected. 2134 */ 2135 switch (offset) { 2136 case APIC_OFFSET_VER: 2137 case APIC_OFFSET_APR: 2138 case APIC_OFFSET_PPR: 2139 case APIC_OFFSET_RRR: 2140 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: 2141 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: 2142 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: 2143 case APIC_OFFSET_CMCI_LVT: 2144 case APIC_OFFSET_TIMER_CCR: 2145 allowed = 1; 2146 break; 2147 default: 2148 break; 2149 } 2150 } 2151 2152 if (allowed) { 2153 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, 2154 VIE_INVALID_GLA); 2155 } 2156 2157 /* 2158 * Regardless of whether the APIC-access is allowed this handler 2159 * always returns UNHANDLED: 2160 * - if the access is allowed then it is handled by emulating the 2161 * instruction that caused the VM-exit (outside the critical section) 2162 * - if the access is not allowed then it will be converted to an 2163 * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 2164 */ 2165 return (UNHANDLED); 2166 } 2167 2168 static enum task_switch_reason 2169 vmx_task_switch_reason(uint64_t qual) 2170 { 2171 int reason; 2172 2173 reason = (qual >> 30) & 0x3; 2174 switch (reason) { 2175 case 0: 2176 return (TSR_CALL); 2177 case 1: 2178 return (TSR_IRET); 2179 case 2: 2180 return (TSR_JMP); 2181 case 3: 2182 return (TSR_IDT_GATE); 2183 default: 2184 panic("%s: invalid reason %d", __func__, reason); 2185 } 2186 } 2187 2188 static int 2189 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) 2190 { 2191 int error; 2192 2193 if (lapic_msr(num)) 2194 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); 2195 else 2196 error = vmx_wrmsr(vmx, vcpuid, num, val, retu); 2197 2198 return (error); 2199 } 2200 2201 static int 2202 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) 2203 { 2204 struct vmxctx *vmxctx; 2205 uint64_t result; 2206 uint32_t eax, edx; 2207 int error; 2208 2209 if (lapic_msr(num)) 2210 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); 2211 else 2212 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); 2213 2214 if (error == 0) { 2215 eax = result; 2216 vmxctx = &vmx->ctx[vcpuid]; 2217 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); 2218 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); 2219 2220 edx = result >> 32; 2221 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); 2222 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); 2223 } 2224 2225 return (error); 2226 } 2227 2228 static int 2229 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 2230 { 2231 int error, errcode, errcode_valid, handled, in; 2232 struct vmxctx *vmxctx; 2233 struct vlapic *vlapic; 2234 struct vm_inout_str *vis; 2235 struct vm_task_switch *ts; 2236 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; 2237 uint32_t intr_type, intr_vec, reason; 2238 uint64_t exitintinfo, qual, gpa; 2239 bool retu; 2240 2241 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); 2242 CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); 2243 2244 handled = UNHANDLED; 2245 vmxctx = &vmx->ctx[vcpu]; 2246 2247 qual = vmexit->u.vmx.exit_qualification; 2248 reason = vmexit->u.vmx.exit_reason; 2249 
vmexit->exitcode = VM_EXITCODE_BOGUS; 2250 2251 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 2252 SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); 2253 2254 /* 2255 * VM-entry failures during or after loading guest state. 2256 * 2257 * These VM-exits are uncommon but must be handled specially 2258 * as most VM-exit fields are not populated as usual. 2259 */ 2260 if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { 2261 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); 2262 __asm __volatile("int $18"); 2263 return (1); 2264 } 2265 2266 /* 2267 * VM exits that can be triggered during event delivery need to 2268 * be handled specially by re-injecting the event if the IDT 2269 * vectoring information field's valid bit is set. 2270 * 2271 * See "Information for VM Exits During Event Delivery" in Intel SDM 2272 * for details. 2273 */ 2274 idtvec_info = vmcs_idt_vectoring_info(); 2275 if (idtvec_info & VMCS_IDT_VEC_VALID) { 2276 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 2277 exitintinfo = idtvec_info; 2278 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2279 idtvec_err = vmcs_idt_vectoring_err(); 2280 exitintinfo |= (uint64_t)idtvec_err << 32; 2281 } 2282 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); 2283 KASSERT(error == 0, ("%s: vm_set_intinfo error %d", 2284 __func__, error)); 2285 2286 /* 2287 * If 'virtual NMIs' are being used and the VM-exit 2288 * happened while injecting an NMI during the previous 2289 * VM-entry, then clear "blocking by NMI" in the 2290 * Guest Interruptibility-State so the NMI can be 2291 * reinjected on the subsequent VM-entry. 2292 * 2293 * However, if the NMI was being delivered through a task 2294 * gate, then the new task must start execution with NMIs 2295 * blocked so don't clear NMI blocking in this case. 2296 */ 2297 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2298 if (intr_type == VMCS_INTR_T_NMI) { 2299 if (reason != EXIT_REASON_TASK_SWITCH) 2300 vmx_clear_nmi_blocking(vmx, vcpu); 2301 else 2302 vmx_assert_nmi_blocking(vmx, vcpu); 2303 } 2304 2305 /* 2306 * Update VM-entry instruction length if the event being 2307 * delivered was a software interrupt or software exception. 2308 */ 2309 if (intr_type == VMCS_INTR_T_SWINTR || 2310 intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || 2311 intr_type == VMCS_INTR_T_SWEXCEPTION) { 2312 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2313 } 2314 } 2315 2316 switch (reason) { 2317 case EXIT_REASON_TASK_SWITCH: 2318 ts = &vmexit->u.task_switch; 2319 ts->tsssel = qual & 0xffff; 2320 ts->reason = vmx_task_switch_reason(qual); 2321 ts->ext = 0; 2322 ts->errcode_valid = 0; 2323 vmx_paging_info(&ts->paging); 2324 /* 2325 * If the task switch was due to a CALL, JMP, IRET, software 2326 * interrupt (INT n) or software exception (INT3, INTO), 2327 * then the saved %rip references the instruction that caused 2328 * the task switch. The instruction length field in the VMCS 2329 * is valid in this case. 2330 * 2331 * In all other cases (e.g., NMI, hardware exception) the 2332 * saved %rip is one that would have been saved in the old TSS 2333 * had the task switch completed normally so the instruction 2334 * length field is not needed in this case and is explicitly 2335 * set to 0. 
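 * The task switch itself is not emulated in the kernel; the exit is
 * reported to userland as VM_EXITCODE_TASK_SWITCH below and completed
 * there, with a non-zero 'inst_length' indicating that the guest %rip
 * still points at the instruction that initiated the switch.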
2336 */ 2337 if (ts->reason == TSR_IDT_GATE) { 2338 KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, 2339 ("invalid idtvec_info %#x for IDT task switch", 2340 idtvec_info)); 2341 intr_type = idtvec_info & VMCS_INTR_T_MASK; 2342 if (intr_type != VMCS_INTR_T_SWINTR && 2343 intr_type != VMCS_INTR_T_SWEXCEPTION && 2344 intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { 2345 /* Task switch triggered by external event */ 2346 ts->ext = 1; 2347 vmexit->inst_length = 0; 2348 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 2349 ts->errcode_valid = 1; 2350 ts->errcode = vmcs_idt_vectoring_err(); 2351 } 2352 } 2353 } 2354 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; 2355 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); 2356 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " 2357 "%s errcode 0x%016lx", ts->reason, ts->tsssel, 2358 ts->ext ? "external" : "internal", 2359 ((uint64_t)ts->errcode << 32) | ts->errcode_valid); 2360 break; 2361 case EXIT_REASON_CR_ACCESS: 2362 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 2363 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); 2364 switch (qual & 0xf) { 2365 case 0: 2366 handled = vmx_emulate_cr0_access(vmx, vcpu, qual); 2367 break; 2368 case 4: 2369 handled = vmx_emulate_cr4_access(vmx, vcpu, qual); 2370 break; 2371 case 8: 2372 handled = vmx_emulate_cr8_access(vmx, vcpu, qual); 2373 break; 2374 } 2375 break; 2376 case EXIT_REASON_RDMSR: 2377 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 2378 retu = false; 2379 ecx = vmxctx->guest_rcx; 2380 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); 2381 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); 2382 error = emulate_rdmsr(vmx, vcpu, ecx, &retu); 2383 if (error) { 2384 vmexit->exitcode = VM_EXITCODE_RDMSR; 2385 vmexit->u.msr.code = ecx; 2386 } else if (!retu) { 2387 handled = HANDLED; 2388 } else { 2389 /* Return to userspace with a valid exitcode */ 2390 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2391 ("emulate_rdmsr retu with bogus exitcode")); 2392 } 2393 break; 2394 case EXIT_REASON_WRMSR: 2395 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 2396 retu = false; 2397 eax = vmxctx->guest_rax; 2398 ecx = vmxctx->guest_rcx; 2399 edx = vmxctx->guest_rdx; 2400 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", 2401 ecx, (uint64_t)edx << 32 | eax); 2402 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, 2403 (uint64_t)edx << 32 | eax); 2404 error = emulate_wrmsr(vmx, vcpu, ecx, 2405 (uint64_t)edx << 32 | eax, &retu); 2406 if (error) { 2407 vmexit->exitcode = VM_EXITCODE_WRMSR; 2408 vmexit->u.msr.code = ecx; 2409 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 2410 } else if (!retu) { 2411 handled = HANDLED; 2412 } else { 2413 /* Return to userspace with a valid exitcode */ 2414 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, 2415 ("emulate_wrmsr retu with bogus exitcode")); 2416 } 2417 break; 2418 case EXIT_REASON_HLT: 2419 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 2420 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); 2421 vmexit->exitcode = VM_EXITCODE_HLT; 2422 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2423 if (virtual_interrupt_delivery) 2424 vmexit->u.hlt.intr_status = 2425 vmcs_read(VMCS_GUEST_INTR_STATUS); 2426 else 2427 vmexit->u.hlt.intr_status = 0; 2428 break; 2429 case EXIT_REASON_MTF: 2430 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 2431 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); 2432 vmexit->exitcode = VM_EXITCODE_MTRAP; 2433 vmexit->inst_length = 0; 2434 break; 2435 case EXIT_REASON_PAUSE: 2436 vmm_stat_incr(vmx->vm, 
vcpu, VMEXIT_PAUSE, 1); 2437 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); 2438 vmexit->exitcode = VM_EXITCODE_PAUSE; 2439 break; 2440 case EXIT_REASON_INTR_WINDOW: 2441 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 2442 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); 2443 vmx_clear_int_window_exiting(vmx, vcpu); 2444 return (1); 2445 case EXIT_REASON_EXT_INTR: 2446 /* 2447 * External interrupts serve only to cause VM exits and allow 2448 * the host interrupt handler to run. 2449 * 2450 * If this external interrupt triggers a virtual interrupt 2451 * to a VM, then that state will be recorded by the 2452 * host interrupt handler in the VM's softc. We will inject 2453 * this virtual interrupt during the subsequent VM enter. 2454 */ 2455 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2456 SDT_PROBE4(vmm, vmx, exit, interrupt, 2457 vmx, vcpu, vmexit, intr_info); 2458 2459 /* 2460 * XXX: Ignore this exit if VMCS_INTR_VALID is not set. 2461 * This appears to be a bug in VMware Fusion? 2462 */ 2463 if (!(intr_info & VMCS_INTR_VALID)) 2464 return (1); 2465 KASSERT((intr_info & VMCS_INTR_VALID) != 0 && 2466 (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, 2467 ("VM exit interruption info invalid: %#x", intr_info)); 2468 vmx_trigger_hostintr(intr_info & 0xff); 2469 2470 /* 2471 * This is special. We want to treat this as an 'handled' 2472 * VM-exit but not increment the instruction pointer. 2473 */ 2474 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 2475 return (1); 2476 case EXIT_REASON_NMI_WINDOW: 2477 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); 2478 /* Exit to allow the pending virtual NMI to be injected */ 2479 if (vm_nmi_pending(vmx->vm, vcpu)) 2480 vmx_inject_nmi(vmx, vcpu); 2481 vmx_clear_nmi_window_exiting(vmx, vcpu); 2482 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 2483 return (1); 2484 case EXIT_REASON_INOUT: 2485 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 2486 vmexit->exitcode = VM_EXITCODE_INOUT; 2487 vmexit->u.inout.bytes = (qual & 0x7) + 1; 2488 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; 2489 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 2490 vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; 2491 vmexit->u.inout.port = (uint16_t)(qual >> 16); 2492 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 2493 if (vmexit->u.inout.string) { 2494 inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); 2495 vmexit->exitcode = VM_EXITCODE_INOUT_STR; 2496 vis = &vmexit->u.inout_str; 2497 vmx_paging_info(&vis->paging); 2498 vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); 2499 vis->cr0 = vmcs_read(VMCS_GUEST_CR0); 2500 vis->index = inout_str_index(vmx, vcpu, in); 2501 vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); 2502 vis->addrsize = inout_str_addrsize(inst_info); 2503 inout_str_seginfo(vmx, vcpu, inst_info, in, vis); 2504 } 2505 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); 2506 break; 2507 case EXIT_REASON_CPUID: 2508 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 2509 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); 2510 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 2511 break; 2512 case EXIT_REASON_EXCEPTION: 2513 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); 2514 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2515 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2516 ("VM exit interruption info invalid: %#x", intr_info)); 2517 2518 intr_vec = intr_info & 0xff; 2519 intr_type = intr_info & VMCS_INTR_T_MASK; 2520 2521 /* 2522 * If Virtual NMIs control is 1 and the VM-exit is due to a 2523 * fault encountered during the execution of IRET then we must 2524 * restore the state of "virtual-NMI blocking" before resuming 2525 * the guest. 2526 * 2527 * See "Resuming Guest Software after Handling an Exception". 2528 * See "Information for VM Exits Due to Vectored Events". 2529 */ 2530 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2531 (intr_vec != IDT_DF) && 2532 (intr_info & EXIT_QUAL_NMIUDTI) != 0) 2533 vmx_restore_nmi_blocking(vmx, vcpu); 2534 2535 /* 2536 * The NMI has already been handled in vmx_exit_handle_nmi(). 2537 */ 2538 if (intr_type == VMCS_INTR_T_NMI) 2539 return (1); 2540 2541 /* 2542 * Call the machine check handler by hand. Also don't reflect 2543 * the machine check back into the guest. 2544 */ 2545 if (intr_vec == IDT_MC) { 2546 VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); 2547 __asm __volatile("int $18"); 2548 return (1); 2549 } 2550 2551 /* 2552 * If the hypervisor has requested user exits for 2553 * debug exceptions, bounce them out to userland. 2554 */ 2555 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP && 2556 (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) { 2557 vmexit->exitcode = VM_EXITCODE_BPT; 2558 vmexit->u.bpt.inst_length = vmexit->inst_length; 2559 vmexit->inst_length = 0; 2560 break; 2561 } 2562 2563 if (intr_vec == IDT_PF) { 2564 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); 2565 KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", 2566 __func__, error)); 2567 } 2568 2569 /* 2570 * Software exceptions exhibit trap-like behavior. This in 2571 * turn requires populating the VM-entry instruction length 2572 * so that the %rip in the trap frame is past the INT3/INTO 2573 * instruction. 
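 * For example, a 1-byte INT3 at guest %rip 0x1000 must push 0x1001 as
 * the return %rip in its exception frame; writing the instruction
 * length below lets the processor compute that value when the #BP is
 * re-injected on the next VM entry.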
2574 */ 2575 if (intr_type == VMCS_INTR_T_SWEXCEPTION) 2576 vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 2577 2578 /* Reflect all other exceptions back into the guest */ 2579 errcode_valid = errcode = 0; 2580 if (intr_info & VMCS_INTR_DEL_ERRCODE) { 2581 errcode_valid = 1; 2582 errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); 2583 } 2584 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " 2585 "the guest", intr_vec, errcode); 2586 SDT_PROBE5(vmm, vmx, exit, exception, 2587 vmx, vcpu, vmexit, intr_vec, errcode); 2588 error = vm_inject_exception(vmx->vm, vcpu, intr_vec, 2589 errcode_valid, errcode, 0); 2590 KASSERT(error == 0, ("%s: vm_inject_exception error %d", 2591 __func__, error)); 2592 return (1); 2593 2594 case EXIT_REASON_EPT_FAULT: 2595 /* 2596 * If 'gpa' lies within the address space allocated to 2597 * memory then this must be a nested page fault otherwise 2598 * this must be an instruction that accesses MMIO space. 2599 */ 2600 gpa = vmcs_gpa(); 2601 if (vm_mem_allocated(vmx->vm, vcpu, gpa) || 2602 apic_access_fault(vmx, vcpu, gpa)) { 2603 vmexit->exitcode = VM_EXITCODE_PAGING; 2604 vmexit->inst_length = 0; 2605 vmexit->u.paging.gpa = gpa; 2606 vmexit->u.paging.fault_type = ept_fault_type(qual); 2607 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 2608 SDT_PROBE5(vmm, vmx, exit, nestedfault, 2609 vmx, vcpu, vmexit, gpa, qual); 2610 } else if (ept_emulation_fault(qual)) { 2611 vmexit_inst_emul(vmexit, gpa, vmcs_gla()); 2612 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); 2613 SDT_PROBE4(vmm, vmx, exit, mmiofault, 2614 vmx, vcpu, vmexit, gpa); 2615 } 2616 /* 2617 * If Virtual NMIs control is 1 and the VM-exit is due to an 2618 * EPT fault during the execution of IRET then we must restore 2619 * the state of "virtual-NMI blocking" before resuming. 2620 * 2621 * See description of "NMI unblocking due to IRET" in 2622 * "Exit Qualification for EPT Violations". 2623 */ 2624 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && 2625 (qual & EXIT_QUAL_NMIUDTI) != 0) 2626 vmx_restore_nmi_blocking(vmx, vcpu); 2627 break; 2628 case EXIT_REASON_VIRTUALIZED_EOI: 2629 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; 2630 vmexit->u.ioapic_eoi.vector = qual & 0xFF; 2631 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); 2632 vmexit->inst_length = 0; /* trap-like */ 2633 break; 2634 case EXIT_REASON_APIC_ACCESS: 2635 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); 2636 handled = vmx_handle_apic_access(vmx, vcpu, vmexit); 2637 break; 2638 case EXIT_REASON_APIC_WRITE: 2639 /* 2640 * APIC-write VM exit is trap-like so the %rip is already 2641 * pointing to the next instruction. 
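 * Clearing 'inst_length' below keeps the common "handled" path at the
 * bottom of this function from advancing %rip past yet another
 * instruction.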
2642 */ 2643 vmexit->inst_length = 0; 2644 vlapic = vm_lapic(vmx->vm, vcpu); 2645 SDT_PROBE4(vmm, vmx, exit, apicwrite, 2646 vmx, vcpu, vmexit, vlapic); 2647 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); 2648 break; 2649 case EXIT_REASON_XSETBV: 2650 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); 2651 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); 2652 break; 2653 case EXIT_REASON_MONITOR: 2654 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); 2655 vmexit->exitcode = VM_EXITCODE_MONITOR; 2656 break; 2657 case EXIT_REASON_MWAIT: 2658 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); 2659 vmexit->exitcode = VM_EXITCODE_MWAIT; 2660 break; 2661 case EXIT_REASON_VMCALL: 2662 case EXIT_REASON_VMCLEAR: 2663 case EXIT_REASON_VMLAUNCH: 2664 case EXIT_REASON_VMPTRLD: 2665 case EXIT_REASON_VMPTRST: 2666 case EXIT_REASON_VMREAD: 2667 case EXIT_REASON_VMRESUME: 2668 case EXIT_REASON_VMWRITE: 2669 case EXIT_REASON_VMXOFF: 2670 case EXIT_REASON_VMXON: 2671 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); 2672 vmexit->exitcode = VM_EXITCODE_VMINSN; 2673 break; 2674 default: 2675 SDT_PROBE4(vmm, vmx, exit, unknown, 2676 vmx, vcpu, vmexit, reason); 2677 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 2678 break; 2679 } 2680 2681 if (handled) { 2682 /* 2683 * It is possible that control is returned to userland 2684 * even though we were able to handle the VM exit in the 2685 * kernel. 2686 * 2687 * In such a case we want to make sure that the userland 2688 * restarts guest execution at the instruction *after* 2689 * the one we just processed. Therefore we update the 2690 * guest rip in the VMCS and in 'vmexit'. 2691 */ 2692 vmexit->rip += vmexit->inst_length; 2693 vmexit->inst_length = 0; 2694 vmcs_write(VMCS_GUEST_RIP, vmexit->rip); 2695 } else { 2696 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 2697 /* 2698 * If this VM exit was not claimed by anybody then 2699 * treat it as a generic VMX exit. 2700 */ 2701 vmexit->exitcode = VM_EXITCODE_VMX; 2702 vmexit->u.vmx.status = VM_SUCCESS; 2703 vmexit->u.vmx.inst_type = 0; 2704 vmexit->u.vmx.inst_error = 0; 2705 } else { 2706 /* 2707 * The exitcode and collateral have been populated. 2708 * The VM exit will be processed further in userland. 2709 */ 2710 } 2711 } 2712 2713 SDT_PROBE4(vmm, vmx, exit, return, 2714 vmx, vcpu, vmexit, handled); 2715 return (handled); 2716 } 2717 2718 static __inline void 2719 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) 2720 { 2721 2722 KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, 2723 ("vmx_exit_inst_error: invalid inst_fail_status %d", 2724 vmxctx->inst_fail_status)); 2725 2726 vmexit->inst_length = 0; 2727 vmexit->exitcode = VM_EXITCODE_VMX; 2728 vmexit->u.vmx.status = vmxctx->inst_fail_status; 2729 vmexit->u.vmx.inst_error = vmcs_instruction_error(); 2730 vmexit->u.vmx.exit_reason = ~0; 2731 vmexit->u.vmx.exit_qualification = ~0; 2732 2733 switch (rc) { 2734 case VMX_VMRESUME_ERROR: 2735 case VMX_VMLAUNCH_ERROR: 2736 case VMX_INVEPT_ERROR: 2737 vmexit->u.vmx.inst_type = rc; 2738 break; 2739 default: 2740 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); 2741 } 2742 } 2743 2744 /* 2745 * If the NMI-exiting VM execution control is set to '1' then an NMI in 2746 * non-root operation causes a VM-exit. NMI blocking is in effect so it is 2747 * sufficient to simply vector to the NMI handler via a software interrupt. 
2748 * However, this must be done before maskable interrupts are enabled 2749 * otherwise the "iret" issued by an interrupt handler will incorrectly 2750 * clear NMI blocking. 2751 */ 2752 static __inline void 2753 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) 2754 { 2755 uint32_t intr_info; 2756 2757 KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); 2758 2759 if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) 2760 return; 2761 2762 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); 2763 KASSERT((intr_info & VMCS_INTR_VALID) != 0, 2764 ("VM exit interruption info invalid: %#x", intr_info)); 2765 2766 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { 2767 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " 2768 "to NMI has invalid vector: %#x", intr_info)); 2769 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); 2770 __asm __volatile("int $2"); 2771 } 2772 } 2773 2774 static __inline void 2775 vmx_dr_enter_guest(struct vmxctx *vmxctx) 2776 { 2777 register_t rflags; 2778 2779 /* Save host control debug registers. */ 2780 vmxctx->host_dr7 = rdr7(); 2781 vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 2782 2783 /* 2784 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 2785 * exceptions in the host based on the guest DRx values. The 2786 * guest DR7 and DEBUGCTL are saved/restored in the VMCS. 2787 */ 2788 load_dr7(0); 2789 wrmsr(MSR_DEBUGCTLMSR, 0); 2790 2791 /* 2792 * Disable single stepping the kernel to avoid corrupting the 2793 * guest DR6. A debugger might still be able to corrupt the 2794 * guest DR6 by setting a breakpoint after this point and then 2795 * single stepping. 2796 */ 2797 rflags = read_rflags(); 2798 vmxctx->host_tf = rflags & PSL_T; 2799 write_rflags(rflags & ~PSL_T); 2800 2801 /* Save host debug registers. */ 2802 vmxctx->host_dr0 = rdr0(); 2803 vmxctx->host_dr1 = rdr1(); 2804 vmxctx->host_dr2 = rdr2(); 2805 vmxctx->host_dr3 = rdr3(); 2806 vmxctx->host_dr6 = rdr6(); 2807 2808 /* Restore guest debug registers. */ 2809 load_dr0(vmxctx->guest_dr0); 2810 load_dr1(vmxctx->guest_dr1); 2811 load_dr2(vmxctx->guest_dr2); 2812 load_dr3(vmxctx->guest_dr3); 2813 load_dr6(vmxctx->guest_dr6); 2814 } 2815 2816 static __inline void 2817 vmx_dr_leave_guest(struct vmxctx *vmxctx) 2818 { 2819 2820 /* Save guest debug registers. */ 2821 vmxctx->guest_dr0 = rdr0(); 2822 vmxctx->guest_dr1 = rdr1(); 2823 vmxctx->guest_dr2 = rdr2(); 2824 vmxctx->guest_dr3 = rdr3(); 2825 vmxctx->guest_dr6 = rdr6(); 2826 2827 /* 2828 * Restore host debug registers. Restore DR7, DEBUGCTL, and 2829 * PSL_T last. 
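 * The ordering matters: while DR7 and DEBUGCTL are still clear, the
 * loads of DR0-DR3 and DR6 cannot trigger debug exceptions, so the
 * host's breakpoint enables, branch tracing and single-step flag are
 * only re-armed once the host register values are back in place.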
2830 */ 2831 load_dr0(vmxctx->host_dr0); 2832 load_dr1(vmxctx->host_dr1); 2833 load_dr2(vmxctx->host_dr2); 2834 load_dr3(vmxctx->host_dr3); 2835 load_dr6(vmxctx->host_dr6); 2836 wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); 2837 load_dr7(vmxctx->host_dr7); 2838 write_rflags(read_rflags() | vmxctx->host_tf); 2839 } 2840 2841 static int 2842 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, 2843 struct vm_eventinfo *evinfo) 2844 { 2845 int rc, handled, launched; 2846 struct vmx *vmx; 2847 struct vm *vm; 2848 struct vmxctx *vmxctx; 2849 struct vmcs *vmcs; 2850 struct vm_exit *vmexit; 2851 struct vlapic *vlapic; 2852 uint32_t exit_reason; 2853 struct region_descriptor gdtr, idtr; 2854 uint16_t ldt_sel; 2855 2856 vmx = arg; 2857 vm = vmx->vm; 2858 vmcs = &vmx->vmcs[vcpu]; 2859 vmxctx = &vmx->ctx[vcpu]; 2860 vlapic = vm_lapic(vm, vcpu); 2861 vmexit = vm_exitinfo(vm, vcpu); 2862 launched = 0; 2863 2864 KASSERT(vmxctx->pmap == pmap, 2865 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 2866 2867 vmx_msr_guest_enter(vmx, vcpu); 2868 2869 VMPTRLD(vmcs); 2870 2871 /* 2872 * XXX 2873 * We do this every time because we may setup the virtual machine 2874 * from a different process than the one that actually runs it. 2875 * 2876 * If the life of a virtual machine was spent entirely in the context 2877 * of a single process we could do this once in vmx_vminit(). 2878 */ 2879 vmcs_write(VMCS_HOST_CR3, rcr3()); 2880 2881 vmcs_write(VMCS_GUEST_RIP, rip); 2882 vmx_set_pcpu_defaults(vmx, vcpu, pmap); 2883 do { 2884 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " 2885 "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); 2886 2887 handled = UNHANDLED; 2888 /* 2889 * Interrupts are disabled from this point on until the 2890 * guest starts executing. This is done for the following 2891 * reasons: 2892 * 2893 * If an AST is asserted on this thread after the check below, 2894 * then the IPI_AST notification will not be lost, because it 2895 * will cause a VM exit due to external interrupt as soon as 2896 * the guest state is loaded. 2897 * 2898 * A posted interrupt after 'vmx_inject_interrupts()' will 2899 * not be "lost" because it will be held pending in the host 2900 * APIC because interrupts are disabled. The pending interrupt 2901 * will be recognized as soon as the guest state is loaded. 2902 * 2903 * The same reasoning applies to the IPI generated by 2904 * pmap_invalidate_ept(). 2905 */ 2906 disable_intr(); 2907 vmx_inject_interrupts(vmx, vcpu, vlapic, rip); 2908 2909 /* 2910 * Check for vcpu suspension after injecting events because 2911 * vmx_inject_interrupts() can suspend the vcpu due to a 2912 * triple fault. 2913 */ 2914 if (vcpu_suspended(evinfo)) { 2915 enable_intr(); 2916 vm_exit_suspended(vmx->vm, vcpu, rip); 2917 break; 2918 } 2919 2920 if (vcpu_rendezvous_pending(evinfo)) { 2921 enable_intr(); 2922 vm_exit_rendezvous(vmx->vm, vcpu, rip); 2923 break; 2924 } 2925 2926 if (vcpu_reqidle(evinfo)) { 2927 enable_intr(); 2928 vm_exit_reqidle(vmx->vm, vcpu, rip); 2929 break; 2930 } 2931 2932 if (vcpu_should_yield(vm, vcpu)) { 2933 enable_intr(); 2934 vm_exit_astpending(vmx->vm, vcpu, rip); 2935 vmx_astpending_trace(vmx, vcpu, rip); 2936 handled = HANDLED; 2937 break; 2938 } 2939 2940 if (vcpu_debugged(vm, vcpu)) { 2941 enable_intr(); 2942 vm_exit_debug(vmx->vm, vcpu, rip); 2943 break; 2944 } 2945 2946 /* 2947 * VM exits restore the base address but not the 2948 * limits of GDTR and IDTR. The VMCS only stores the 2949 * base address, so VM exits set the limits to 0xffff. 
2950 * Save and restore the full GDTR and IDTR to restore 2951 * the limits. 2952 * 2953 * The VMCS does not save the LDTR at all, and VM 2954 * exits clear LDTR as if a NULL selector were loaded. 2955 * The userspace hypervisor probably doesn't use a 2956 * LDT, but save and restore it to be safe. 2957 */ 2958 sgdt(&gdtr); 2959 sidt(&idtr); 2960 ldt_sel = sldt(); 2961 2962 vmx_run_trace(vmx, vcpu); 2963 vmx_dr_enter_guest(vmxctx); 2964 rc = vmx_enter_guest(vmxctx, vmx, launched); 2965 vmx_dr_leave_guest(vmxctx); 2966 2967 bare_lgdt(&gdtr); 2968 lidt(&idtr); 2969 lldt(ldt_sel); 2970 2971 /* Collect some information for VM exit processing */ 2972 vmexit->rip = rip = vmcs_guest_rip(); 2973 vmexit->inst_length = vmexit_instruction_length(); 2974 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 2975 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 2976 2977 /* Update 'nextrip' */ 2978 vmx->state[vcpu].nextrip = rip; 2979 2980 if (rc == VMX_GUEST_VMEXIT) { 2981 vmx_exit_handle_nmi(vmx, vcpu, vmexit); 2982 enable_intr(); 2983 handled = vmx_exit_process(vmx, vcpu, vmexit); 2984 } else { 2985 enable_intr(); 2986 vmx_exit_inst_error(vmxctx, rc, vmexit); 2987 } 2988 launched = 1; 2989 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 2990 rip = vmexit->rip; 2991 } while (handled); 2992 2993 /* 2994 * If a VM exit has been handled then the exitcode must be BOGUS 2995 * If a VM exit is not handled then the exitcode must not be BOGUS 2996 */ 2997 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 2998 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 2999 panic("Mismatch between handled (%d) and exitcode (%d)", 3000 handled, vmexit->exitcode); 3001 } 3002 3003 if (!handled) 3004 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); 3005 3006 VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", 3007 vmexit->exitcode); 3008 3009 VMCLEAR(vmcs); 3010 vmx_msr_guest_exit(vmx, vcpu); 3011 3012 return (0); 3013 } 3014 3015 static void 3016 vmx_vmcleanup(void *arg) 3017 { 3018 int i; 3019 struct vmx *vmx = arg; 3020 uint16_t maxcpus; 3021 3022 if (apic_access_virtualization(vmx, 0)) 3023 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3024 3025 maxcpus = vm_get_maxcpus(vmx->vm); 3026 for (i = 0; i < maxcpus; i++) 3027 vpid_free(vmx->state[i].vpid); 3028 3029 free(vmx, M_VMX); 3030 3031 return; 3032 } 3033 3034 static register_t * 3035 vmxctx_regptr(struct vmxctx *vmxctx, int reg) 3036 { 3037 3038 switch (reg) { 3039 case VM_REG_GUEST_RAX: 3040 return (&vmxctx->guest_rax); 3041 case VM_REG_GUEST_RBX: 3042 return (&vmxctx->guest_rbx); 3043 case VM_REG_GUEST_RCX: 3044 return (&vmxctx->guest_rcx); 3045 case VM_REG_GUEST_RDX: 3046 return (&vmxctx->guest_rdx); 3047 case VM_REG_GUEST_RSI: 3048 return (&vmxctx->guest_rsi); 3049 case VM_REG_GUEST_RDI: 3050 return (&vmxctx->guest_rdi); 3051 case VM_REG_GUEST_RBP: 3052 return (&vmxctx->guest_rbp); 3053 case VM_REG_GUEST_R8: 3054 return (&vmxctx->guest_r8); 3055 case VM_REG_GUEST_R9: 3056 return (&vmxctx->guest_r9); 3057 case VM_REG_GUEST_R10: 3058 return (&vmxctx->guest_r10); 3059 case VM_REG_GUEST_R11: 3060 return (&vmxctx->guest_r11); 3061 case VM_REG_GUEST_R12: 3062 return (&vmxctx->guest_r12); 3063 case VM_REG_GUEST_R13: 3064 return (&vmxctx->guest_r13); 3065 case VM_REG_GUEST_R14: 3066 return (&vmxctx->guest_r14); 3067 case VM_REG_GUEST_R15: 3068 return (&vmxctx->guest_r15); 3069 case VM_REG_GUEST_CR2: 3070 return (&vmxctx->guest_cr2); 3071 case VM_REG_GUEST_DR0: 3072 return (&vmxctx->guest_dr0); 3073 case 
VM_REG_GUEST_DR1: 3074 return (&vmxctx->guest_dr1); 3075 case VM_REG_GUEST_DR2: 3076 return (&vmxctx->guest_dr2); 3077 case VM_REG_GUEST_DR3: 3078 return (&vmxctx->guest_dr3); 3079 case VM_REG_GUEST_DR6: 3080 return (&vmxctx->guest_dr6); 3081 default: 3082 break; 3083 } 3084 return (NULL); 3085 } 3086 3087 static int 3088 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 3089 { 3090 register_t *regp; 3091 3092 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3093 *retval = *regp; 3094 return (0); 3095 } else 3096 return (EINVAL); 3097 } 3098 3099 static int 3100 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 3101 { 3102 register_t *regp; 3103 3104 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 3105 *regp = val; 3106 return (0); 3107 } else 3108 return (EINVAL); 3109 } 3110 3111 static int 3112 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) 3113 { 3114 uint64_t gi; 3115 int error; 3116 3117 error = vmcs_getreg(&vmx->vmcs[vcpu], running, 3118 VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); 3119 *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; 3120 return (error); 3121 } 3122 3123 static int 3124 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) 3125 { 3126 struct vmcs *vmcs; 3127 uint64_t gi; 3128 int error, ident; 3129 3130 /* 3131 * Forcing the vcpu into an interrupt shadow is not supported. 3132 */ 3133 if (val) { 3134 error = EINVAL; 3135 goto done; 3136 } 3137 3138 vmcs = &vmx->vmcs[vcpu]; 3139 ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); 3140 error = vmcs_getreg(vmcs, running, ident, &gi); 3141 if (error == 0) { 3142 gi &= ~HWINTR_BLOCKING; 3143 error = vmcs_setreg(vmcs, running, ident, gi); 3144 } 3145 done: 3146 VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, 3147 error ? 
"failed" : "succeeded"); 3148 return (error); 3149 } 3150 3151 static int 3152 vmx_shadow_reg(int reg) 3153 { 3154 int shreg; 3155 3156 shreg = -1; 3157 3158 switch (reg) { 3159 case VM_REG_GUEST_CR0: 3160 shreg = VMCS_CR0_SHADOW; 3161 break; 3162 case VM_REG_GUEST_CR4: 3163 shreg = VMCS_CR4_SHADOW; 3164 break; 3165 default: 3166 break; 3167 } 3168 3169 return (shreg); 3170 } 3171 3172 static int 3173 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 3174 { 3175 int running, hostcpu; 3176 struct vmx *vmx = arg; 3177 3178 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3179 if (running && hostcpu != curcpu) 3180 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 3181 3182 if (reg == VM_REG_GUEST_INTR_SHADOW) 3183 return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); 3184 3185 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 3186 return (0); 3187 3188 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 3189 } 3190 3191 static int 3192 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 3193 { 3194 int error, hostcpu, running, shadow; 3195 uint64_t ctls; 3196 pmap_t pmap; 3197 struct vmx *vmx = arg; 3198 3199 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3200 if (running && hostcpu != curcpu) 3201 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 3202 3203 if (reg == VM_REG_GUEST_INTR_SHADOW) 3204 return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); 3205 3206 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 3207 return (0); 3208 3209 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 3210 3211 if (error == 0) { 3212 /* 3213 * If the "load EFER" VM-entry control is 1 then the 3214 * value of EFER.LMA must be identical to "IA-32e mode guest" 3215 * bit in the VM-entry control. 3216 */ 3217 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 3218 (reg == VM_REG_GUEST_EFER)) { 3219 vmcs_getreg(&vmx->vmcs[vcpu], running, 3220 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 3221 if (val & EFER_LMA) 3222 ctls |= VM_ENTRY_GUEST_LMA; 3223 else 3224 ctls &= ~VM_ENTRY_GUEST_LMA; 3225 vmcs_setreg(&vmx->vmcs[vcpu], running, 3226 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 3227 } 3228 3229 shadow = vmx_shadow_reg(reg); 3230 if (shadow > 0) { 3231 /* 3232 * Store the unmodified value in the shadow 3233 */ 3234 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 3235 VMCS_IDENT(shadow), val); 3236 } 3237 3238 if (reg == VM_REG_GUEST_CR3) { 3239 /* 3240 * Invalidate the guest vcpu's TLB mappings to emulate 3241 * the behavior of updating %cr3. 3242 * 3243 * XXX the processor retains global mappings when %cr3 3244 * is updated but vmx_invvpid() does not. 
3245 */ 3246 pmap = vmx->ctx[vcpu].pmap; 3247 vmx_invvpid(vmx, vcpu, pmap, running); 3248 } 3249 } 3250 3251 return (error); 3252 } 3253 3254 static int 3255 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3256 { 3257 int hostcpu, running; 3258 struct vmx *vmx = arg; 3259 3260 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3261 if (running && hostcpu != curcpu) 3262 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3263 3264 return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3265 } 3266 3267 static int 3268 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 3269 { 3270 int hostcpu, running; 3271 struct vmx *vmx = arg; 3272 3273 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 3274 if (running && hostcpu != curcpu) 3275 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); 3276 3277 return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); 3278 } 3279 3280 static int 3281 vmx_getcap(void *arg, int vcpu, int type, int *retval) 3282 { 3283 struct vmx *vmx = arg; 3284 int vcap; 3285 int ret; 3286 3287 ret = ENOENT; 3288 3289 vcap = vmx->cap[vcpu].set; 3290 3291 switch (type) { 3292 case VM_CAP_HALT_EXIT: 3293 if (cap_halt_exit) 3294 ret = 0; 3295 break; 3296 case VM_CAP_PAUSE_EXIT: 3297 if (cap_pause_exit) 3298 ret = 0; 3299 break; 3300 case VM_CAP_MTRAP_EXIT: 3301 if (cap_monitor_trap) 3302 ret = 0; 3303 break; 3304 case VM_CAP_UNRESTRICTED_GUEST: 3305 if (cap_unrestricted_guest) 3306 ret = 0; 3307 break; 3308 case VM_CAP_ENABLE_INVPCID: 3309 if (cap_invpcid) 3310 ret = 0; 3311 break; 3312 case VM_CAP_BPT_EXIT: 3313 ret = 0; 3314 break; 3315 default: 3316 break; 3317 } 3318 3319 if (ret == 0) 3320 *retval = (vcap & (1 << type)) ? 1 : 0; 3321 3322 return (ret); 3323 } 3324 3325 static int 3326 vmx_setcap(void *arg, int vcpu, int type, int val) 3327 { 3328 struct vmx *vmx = arg; 3329 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 3330 uint32_t baseval; 3331 uint32_t *pptr; 3332 int error; 3333 int flag; 3334 int reg; 3335 int retval; 3336 3337 retval = ENOENT; 3338 pptr = NULL; 3339 3340 switch (type) { 3341 case VM_CAP_HALT_EXIT: 3342 if (cap_halt_exit) { 3343 retval = 0; 3344 pptr = &vmx->cap[vcpu].proc_ctls; 3345 baseval = *pptr; 3346 flag = PROCBASED_HLT_EXITING; 3347 reg = VMCS_PRI_PROC_BASED_CTLS; 3348 } 3349 break; 3350 case VM_CAP_MTRAP_EXIT: 3351 if (cap_monitor_trap) { 3352 retval = 0; 3353 pptr = &vmx->cap[vcpu].proc_ctls; 3354 baseval = *pptr; 3355 flag = PROCBASED_MTF; 3356 reg = VMCS_PRI_PROC_BASED_CTLS; 3357 } 3358 break; 3359 case VM_CAP_PAUSE_EXIT: 3360 if (cap_pause_exit) { 3361 retval = 0; 3362 pptr = &vmx->cap[vcpu].proc_ctls; 3363 baseval = *pptr; 3364 flag = PROCBASED_PAUSE_EXITING; 3365 reg = VMCS_PRI_PROC_BASED_CTLS; 3366 } 3367 break; 3368 case VM_CAP_UNRESTRICTED_GUEST: 3369 if (cap_unrestricted_guest) { 3370 retval = 0; 3371 pptr = &vmx->cap[vcpu].proc_ctls2; 3372 baseval = *pptr; 3373 flag = PROCBASED2_UNRESTRICTED_GUEST; 3374 reg = VMCS_SEC_PROC_BASED_CTLS; 3375 } 3376 break; 3377 case VM_CAP_ENABLE_INVPCID: 3378 if (cap_invpcid) { 3379 retval = 0; 3380 pptr = &vmx->cap[vcpu].proc_ctls2; 3381 baseval = *pptr; 3382 flag = PROCBASED2_ENABLE_INVPCID; 3383 reg = VMCS_SEC_PROC_BASED_CTLS; 3384 } 3385 break; 3386 case VM_CAP_BPT_EXIT: 3387 retval = 0; 3388 3389 /* Don't change the bitmap if we are tracing all exceptions. 
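 * When every exception is already being intercepted the #BP bit is
 * necessarily set and must stay set, so only the 'set' bookkeeping at
 * the end of this function changes in that case.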
*/ 3390 if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) { 3391 pptr = &vmx->cap[vcpu].exc_bitmap; 3392 baseval = *pptr; 3393 flag = (1 << IDT_BP); 3394 reg = VMCS_EXCEPTION_BITMAP; 3395 } 3396 break; 3397 default: 3398 break; 3399 } 3400 3401 if (retval) 3402 return (retval); 3403 3404 if (pptr != NULL) { 3405 if (val) { 3406 baseval |= flag; 3407 } else { 3408 baseval &= ~flag; 3409 } 3410 VMPTRLD(vmcs); 3411 error = vmwrite(reg, baseval); 3412 VMCLEAR(vmcs); 3413 3414 if (error) 3415 return (error); 3416 3417 /* 3418 * Update optional stored flags, and record 3419 * setting 3420 */ 3421 *pptr = baseval; 3422 } 3423 3424 if (val) { 3425 vmx->cap[vcpu].set |= (1 << type); 3426 } else { 3427 vmx->cap[vcpu].set &= ~(1 << type); 3428 } 3429 3430 return (0); 3431 } 3432 3433 struct vlapic_vtx { 3434 struct vlapic vlapic; 3435 struct pir_desc *pir_desc; 3436 struct vmx *vmx; 3437 u_int pending_prio; 3438 }; 3439 3440 #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) 3441 3442 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ 3443 do { \ 3444 VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ 3445 level ? "level" : "edge", vector); \ 3446 VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ 3447 VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ 3448 VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ 3449 VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ 3450 VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ 3451 } while (0) 3452 3453 /* 3454 * vlapic->ops handlers that utilize the APICv hardware assist described in 3455 * Chapter 29 of the Intel SDM. 3456 */ 3457 static int 3458 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) 3459 { 3460 struct vlapic_vtx *vlapic_vtx; 3461 struct pir_desc *pir_desc; 3462 uint64_t mask; 3463 int idx, notify = 0; 3464 3465 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3466 pir_desc = vlapic_vtx->pir_desc; 3467 3468 /* 3469 * Keep track of interrupt requests in the PIR descriptor. This is 3470 * because the virtual APIC page pointed to by the VMCS cannot be 3471 * modified if the vcpu is running. 3472 */ 3473 idx = vector / 64; 3474 mask = 1UL << (vector % 64); 3475 atomic_set_long(&pir_desc->pir[idx], mask); 3476 3477 /* 3478 * A notification is required whenever the 'pending' bit makes a 3479 * transition from 0->1. 3480 * 3481 * Even if the 'pending' bit is already asserted, notification about 3482 * the incoming interrupt may still be necessary. For example, if a 3483 * vCPU is HLTed with a high PPR, a low priority interrupt would cause 3484 * the 0->1 'pending' transition with a notification, but the vCPU 3485 * would ignore the interrupt for the time being. The same vCPU would 3486 * need to then be notified if a high-priority interrupt arrived which 3487 * satisfied the PPR. 3488 * 3489 * The priorities of interrupts injected while 'pending' is asserted 3490 * are tracked in a custom bitfield 'pending_prio'. Should the 3491 * to-be-injected interrupt exceed the priorities already present, the 3492 * notification is sent. The priorities recorded in 'pending_prio' are 3493 * cleared whenever the 'pending' bit makes another 0->1 transition. 
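 * Worked example: a vCPU is halted with PPR 0x80.  Vector 0x31
 * arrives, makes the 0->1 'pending' transition, resets 'pending_prio'
 * and notifies, but the guest keeps waiting since 0x31 is below its
 * PPR.  Vector 0x91 then arrives while 'pending' is still set; its
 * priority-class bit (1 << 9) is not yet in 'pending_prio', so a
 * second notification is sent and the vCPU can now be woken.  Another
 * vector in the same class (say 0x95) would not notify again.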
3494 */ 3495 if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { 3496 notify = 1; 3497 vlapic_vtx->pending_prio = 0; 3498 } else { 3499 const u_int old_prio = vlapic_vtx->pending_prio; 3500 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); 3501 3502 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { 3503 atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); 3504 notify = 1; 3505 } 3506 } 3507 3508 VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, 3509 level, "vmx_set_intr_ready"); 3510 return (notify); 3511 } 3512 3513 static int 3514 vmx_pending_intr(struct vlapic *vlapic, int *vecptr) 3515 { 3516 struct vlapic_vtx *vlapic_vtx; 3517 struct pir_desc *pir_desc; 3518 struct LAPIC *lapic; 3519 uint64_t pending, pirval; 3520 uint32_t ppr, vpr; 3521 int i; 3522 3523 /* 3524 * This function is only expected to be called from the 'HLT' exit 3525 * handler which does not care about the vector that is pending. 3526 */ 3527 KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); 3528 3529 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3530 pir_desc = vlapic_vtx->pir_desc; 3531 3532 pending = atomic_load_acq_long(&pir_desc->pending); 3533 if (!pending) { 3534 /* 3535 * While a virtual interrupt may have already been 3536 * processed the actual delivery maybe pending the 3537 * interruptibility of the guest. Recognize a pending 3538 * interrupt by reevaluating virtual interrupts 3539 * following Section 29.2.1 in the Intel SDM Volume 3. 3540 */ 3541 struct vm_exit *vmexit; 3542 uint8_t rvi, ppr; 3543 3544 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); 3545 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, 3546 ("vmx_pending_intr: exitcode not 'HLT'")); 3547 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; 3548 lapic = vlapic->apic_page; 3549 ppr = lapic->ppr & APIC_TPR_INT; 3550 if (rvi > ppr) { 3551 return (1); 3552 } 3553 3554 return (0); 3555 } 3556 3557 /* 3558 * If there is an interrupt pending then it will be recognized only 3559 * if its priority is greater than the processor priority. 3560 * 3561 * Special case: if the processor priority is zero then any pending 3562 * interrupt will be recognized. 3563 */ 3564 lapic = vlapic->apic_page; 3565 ppr = lapic->ppr & APIC_TPR_INT; 3566 if (ppr == 0) 3567 return (1); 3568 3569 VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", 3570 lapic->ppr); 3571 3572 vpr = 0; 3573 for (i = 3; i >= 0; i--) { 3574 pirval = pir_desc->pir[i]; 3575 if (pirval != 0) { 3576 vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; 3577 break; 3578 } 3579 } 3580 3581 /* 3582 * If the highest-priority pending interrupt falls short of the 3583 * processor priority of this vCPU, ensure that 'pending_prio' does not 3584 * have any stale bits which would preclude a higher-priority interrupt 3585 * from incurring a notification later. 
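 * Roughly: 'pending_prio' may still contain classes higher than
 * anything actually left in the PIR (those interrupts have since been
 * consumed), and such stale bits would suppress the notification for a
 * new interrupt arriving in one of those classes.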
3586 */ 3587 if (vpr <= ppr) { 3588 const u_int prio_bit = VPR_PRIO_BIT(vpr); 3589 const u_int old = vlapic_vtx->pending_prio; 3590 3591 if (old > prio_bit && (old & prio_bit) == 0) { 3592 vlapic_vtx->pending_prio = prio_bit; 3593 } 3594 return (0); 3595 } 3596 return (1); 3597 } 3598 3599 static void 3600 vmx_intr_accepted(struct vlapic *vlapic, int vector) 3601 { 3602 3603 panic("vmx_intr_accepted: not expected to be called"); 3604 } 3605 3606 static void 3607 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) 3608 { 3609 struct vlapic_vtx *vlapic_vtx; 3610 struct vmx *vmx; 3611 struct vmcs *vmcs; 3612 uint64_t mask, val; 3613 3614 KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); 3615 KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), 3616 ("vmx_set_tmr: vcpu cannot be running")); 3617 3618 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3619 vmx = vlapic_vtx->vmx; 3620 vmcs = &vmx->vmcs[vlapic->vcpuid]; 3621 mask = 1UL << (vector % 64); 3622 3623 VMPTRLD(vmcs); 3624 val = vmcs_read(VMCS_EOI_EXIT(vector)); 3625 if (level) 3626 val |= mask; 3627 else 3628 val &= ~mask; 3629 vmcs_write(VMCS_EOI_EXIT(vector), val); 3630 VMCLEAR(vmcs); 3631 } 3632 3633 static void 3634 vmx_enable_x2apic_mode(struct vlapic *vlapic) 3635 { 3636 struct vmx *vmx; 3637 struct vmcs *vmcs; 3638 uint32_t proc_ctls2; 3639 int vcpuid, error; 3640 3641 vcpuid = vlapic->vcpuid; 3642 vmx = ((struct vlapic_vtx *)vlapic)->vmx; 3643 vmcs = &vmx->vmcs[vcpuid]; 3644 3645 proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; 3646 KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, 3647 ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); 3648 3649 proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; 3650 proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; 3651 vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; 3652 3653 VMPTRLD(vmcs); 3654 vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); 3655 VMCLEAR(vmcs); 3656 3657 if (vlapic->vcpuid == 0) { 3658 /* 3659 * The nested page table mappings are shared by all vcpus 3660 * so unmap the APIC access page just once. 3661 */ 3662 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); 3663 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", 3664 __func__, error)); 3665 3666 /* 3667 * The MSR bitmap is shared by all vcpus so modify it only 3668 * once in the context of vcpu 0. 3669 */ 3670 error = vmx_allow_x2apic_msrs(vmx); 3671 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", 3672 __func__, error)); 3673 } 3674 } 3675 3676 static void 3677 vmx_post_intr(struct vlapic *vlapic, int hostcpu) 3678 { 3679 3680 ipi_cpu(hostcpu, pirvec); 3681 } 3682 3683 /* 3684 * Transfer the pending interrupts in the PIR descriptor to the IRR 3685 * in the virtual APIC page. 
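 * The processor only performs this transfer itself when the
 * posted-interrupt notification vector is received while the guest is
 * running, so interrupts posted while the vCPU was in the host are
 * expected to be folded into the vIRR in software, on the VM-entry
 * path, by this routine.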
3686 */ 3687 static void 3688 vmx_inject_pir(struct vlapic *vlapic) 3689 { 3690 struct vlapic_vtx *vlapic_vtx; 3691 struct pir_desc *pir_desc; 3692 struct LAPIC *lapic; 3693 uint64_t val, pirval; 3694 int rvi, pirbase = -1; 3695 uint16_t intr_status_old, intr_status_new; 3696 3697 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3698 pir_desc = vlapic_vtx->pir_desc; 3699 if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { 3700 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3701 "no posted interrupt pending"); 3702 return; 3703 } 3704 3705 pirval = 0; 3706 pirbase = -1; 3707 lapic = vlapic->apic_page; 3708 3709 val = atomic_readandclear_long(&pir_desc->pir[0]); 3710 if (val != 0) { 3711 lapic->irr0 |= val; 3712 lapic->irr1 |= val >> 32; 3713 pirbase = 0; 3714 pirval = val; 3715 } 3716 3717 val = atomic_readandclear_long(&pir_desc->pir[1]); 3718 if (val != 0) { 3719 lapic->irr2 |= val; 3720 lapic->irr3 |= val >> 32; 3721 pirbase = 64; 3722 pirval = val; 3723 } 3724 3725 val = atomic_readandclear_long(&pir_desc->pir[2]); 3726 if (val != 0) { 3727 lapic->irr4 |= val; 3728 lapic->irr5 |= val >> 32; 3729 pirbase = 128; 3730 pirval = val; 3731 } 3732 3733 val = atomic_readandclear_long(&pir_desc->pir[3]); 3734 if (val != 0) { 3735 lapic->irr6 |= val; 3736 lapic->irr7 |= val >> 32; 3737 pirbase = 192; 3738 pirval = val; 3739 } 3740 3741 VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); 3742 3743 /* 3744 * Update RVI so the processor can evaluate pending virtual 3745 * interrupts on VM-entry. 3746 * 3747 * It is possible for pirval to be 0 here, even though the 3748 * pending bit has been set. The scenario is: 3749 * CPU-Y is sending a posted interrupt to CPU-X, which 3750 * is running a guest and processing posted interrupts in h/w. 3751 * CPU-X will eventually exit and the state seen in s/w is 3752 * the pending bit set, but no PIR bits set. 
3753 * 3754 * CPU-X CPU-Y 3755 * (vm running) (host running) 3756 * rx posted interrupt 3757 * CLEAR pending bit 3758 * SET PIR bit 3759 * READ/CLEAR PIR bits 3760 * SET pending bit 3761 * (vm exit) 3762 * pending bit set, PIR 0 3763 */ 3764 if (pirval != 0) { 3765 rvi = pirbase + flsl(pirval) - 1; 3766 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); 3767 intr_status_new = (intr_status_old & 0xFF00) | rvi; 3768 if (intr_status_new > intr_status_old) { 3769 vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); 3770 VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " 3771 "guest_intr_status changed from 0x%04x to 0x%04x", 3772 intr_status_old, intr_status_new); 3773 } 3774 } 3775 } 3776 3777 static struct vlapic * 3778 vmx_vlapic_init(void *arg, int vcpuid) 3779 { 3780 struct vmx *vmx; 3781 struct vlapic *vlapic; 3782 struct vlapic_vtx *vlapic_vtx; 3783 3784 vmx = arg; 3785 3786 vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); 3787 vlapic->vm = vmx->vm; 3788 vlapic->vcpuid = vcpuid; 3789 vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; 3790 3791 vlapic_vtx = (struct vlapic_vtx *)vlapic; 3792 vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; 3793 vlapic_vtx->vmx = vmx; 3794 3795 if (virtual_interrupt_delivery) { 3796 vlapic->ops.set_intr_ready = vmx_set_intr_ready; 3797 vlapic->ops.pending_intr = vmx_pending_intr; 3798 vlapic->ops.intr_accepted = vmx_intr_accepted; 3799 vlapic->ops.set_tmr = vmx_set_tmr; 3800 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; 3801 } 3802 3803 if (posted_interrupts) 3804 vlapic->ops.post_intr = vmx_post_intr; 3805 3806 vlapic_init(vlapic); 3807 3808 return (vlapic); 3809 } 3810 3811 static void 3812 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) 3813 { 3814 3815 vlapic_cleanup(vlapic); 3816 free(vlapic, M_VLAPIC); 3817 } 3818 3819 struct vmm_ops vmm_ops_intel = { 3820 .init = vmx_init, 3821 .cleanup = vmx_cleanup, 3822 .resume = vmx_restore, 3823 .vminit = vmx_vminit, 3824 .vmrun = vmx_run, 3825 .vmcleanup = vmx_vmcleanup, 3826 .vmgetreg = vmx_getreg, 3827 .vmsetreg = vmx_setreg, 3828 .vmgetdesc = vmx_getdesc, 3829 .vmsetdesc = vmx_setdesc, 3830 .vmgetcap = vmx_getcap, 3831 .vmsetcap = vmx_setcap, 3832 .vmspace_alloc = ept_vmspace_alloc, 3833 .vmspace_free = ept_vmspace_free, 3834 .vlapic_init = vmx_vlapic_init, 3835 .vlapic_cleanup = vmx_vlapic_cleanup, 3836 }; 3837
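/*
 * Editorial sketch: the table above is the Intel backend consumed by the
 * generic vmm(4) layer, which is expected to select it on VMX-capable CPUs
 * and call through its function pointers.  The helper below is purely
 * illustrative (the name 'example_vmrun' and the direct call are not part
 * of the real dispatch, which lives in vmm.c); it only shows that, e.g.,
 * vmx_run() is reached indirectly via 'vmm_ops_intel'.
 */
#if 0	/* illustrative only, not compiled */
static int
example_vmrun(struct vmm_ops *ops, void *cookie, int vcpuid, register_t rip,
    pmap_t pmap, struct vm_eventinfo *evinfo)
{

	/* On Intel hardware 'ops' would point at vmm_ops_intel. */
	return ((*ops->vmrun)(cookie, vcpuid, rip, pmap, evinfo));
}
#endif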