1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * This file and its contents are supplied under the terms of the 31 * Common Development and Distribution License ("CDDL"), version 1.0. 32 * You may only use this file in accordance with the terms of version 33 * 1.0 of the CDDL. 34 * 35 * A full copy of the text of the CDDL should have accompanied this 36 * source. A copy of the CDDL is also available via the Internet at 37 * http://www.illumos.org/license/CDDL. 38 * 39 * Copyright 2018 Joyent, Inc. 40 * Copyright 2022 Oxide Computer Company 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/kernel.h> 49 #include <sys/kmem.h> 50 #include <sys/pcpu.h> 51 #include <sys/proc.h> 52 #include <sys/sysctl.h> 53 54 #include <sys/x86_archext.h> 55 #include <sys/trap.h> 56 57 #include <machine/cpufunc.h> 58 #include <machine/psl.h> 59 #include <machine/md_var.h> 60 #include <machine/reg.h> 61 #include <machine/specialreg.h> 62 #include <machine/vmm.h> 63 #include <machine/vmm_dev.h> 64 #include <sys/vmm_instruction_emul.h> 65 #include <sys/vmm_vm.h> 66 #include <sys/vmm_kernel.h> 67 68 #include "vmm_lapic.h" 69 #include "vmm_stat.h" 70 #include "vmm_ioport.h" 71 #include "vatpic.h" 72 #include "vlapic.h" 73 #include "vlapic_priv.h" 74 75 #include "vmcb.h" 76 #include "svm.h" 77 #include "svm_softc.h" 78 #include "svm_msr.h" 79 80 SYSCTL_DECL(_hw_vmm); 81 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 82 NULL); 83 84 /* 85 * SVM CPUID function 0x8000_000A, edx bit decoding. 86 */ 87 #define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ 88 #define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ 89 #define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ 90 #define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ 91 #define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ 92 #define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ 93 #define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ 94 #define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ 95 #define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ 96 #define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ 97 #define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ 98 99 #define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ 100 VMCB_CACHE_IOPM | \ 101 VMCB_CACHE_I | \ 102 VMCB_CACHE_TPR | \ 103 VMCB_CACHE_CR2 | \ 104 VMCB_CACHE_CR | \ 105 VMCB_CACHE_DR | \ 106 VMCB_CACHE_DT | \ 107 VMCB_CACHE_SEG | \ 108 VMCB_CACHE_NP) 109 110 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; 111 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, 112 0, NULL); 113 114 /* SVM features advertised by CPUID.8000000AH:EDX */ 115 static uint32_t svm_feature = ~0U; /* AMD SVM features. */ 116 117 static int disable_npf_assist; 118 119 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); 120 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); 121 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); 122 123 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); 124 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val); 125 static void flush_asid(struct svm_softc *sc, int vcpuid); 126 127 static __inline bool 128 flush_by_asid(void) 129 { 130 return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0); 131 } 132 133 static __inline bool 134 decode_assist(void) 135 { 136 return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0); 137 } 138 139 static int 140 svm_cleanup(void) 141 { 142 /* This is taken care of by the hma registration */ 143 return (0); 144 } 145 146 static int 147 svm_init(void) 148 { 149 vmcb_clean &= VMCB_CACHE_DEFAULT; 150 151 svm_msr_init(); 152 153 return (0); 154 } 155 156 static void 157 svm_restore(void) 158 { 159 /* No-op on illumos */ 160 } 161 162 /* Pentium compatible MSRs */ 163 #define MSR_PENTIUM_START 0 164 #define MSR_PENTIUM_END 0x1FFF 165 /* AMD 6th generation and Intel compatible MSRs */ 166 #define MSR_AMD6TH_START 0xC0000000UL 167 #define MSR_AMD6TH_END 0xC0001FFFUL 168 /* AMD 7th and 8th generation compatible MSRs */ 169 #define MSR_AMD7TH_START 0xC0010000UL 170 #define MSR_AMD7TH_END 0xC0011FFFUL 171 172 /* 173 * Get the index and bit position for a MSR in permission bitmap. 174 * Two bits are used for each MSR: lower bit for read and higher bit for write. 175 */ 176 static int 177 svm_msr_index(uint64_t msr, int *index, int *bit) 178 { 179 uint32_t base, off; 180 181 *index = -1; 182 *bit = (msr % 4) * 2; 183 base = 0; 184 185 if (msr <= MSR_PENTIUM_END) { 186 *index = msr / 4; 187 return (0); 188 } 189 190 base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); 191 if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { 192 off = (msr - MSR_AMD6TH_START); 193 *index = (off + base) / 4; 194 return (0); 195 } 196 197 base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); 198 if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { 199 off = (msr - MSR_AMD7TH_START); 200 *index = (off + base) / 4; 201 return (0); 202 } 203 204 return (EINVAL); 205 } 206 207 /* 208 * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. 209 */ 210 static void 211 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) 212 { 213 int index, bit, error; 214 215 error = svm_msr_index(msr, &index, &bit); 216 KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr)); 217 KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, 218 ("%s: invalid index %d for msr %lx", __func__, index, msr)); 219 KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " 220 "msr %lx", __func__, bit, msr)); 221 222 if (read) 223 perm_bitmap[index] &= ~(1UL << bit); 224 225 if (write) 226 perm_bitmap[index] &= ~(2UL << bit); 227 } 228 229 static void 230 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) 231 { 232 233 svm_msr_perm(perm_bitmap, msr, true, true); 234 } 235 236 static void 237 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) 238 { 239 240 svm_msr_perm(perm_bitmap, msr, true, false); 241 } 242 243 static __inline int 244 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) 245 { 246 struct vmcb_ctrl *ctrl; 247 248 KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx)); 249 250 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 251 return (ctrl->intercept[idx] & bitmask ? 1 : 0); 252 } 253 254 static __inline void 255 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, 256 int enabled) 257 { 258 struct vmcb_ctrl *ctrl; 259 uint32_t oldval; 260 261 KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx)); 262 263 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 264 oldval = ctrl->intercept[idx]; 265 266 if (enabled) 267 ctrl->intercept[idx] |= bitmask; 268 else 269 ctrl->intercept[idx] &= ~bitmask; 270 271 if (ctrl->intercept[idx] != oldval) { 272 svm_set_dirty(sc, vcpu, VMCB_CACHE_I); 273 } 274 } 275 276 static __inline void 277 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) 278 { 279 280 svm_set_intercept(sc, vcpu, off, bitmask, 0); 281 } 282 283 static __inline void 284 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) 285 { 286 287 svm_set_intercept(sc, vcpu, off, bitmask, 1); 288 } 289 290 static void 291 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, 292 uint64_t msrpm_base_pa, uint64_t np_pml4) 293 { 294 struct vmcb_ctrl *ctrl; 295 struct vmcb_state *state; 296 uint32_t mask; 297 int n; 298 299 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 300 state = svm_get_vmcb_state(sc, vcpu); 301 302 ctrl->iopm_base_pa = iopm_base_pa; 303 ctrl->msrpm_base_pa = msrpm_base_pa; 304 305 /* Enable nested paging */ 306 ctrl->np_ctrl = NP_ENABLE; 307 ctrl->n_cr3 = np_pml4; 308 309 /* 310 * Intercept accesses to the control registers that are not shadowed 311 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. 312 */ 313 for (n = 0; n < 16; n++) { 314 mask = (BIT(n) << 16) | BIT(n); 315 if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) 316 svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); 317 else 318 svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); 319 } 320 321 /* 322 * Selectively intercept writes to %cr0. This triggers on operations 323 * which would change bits other than TS or MP. 324 */ 325 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 326 VMCB_INTCPT_CR0_WRITE); 327 328 /* 329 * Intercept everything when tracing guest exceptions otherwise 330 * just intercept machine check exception. 331 */ 332 if (vcpu_trace_exceptions(sc->vm, vcpu)) { 333 for (n = 0; n < 32; n++) { 334 /* 335 * Skip unimplemented vectors in the exception bitmap. 336 */ 337 if (n == 2 || n == 9) { 338 continue; 339 } 340 svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); 341 } 342 } else { 343 svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); 344 } 345 346 /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ 347 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); 348 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); 349 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); 350 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); 351 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); 352 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); 353 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); 354 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); 355 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 356 VMCB_INTCPT_FERR_FREEZE); 357 358 /* Enable exit-on-hlt by default */ 359 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT); 360 361 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); 362 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); 363 364 /* Intercept privileged invalidation instructions. */ 365 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD); 366 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA); 367 368 /* 369 * Intercept all virtualization-related instructions. 370 * 371 * From section "Canonicalization and Consistency Checks" in APMv2 372 * the VMRUN intercept bit must be set to pass the consistency check. 373 */ 374 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); 375 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL); 376 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD); 377 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE); 378 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI); 379 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI); 380 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT); 381 382 /* 383 * The ASID will be set to a non-zero value just before VMRUN. 384 */ 385 ctrl->asid = 0; 386 387 /* 388 * Section 15.21.1, Interrupt Masking in EFLAGS 389 * Section 15.21.2, Virtualizing APIC.TPR 390 * 391 * This must be set for %rflag and %cr8 isolation of guest and host. 392 */ 393 ctrl->v_intr_ctrl |= V_INTR_MASKING; 394 395 /* Enable Last Branch Record aka LBR for debugging */ 396 ctrl->misc_ctrl |= LBR_VIRT_ENABLE; 397 state->dbgctl = BIT(0); 398 399 /* EFER_SVM must always be set when the guest is executing */ 400 state->efer = EFER_SVM; 401 402 /* Set up the PAT to power-on state */ 403 state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | 404 PAT_VALUE(1, PAT_WRITE_THROUGH) | 405 PAT_VALUE(2, PAT_UNCACHED) | 406 PAT_VALUE(3, PAT_UNCACHEABLE) | 407 PAT_VALUE(4, PAT_WRITE_BACK) | 408 PAT_VALUE(5, PAT_WRITE_THROUGH) | 409 PAT_VALUE(6, PAT_UNCACHED) | 410 PAT_VALUE(7, PAT_UNCACHEABLE); 411 412 /* Set up DR6/7 to power-on state */ 413 state->dr6 = DBREG_DR6_RESERVED1; 414 state->dr7 = DBREG_DR7_RESERVED1; 415 } 416 417 /* 418 * Initialize a virtual machine. 419 */ 420 static void * 421 svm_vminit(struct vm *vm) 422 { 423 struct svm_softc *svm_sc; 424 struct svm_vcpu *vcpu; 425 vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; 426 int i; 427 uint16_t maxcpus; 428 429 svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP); 430 VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0); 431 432 svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE); 433 if (svm_sc->msr_bitmap == NULL) 434 panic("contigmalloc of SVM MSR bitmap failed"); 435 svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE); 436 if (svm_sc->iopm_bitmap == NULL) 437 panic("contigmalloc of SVM IO bitmap failed"); 438 439 svm_sc->vm = vm; 440 svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm)); 441 442 /* 443 * Intercept read and write accesses to all MSRs. 444 */ 445 memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); 446 447 /* 448 * Access to the following MSRs is redirected to the VMCB when the 449 * guest is executing. Therefore it is safe to allow the guest to 450 * read/write these MSRs directly without hypervisor involvement. 451 */ 452 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); 453 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); 454 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); 455 456 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); 457 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); 458 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); 459 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); 460 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); 461 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); 462 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); 463 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); 464 465 svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); 466 467 /* 468 * Intercept writes to make sure that the EFER_SVM bit is not cleared. 469 */ 470 svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); 471 472 /* Intercept access to all I/O ports. */ 473 memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); 474 475 iopm_pa = vtophys(svm_sc->iopm_bitmap); 476 msrpm_pa = vtophys(svm_sc->msr_bitmap); 477 pml4_pa = svm_sc->nptp; 478 maxcpus = vm_get_maxcpus(svm_sc->vm); 479 for (i = 0; i < maxcpus; i++) { 480 vcpu = svm_get_vcpu(svm_sc, i); 481 vcpu->nextrip = ~0; 482 vcpu->lastcpu = NOCPU; 483 vcpu->vmcb_pa = vtophys(&vcpu->vmcb); 484 vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); 485 svm_msr_guest_init(svm_sc, i); 486 } 487 return (svm_sc); 488 } 489 490 /* 491 * Collateral for a generic SVM VM-exit. 492 */ 493 static void 494 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) 495 { 496 497 vme->exitcode = VM_EXITCODE_SVM; 498 vme->u.svm.exitcode = code; 499 vme->u.svm.exitinfo1 = info1; 500 vme->u.svm.exitinfo2 = info2; 501 } 502 503 static int 504 svm_cpl(struct vmcb_state *state) 505 { 506 507 /* 508 * From APMv2: 509 * "Retrieve the CPL from the CPL field in the VMCB, not 510 * from any segment DPL" 511 */ 512 return (state->cpl); 513 } 514 515 static enum vm_cpu_mode 516 svm_vcpu_mode(struct vmcb *vmcb) 517 { 518 struct vmcb_state *state; 519 520 state = &vmcb->state; 521 522 if (state->efer & EFER_LMA) { 523 struct vmcb_segment *seg; 524 525 /* 526 * Section 4.8.1 for APM2, check if Code Segment has 527 * Long attribute set in descriptor. 528 */ 529 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); 530 if (seg->attrib & VMCB_CS_ATTRIB_L) 531 return (CPU_MODE_64BIT); 532 else 533 return (CPU_MODE_COMPATIBILITY); 534 } else if (state->cr0 & CR0_PE) { 535 return (CPU_MODE_PROTECTED); 536 } else { 537 return (CPU_MODE_REAL); 538 } 539 } 540 541 static enum vm_paging_mode 542 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) 543 { 544 545 if ((cr0 & CR0_PG) == 0) 546 return (PAGING_MODE_FLAT); 547 if ((cr4 & CR4_PAE) == 0) 548 return (PAGING_MODE_32); 549 if (efer & EFER_LME) 550 return (PAGING_MODE_64); 551 else 552 return (PAGING_MODE_PAE); 553 } 554 555 /* 556 * ins/outs utility routines 557 */ 558 559 static void 560 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) 561 { 562 struct vmcb_state *state; 563 564 state = &vmcb->state; 565 paging->cr3 = state->cr3; 566 paging->cpl = svm_cpl(state); 567 paging->cpu_mode = svm_vcpu_mode(vmcb); 568 paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, 569 state->efer); 570 } 571 572 #define UNHANDLED 0 573 574 /* 575 * Handle guest I/O intercept. 576 */ 577 static int 578 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) 579 { 580 struct vmcb_ctrl *ctrl; 581 struct vmcb_state *state; 582 struct vm_inout *inout; 583 struct vie *vie; 584 uint64_t info1; 585 struct vm_guest_paging paging; 586 587 state = svm_get_vmcb_state(svm_sc, vcpu); 588 ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); 589 inout = &vmexit->u.inout; 590 info1 = ctrl->exitinfo1; 591 592 inout->bytes = (info1 >> 4) & 0x7; 593 inout->flags = 0; 594 inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0; 595 inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0; 596 inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0; 597 inout->port = (uint16_t)(info1 >> 16); 598 inout->eax = (uint32_t)(state->rax); 599 600 if ((inout->flags & INOUT_STR) != 0) { 601 /* 602 * The effective segment number in EXITINFO1[12:10] is populated 603 * only if the processor has the DecodeAssist capability. 604 * 605 * This is not specified explicitly in APMv2 but can be verified 606 * empirically. 607 */ 608 if (!decode_assist()) { 609 /* 610 * Without decoding assistance, force the task of 611 * emulating the ins/outs on userspace. 612 */ 613 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 614 bzero(&vmexit->u.inst_emul, 615 sizeof (vmexit->u.inst_emul)); 616 return (UNHANDLED); 617 } 618 619 /* 620 * Bits 7-9 encode the address size of ins/outs operations where 621 * the 1/2/4 values correspond to 16/32/64 bit sizes. 622 */ 623 inout->addrsize = 2 * ((info1 >> 7) & 0x7); 624 VERIFY(inout->addrsize == 2 || inout->addrsize == 4 || 625 inout->addrsize == 8); 626 627 if (inout->flags & INOUT_IN) { 628 /* 629 * For INS instructions, %es (encoded as 0) is the 630 * implied segment for the operation. 631 */ 632 inout->segment = 0; 633 } else { 634 /* 635 * Bits 10-12 encode the segment for OUTS. 636 * This value follows the standard x86 segment order. 637 */ 638 inout->segment = (info1 >> 10) & 0x7; 639 } 640 } 641 642 vmexit->exitcode = VM_EXITCODE_INOUT; 643 svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging); 644 vie = vm_vie_ctx(svm_sc->vm, vcpu); 645 vie_init_inout(vie, inout, vmexit->inst_length, &paging); 646 647 /* The in/out emulation will handle advancing %rip */ 648 vmexit->inst_length = 0; 649 650 return (UNHANDLED); 651 } 652 653 static int 654 npf_fault_type(uint64_t exitinfo1) 655 { 656 657 if (exitinfo1 & VMCB_NPF_INFO1_W) 658 return (PROT_WRITE); 659 else if (exitinfo1 & VMCB_NPF_INFO1_ID) 660 return (PROT_EXEC); 661 else 662 return (PROT_READ); 663 } 664 665 static bool 666 svm_npf_emul_fault(uint64_t exitinfo1) 667 { 668 if (exitinfo1 & VMCB_NPF_INFO1_ID) { 669 return (false); 670 } 671 672 if (exitinfo1 & VMCB_NPF_INFO1_GPT) { 673 return (false); 674 } 675 676 if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { 677 return (false); 678 } 679 680 return (true); 681 } 682 683 static void 684 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, 685 uint64_t gpa) 686 { 687 struct vmcb_ctrl *ctrl; 688 struct vmcb *vmcb; 689 struct vie *vie; 690 struct vm_guest_paging paging; 691 struct vmcb_segment *seg; 692 char *inst_bytes = NULL; 693 uint8_t inst_len = 0; 694 695 vmcb = svm_get_vmcb(svm_sc, vcpu); 696 ctrl = &vmcb->ctrl; 697 698 vmexit->exitcode = VM_EXITCODE_MMIO_EMUL; 699 vmexit->u.mmio_emul.gpa = gpa; 700 vmexit->u.mmio_emul.gla = VIE_INVALID_GLA; 701 svm_paging_info(vmcb, &paging); 702 703 switch (paging.cpu_mode) { 704 case CPU_MODE_REAL: 705 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); 706 vmexit->u.mmio_emul.cs_base = seg->base; 707 vmexit->u.mmio_emul.cs_d = 0; 708 break; 709 case CPU_MODE_PROTECTED: 710 case CPU_MODE_COMPATIBILITY: 711 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS); 712 vmexit->u.mmio_emul.cs_base = seg->base; 713 714 /* 715 * Section 4.8.1 of APM2, Default Operand Size or D bit. 716 */ 717 vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ? 718 1 : 0; 719 break; 720 default: 721 vmexit->u.mmio_emul.cs_base = 0; 722 vmexit->u.mmio_emul.cs_d = 0; 723 break; 724 } 725 726 /* 727 * Copy the instruction bytes into 'vie' if available. 728 */ 729 if (decode_assist() && !disable_npf_assist) { 730 inst_len = ctrl->inst_len; 731 inst_bytes = (char *)ctrl->inst_bytes; 732 } 733 vie = vm_vie_ctx(svm_sc->vm, vcpu); 734 vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa); 735 } 736 737 /* 738 * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0 739 * which is live in the guest. They are visible via the shadow instead. 740 */ 741 #define SVM_CR0_MASK ~(CR0_CD | CR0_NW | 0xffffffff00000000) 742 743 static void 744 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write) 745 { 746 struct vmcb_state *state; 747 struct svm_regctx *regctx; 748 uint64_t masked, old, diff; 749 750 state = svm_get_vmcb_state(svm_sc, vcpu); 751 regctx = svm_get_guest_regctx(svm_sc, vcpu); 752 753 old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK); 754 diff = old ^ val; 755 756 /* No further work needed if register contents remain the same */ 757 if (diff == 0) { 758 return; 759 } 760 761 /* Flush the TLB if the paging or write-protect bits are changing */ 762 if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) { 763 flush_asid(svm_sc, vcpu); 764 } 765 766 /* 767 * If the change in %cr0 is due to a guest action (via interception) 768 * then other CPU state updates may be required. 769 */ 770 if (guest_write) { 771 if ((diff & CR0_PG) != 0) { 772 uint64_t efer = state->efer; 773 774 /* Keep the long-mode state in EFER in sync */ 775 if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) { 776 state->efer |= EFER_LMA; 777 } 778 if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) { 779 state->efer &= ~EFER_LMA; 780 } 781 } 782 } 783 784 masked = val & SVM_CR0_MASK; 785 regctx->sctx_cr0_shadow = val; 786 state->cr0 = masked; 787 svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR); 788 789 if ((masked ^ val) != 0) { 790 /* 791 * The guest has set bits in %cr0 which we are masking out and 792 * exposing via shadow. 793 * 794 * We must intercept %cr0 reads in order to make the shadowed 795 * view available to the guest. 796 * 797 * Writes to %cr0 must also be intercepted (unconditionally, 798 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch 799 * if/when the guest clears those shadowed bits. 800 */ 801 svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT, 802 BIT(0) | BIT(16)); 803 } else { 804 /* 805 * When no bits remain in %cr0 which require shadowing, the 806 * unconditional intercept of reads/writes to %cr0 can be 807 * disabled. 808 * 809 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains 810 * in place so we can be notified of operations which change 811 * bits other than TS or MP. 812 */ 813 svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT, 814 BIT(0) | BIT(16)); 815 } 816 svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I); 817 } 818 819 static void 820 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val) 821 { 822 struct vmcb *vmcb; 823 struct svm_regctx *regctx; 824 825 vmcb = svm_get_vmcb(svm_sc, vcpu); 826 regctx = svm_get_guest_regctx(svm_sc, vcpu); 827 828 /* 829 * Include the %cr0 bits which exist only in the shadow along with those 830 * in the running vCPU state. 831 */ 832 *val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK); 833 } 834 835 static void 836 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg) 837 { 838 uint64_t val; 839 int err __maybe_unused; 840 841 svm_get_cr0(svm_sc, vcpu, &val); 842 err = svm_setreg(svm_sc, vcpu, reg, val); 843 ASSERT(err == 0); 844 } 845 846 static void 847 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg) 848 { 849 struct vmcb_state *state; 850 uint64_t val; 851 int err __maybe_unused; 852 853 state = svm_get_vmcb_state(svm_sc, vcpu); 854 855 err = svm_getreg(svm_sc, vcpu, reg, &val); 856 ASSERT(err == 0); 857 858 if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) { 859 /* NW without CD is nonsensical */ 860 vm_inject_gp(svm_sc->vm, vcpu); 861 return; 862 } 863 if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) { 864 /* PG requires PE */ 865 vm_inject_gp(svm_sc->vm, vcpu); 866 return; 867 } 868 if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) { 869 /* When enabling paging, PAE must be enabled if LME is. */ 870 if ((state->efer & EFER_LME) != 0 && 871 (state->cr4 & CR4_PAE) == 0) { 872 vm_inject_gp(svm_sc->vm, vcpu); 873 return; 874 } 875 } 876 877 svm_set_cr0(svm_sc, vcpu, val, true); 878 } 879 880 static void 881 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) 882 { 883 struct vie *vie; 884 struct vm_guest_paging paging; 885 886 /* Let the instruction emulation (hopefully in-kernel) handle it */ 887 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 888 bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul)); 889 vie = vm_vie_ctx(svm_sc->vm, vcpu); 890 svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging); 891 vie_init_other(vie, &paging); 892 893 /* The instruction emulation will handle advancing %rip */ 894 vmexit->inst_length = 0; 895 } 896 897 static void 898 svm_update_virqinfo(struct svm_softc *sc, int vcpu) 899 { 900 struct vm *vm; 901 struct vlapic *vlapic; 902 struct vmcb_ctrl *ctrl; 903 904 vm = sc->vm; 905 vlapic = vm_lapic(vm, vcpu); 906 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 907 908 /* Update %cr8 in the emulated vlapic */ 909 vlapic_set_cr8(vlapic, ctrl->v_tpr); 910 911 /* Virtual interrupt injection is not used. */ 912 KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " 913 "v_intr_vector %d", __func__, ctrl->v_intr_vector)); 914 } 915 916 CTASSERT(VMCB_EVENTINJ_TYPE_INTR == VM_INTINFO_HWINTR); 917 CTASSERT(VMCB_EVENTINJ_TYPE_NMI == VM_INTINFO_NMI); 918 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION == VM_INTINFO_HWEXCP); 919 CTASSERT(VMCB_EVENTINJ_TYPE_INTn == VM_INTINFO_SWINTR); 920 CTASSERT(VMCB_EVENTINJ_EC_VALID == VM_INTINFO_DEL_ERRCODE); 921 CTASSERT(VMCB_EVENTINJ_VALID == VM_INTINFO_VALID); 922 923 static void 924 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu) 925 { 926 struct vmcb_ctrl *ctrl; 927 uint64_t intinfo; 928 int err; 929 930 ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); 931 intinfo = ctrl->exitintinfo; 932 if (!VMCB_EXITINTINFO_VALID(intinfo)) 933 return; 934 935 /* 936 * From APMv2, Section "Intercepts during IDT interrupt delivery" 937 * 938 * If a #VMEXIT happened during event delivery then record the event 939 * that was being delivered. 940 */ 941 vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); 942 /* 943 * Relies on match between VMCB exitintinfo format and bhyve-generic 944 * format, which is ensured by CTASSERTs above. 945 */ 946 err = vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); 947 VERIFY0(err); 948 } 949 950 static __inline int 951 vintr_intercept_enabled(struct svm_softc *sc, int vcpu) 952 { 953 954 return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 955 VMCB_INTCPT_VINTR)); 956 } 957 958 static void 959 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu) 960 { 961 struct vmcb_ctrl *ctrl; 962 struct vmcb_state *state; 963 964 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 965 state = svm_get_vmcb_state(sc, vcpu); 966 967 if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) { 968 KASSERT(ctrl->v_intr_prio & V_IGN_TPR, 969 ("%s: invalid v_ign_tpr", __func__)); 970 KASSERT(vintr_intercept_enabled(sc, vcpu), 971 ("%s: vintr intercept should be enabled", __func__)); 972 return; 973 } 974 975 /* 976 * We use V_IRQ in conjunction with the VINTR intercept to trap into the 977 * hypervisor as soon as a virtual interrupt can be delivered. 978 * 979 * Since injected events are not subject to intercept checks we need to 980 * ensure that the V_IRQ is not actually going to be delivered on VM 981 * entry. 982 */ 983 VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || 984 (state->rflags & PSL_I) == 0 || ctrl->intr_shadow); 985 986 ctrl->v_irq |= V_IRQ; 987 ctrl->v_intr_prio |= V_IGN_TPR; 988 ctrl->v_intr_vector = 0; 989 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); 990 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); 991 } 992 993 static void 994 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu) 995 { 996 struct vmcb_ctrl *ctrl; 997 998 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 999 1000 if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) { 1001 KASSERT(!vintr_intercept_enabled(sc, vcpu), 1002 ("%s: vintr intercept should be disabled", __func__)); 1003 return; 1004 } 1005 1006 ctrl->v_irq &= ~V_IRQ; 1007 ctrl->v_intr_vector = 0; 1008 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); 1009 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); 1010 } 1011 1012 /* 1013 * Once an NMI is injected it blocks delivery of further NMIs until the handler 1014 * executes an IRET. The IRET intercept is enabled when an NMI is injected to 1015 * to track when the vcpu is done handling the NMI. 1016 */ 1017 static int 1018 svm_nmi_blocked(struct svm_softc *sc, int vcpu) 1019 { 1020 return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 1021 VMCB_INTCPT_IRET)); 1022 } 1023 1024 static void 1025 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu) 1026 { 1027 struct vmcb_ctrl *ctrl; 1028 1029 KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked")); 1030 /* 1031 * When the IRET intercept is cleared the vcpu will attempt to execute 1032 * the "iret" when it runs next. However, it is possible to inject 1033 * another NMI into the vcpu before the "iret" has actually executed. 1034 * 1035 * For e.g. if the "iret" encounters a #NPF when accessing the stack 1036 * it will trap back into the hypervisor. If an NMI is pending for 1037 * the vcpu it will be injected into the guest. 1038 * 1039 * XXX this needs to be fixed 1040 */ 1041 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); 1042 1043 /* 1044 * Set an interrupt shadow to prevent an NMI from being immediately 1045 * injected on the next VMRUN. 1046 */ 1047 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1048 ctrl->intr_shadow = 1; 1049 } 1050 1051 static void 1052 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info) 1053 { 1054 ASSERT(VM_INTINFO_PENDING(info)); 1055 1056 uint8_t vector = VM_INTINFO_VECTOR(info); 1057 uint32_t type = VM_INTINFO_TYPE(info); 1058 1059 /* 1060 * Correct behavior depends on bhyve intinfo event types lining up with 1061 * those defined by AMD for event injection in the VMCB. The CTASSERTs 1062 * above svm_save_exitintinfo() ensure it. 1063 */ 1064 switch (type) { 1065 case VM_INTINFO_NMI: 1066 /* Ensure vector for injected event matches its type (NMI) */ 1067 vector = IDT_NMI; 1068 break; 1069 case VM_INTINFO_HWINTR: 1070 case VM_INTINFO_SWINTR: 1071 break; 1072 case VM_INTINFO_HWEXCP: 1073 if (vector == IDT_NMI) { 1074 /* 1075 * NMIs are expected to be injected with 1076 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception 1077 * with the NMI vector. 1078 */ 1079 type = VM_INTINFO_NMI; 1080 } 1081 VERIFY(vector < 32); 1082 break; 1083 default: 1084 /* 1085 * Since there is not strong validation for injected event types 1086 * at this point, fall back to software interrupt for those we 1087 * do not recognized. 1088 */ 1089 type = VM_INTINFO_SWINTR; 1090 break; 1091 } 1092 1093 ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector; 1094 if (VM_INTINFO_HAS_ERRCODE(info)) { 1095 ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; 1096 ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32; 1097 } 1098 } 1099 1100 static void 1101 svm_inject_nmi(struct svm_softc *sc, int vcpu) 1102 { 1103 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1104 1105 ASSERT(!svm_nmi_blocked(sc, vcpu)); 1106 1107 ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI; 1108 vm_nmi_clear(sc->vm, vcpu); 1109 1110 /* 1111 * Virtual NMI blocking is now in effect. 1112 * 1113 * Not only does this block a subsequent NMI injection from taking 1114 * place, it also configures an intercept on the IRET so we can track 1115 * when the next injection can take place. 1116 */ 1117 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); 1118 } 1119 1120 static void 1121 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector) 1122 { 1123 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1124 1125 ASSERT(vector >= 0 && vector <= 255); 1126 1127 ctrl->eventinj = VMCB_EVENTINJ_VALID | vector; 1128 } 1129 1130 #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL 1131 1132 static vm_msr_result_t 1133 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval) 1134 { 1135 struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu); 1136 uint64_t lma; 1137 int error; 1138 1139 newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ 1140 1141 if (newval & EFER_MBZ_BITS) { 1142 return (VMR_GP); 1143 } 1144 1145 /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ 1146 const uint64_t changed = state->efer ^ newval; 1147 if (changed & EFER_LME) { 1148 if (state->cr0 & CR0_PG) { 1149 return (VMR_GP); 1150 } 1151 } 1152 1153 /* EFER.LMA = EFER.LME & CR0.PG */ 1154 if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) { 1155 lma = EFER_LMA; 1156 } else { 1157 lma = 0; 1158 } 1159 if ((newval & EFER_LMA) != lma) { 1160 return (VMR_GP); 1161 } 1162 1163 if ((newval & EFER_NXE) != 0 && 1164 !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) { 1165 return (VMR_GP); 1166 } 1167 if ((newval & EFER_FFXSR) != 0 && 1168 !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) { 1169 return (VMR_GP); 1170 } 1171 if ((newval & EFER_TCE) != 0 && 1172 !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) { 1173 return (VMR_GP); 1174 } 1175 1176 /* 1177 * Until bhyve has proper support for long-mode segment limits, just 1178 * toss a #GP at the guest if they attempt to use it. 1179 */ 1180 if (newval & EFER_LMSLE) { 1181 return (VMR_GP); 1182 } 1183 1184 error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); 1185 VERIFY0(error); 1186 return (VMR_OK); 1187 } 1188 1189 static int 1190 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, 1191 bool is_wrmsr) 1192 { 1193 struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu); 1194 struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu); 1195 const uint32_t ecx = ctx->sctx_rcx; 1196 vm_msr_result_t res; 1197 uint64_t val = 0; 1198 1199 if (is_wrmsr) { 1200 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); 1201 val = ctx->sctx_rdx << 32 | (uint32_t)state->rax; 1202 1203 if (vlapic_owned_msr(ecx)) { 1204 struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu); 1205 1206 res = vlapic_wrmsr(vlapic, ecx, val); 1207 } else if (ecx == MSR_EFER) { 1208 res = svm_write_efer(svm_sc, vcpu, val); 1209 } else { 1210 res = svm_wrmsr(svm_sc, vcpu, ecx, val); 1211 } 1212 } else { 1213 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); 1214 1215 if (vlapic_owned_msr(ecx)) { 1216 struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu); 1217 1218 res = vlapic_rdmsr(vlapic, ecx, &val); 1219 } else { 1220 res = svm_rdmsr(svm_sc, vcpu, ecx, &val); 1221 } 1222 } 1223 1224 switch (res) { 1225 case VMR_OK: 1226 /* Store rdmsr result in the appropriate registers */ 1227 if (!is_wrmsr) { 1228 state->rax = (uint32_t)val; 1229 ctx->sctx_rdx = val >> 32; 1230 } 1231 return (1); 1232 case VMR_GP: 1233 vm_inject_gp(svm_sc->vm, vcpu); 1234 return (1); 1235 case VMR_UNHANLDED: 1236 vmexit->exitcode = is_wrmsr ? 1237 VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR; 1238 vmexit->u.msr.code = ecx; 1239 vmexit->u.msr.wval = val; 1240 return (0); 1241 default: 1242 panic("unexpected msr result %u\n", res); 1243 } 1244 } 1245 1246 /* 1247 * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs 1248 * that are due to instruction intercepts as well as MSR and IOIO intercepts 1249 * and exceptions caused by INT3, INTO and BOUND instructions. 1250 * 1251 * Return 1 if the nRIP is valid and 0 otherwise. 1252 */ 1253 static int 1254 nrip_valid(uint64_t exitcode) 1255 { 1256 switch (exitcode) { 1257 case 0x00 ... 0x0F: /* read of CR0 through CR15 */ 1258 case 0x10 ... 0x1F: /* write of CR0 through CR15 */ 1259 case 0x20 ... 0x2F: /* read of DR0 through DR15 */ 1260 case 0x30 ... 0x3F: /* write of DR0 through DR15 */ 1261 case 0x43: /* INT3 */ 1262 case 0x44: /* INTO */ 1263 case 0x45: /* BOUND */ 1264 case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ 1265 case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ 1266 return (1); 1267 default: 1268 return (0); 1269 } 1270 } 1271 1272 static int 1273 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) 1274 { 1275 struct vmcb *vmcb; 1276 struct vmcb_state *state; 1277 struct vmcb_ctrl *ctrl; 1278 struct svm_regctx *ctx; 1279 uint64_t code, info1, info2; 1280 int handled; 1281 1282 ctx = svm_get_guest_regctx(svm_sc, vcpu); 1283 vmcb = svm_get_vmcb(svm_sc, vcpu); 1284 state = &vmcb->state; 1285 ctrl = &vmcb->ctrl; 1286 1287 handled = 0; 1288 code = ctrl->exitcode; 1289 info1 = ctrl->exitinfo1; 1290 info2 = ctrl->exitinfo2; 1291 1292 vmexit->exitcode = VM_EXITCODE_BOGUS; 1293 vmexit->rip = state->rip; 1294 vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; 1295 1296 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); 1297 1298 /* 1299 * #VMEXIT(INVALID) needs to be handled early because the VMCB is 1300 * in an inconsistent state and can trigger assertions that would 1301 * never happen otherwise. 1302 */ 1303 if (code == VMCB_EXIT_INVALID) { 1304 vm_exit_svm(vmexit, code, info1, info2); 1305 return (0); 1306 } 1307 1308 KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " 1309 "injection valid bit is set %lx", __func__, ctrl->eventinj)); 1310 1311 KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, 1312 ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)", 1313 vmexit->inst_length, code, info1, info2)); 1314 1315 svm_update_virqinfo(svm_sc, vcpu); 1316 svm_save_exitintinfo(svm_sc, vcpu); 1317 1318 switch (code) { 1319 case VMCB_EXIT_CR0_READ: 1320 if (VMCB_CRx_INFO1_VALID(info1) != 0) { 1321 svm_handle_cr0_read(svm_sc, vcpu, 1322 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1))); 1323 handled = 1; 1324 } else { 1325 /* 1326 * If SMSW is used to read the contents of %cr0, then 1327 * the VALID bit will not be set in `info1`, since the 1328 * handling is different from the mov-to-reg case. 1329 * 1330 * Punt to the instruction emulation to handle it. 1331 */ 1332 svm_inst_emul_other(svm_sc, vcpu, vmexit); 1333 } 1334 break; 1335 case VMCB_EXIT_CR0_WRITE: 1336 case VMCB_EXIT_CR0_SEL_WRITE: 1337 if (VMCB_CRx_INFO1_VALID(info1) != 0) { 1338 svm_handle_cr0_write(svm_sc, vcpu, 1339 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1))); 1340 handled = 1; 1341 } else { 1342 /* 1343 * Writes to %cr0 without VALID being set in `info1` are 1344 * initiated by the LMSW and CLTS instructions. While 1345 * LMSW (like SMSW) sees little use in modern OSes and 1346 * bootloaders, CLTS is still used for handling FPU 1347 * state transitions. 1348 * 1349 * Punt to the instruction emulation to handle them. 1350 */ 1351 svm_inst_emul_other(svm_sc, vcpu, vmexit); 1352 } 1353 break; 1354 case VMCB_EXIT_IRET: 1355 /* 1356 * Restart execution at "iret" but with the intercept cleared. 1357 */ 1358 vmexit->inst_length = 0; 1359 svm_clear_nmi_blocking(svm_sc, vcpu); 1360 handled = 1; 1361 break; 1362 case VMCB_EXIT_VINTR: /* interrupt window exiting */ 1363 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); 1364 svm_disable_intr_window_exiting(svm_sc, vcpu); 1365 handled = 1; 1366 break; 1367 case VMCB_EXIT_INTR: /* external interrupt */ 1368 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); 1369 handled = 1; 1370 break; 1371 case VMCB_EXIT_NMI: 1372 case VMCB_EXIT_SMI: 1373 case VMCB_EXIT_INIT: 1374 /* 1375 * For external NMI/SMI and physical INIT interrupts, simply 1376 * continue execution, as those host events will be handled by 1377 * the physical CPU. 1378 */ 1379 handled = 1; 1380 break; 1381 case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: { 1382 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); 1383 1384 const uint8_t idtvec = code - VMCB_EXIT_EXCP0; 1385 uint32_t errcode = 0; 1386 bool reflect = true; 1387 bool errcode_valid = false; 1388 1389 switch (idtvec) { 1390 case IDT_MC: 1391 /* The host will handle the MCE itself. */ 1392 reflect = false; 1393 vmm_call_trap(T_MCE); 1394 break; 1395 case IDT_PF: 1396 VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, 1397 info2)); 1398 /* fallthru */ 1399 case IDT_NP: 1400 case IDT_SS: 1401 case IDT_GP: 1402 case IDT_AC: 1403 case IDT_TS: 1404 errcode_valid = true; 1405 errcode = info1; 1406 break; 1407 1408 case IDT_DF: 1409 errcode_valid = true; 1410 break; 1411 1412 case IDT_BP: 1413 case IDT_OF: 1414 case IDT_BR: 1415 /* 1416 * The 'nrip' field is populated for INT3, INTO and 1417 * BOUND exceptions and this also implies that 1418 * 'inst_length' is non-zero. 1419 * 1420 * Reset 'inst_length' to zero so the guest %rip at 1421 * event injection is identical to what it was when 1422 * the exception originally happened. 1423 */ 1424 vmexit->inst_length = 0; 1425 /* fallthru */ 1426 default: 1427 errcode_valid = false; 1428 break; 1429 } 1430 VERIFY0(vmexit->inst_length); 1431 1432 if (reflect) { 1433 /* Reflect the exception back into the guest */ 1434 VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec, 1435 errcode_valid, errcode, false)); 1436 } 1437 handled = 1; 1438 break; 1439 } 1440 case VMCB_EXIT_MSR: 1441 handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0); 1442 break; 1443 case VMCB_EXIT_IO: 1444 handled = svm_handle_inout(svm_sc, vcpu, vmexit); 1445 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); 1446 break; 1447 case VMCB_EXIT_SHUTDOWN: 1448 (void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT); 1449 handled = 1; 1450 break; 1451 case VMCB_EXIT_INVD: 1452 case VMCB_EXIT_INVLPGA: 1453 /* privileged invalidation instructions */ 1454 vm_inject_ud(svm_sc->vm, vcpu); 1455 handled = 1; 1456 break; 1457 case VMCB_EXIT_VMRUN: 1458 case VMCB_EXIT_VMLOAD: 1459 case VMCB_EXIT_VMSAVE: 1460 case VMCB_EXIT_STGI: 1461 case VMCB_EXIT_CLGI: 1462 case VMCB_EXIT_SKINIT: 1463 /* privileged vmm instructions */ 1464 vm_inject_ud(svm_sc->vm, vcpu); 1465 handled = 1; 1466 break; 1467 case VMCB_EXIT_VMMCALL: 1468 /* No handlers make use of VMMCALL for now */ 1469 vm_inject_ud(svm_sc->vm, vcpu); 1470 handled = 1; 1471 break; 1472 case VMCB_EXIT_CPUID: 1473 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); 1474 vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax, 1475 &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx); 1476 handled = 1; 1477 break; 1478 case VMCB_EXIT_HLT: 1479 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); 1480 vmexit->exitcode = VM_EXITCODE_HLT; 1481 vmexit->u.hlt.rflags = state->rflags; 1482 break; 1483 case VMCB_EXIT_PAUSE: 1484 vmexit->exitcode = VM_EXITCODE_PAUSE; 1485 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); 1486 break; 1487 case VMCB_EXIT_NPF: 1488 /* EXITINFO2 contains the faulting guest physical address */ 1489 if (info1 & VMCB_NPF_INFO1_RSV) { 1490 /* nested fault with reserved bits set */ 1491 } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { 1492 vmexit->exitcode = VM_EXITCODE_PAGING; 1493 vmexit->u.paging.gpa = info2; 1494 vmexit->u.paging.fault_type = npf_fault_type(info1); 1495 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); 1496 } else if (svm_npf_emul_fault(info1)) { 1497 svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2); 1498 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1); 1499 } 1500 break; 1501 case VMCB_EXIT_MONITOR: 1502 vmexit->exitcode = VM_EXITCODE_MONITOR; 1503 break; 1504 case VMCB_EXIT_MWAIT: 1505 vmexit->exitcode = VM_EXITCODE_MWAIT; 1506 break; 1507 default: 1508 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); 1509 break; 1510 } 1511 1512 DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t, 1513 code); 1514 1515 if (handled) { 1516 vmexit->rip += vmexit->inst_length; 1517 vmexit->inst_length = 0; 1518 state->rip = vmexit->rip; 1519 } else { 1520 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1521 /* 1522 * If this VM exit was not claimed by anybody then 1523 * treat it as a generic SVM exit. 1524 */ 1525 vm_exit_svm(vmexit, code, info1, info2); 1526 } else { 1527 /* 1528 * The exitcode and collateral have been populated. 1529 * The VM exit will be processed further in userland. 1530 */ 1531 } 1532 } 1533 return (handled); 1534 } 1535 1536 /* 1537 * Inject exceptions, NMIs, and ExtINTs. 1538 * 1539 * The logic behind these are complicated and may involve mutex contention, so 1540 * the injection is performed without the protection of host CPU interrupts 1541 * being disabled. This means a racing notification could be "lost", 1542 * necessitating a later call to svm_inject_recheck() to close that window 1543 * of opportunity. 1544 */ 1545 static enum event_inject_state 1546 svm_inject_events(struct svm_softc *sc, int vcpu) 1547 { 1548 struct vmcb_ctrl *ctrl; 1549 struct vmcb_state *state; 1550 struct svm_vcpu *vcpustate; 1551 uint64_t intinfo; 1552 enum event_inject_state ev_state; 1553 1554 state = svm_get_vmcb_state(sc, vcpu); 1555 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1556 vcpustate = svm_get_vcpu(sc, vcpu); 1557 ev_state = EIS_CAN_INJECT; 1558 1559 /* Clear any interrupt shadow if guest %rip has changed */ 1560 if (vcpustate->nextrip != state->rip) { 1561 ctrl->intr_shadow = 0; 1562 } 1563 1564 /* 1565 * An event is already pending for injection. This can occur when the 1566 * vCPU exits prior to VM entry (like for an AST). 1567 */ 1568 if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { 1569 return (EIS_EV_EXISTING | EIS_REQ_EXIT); 1570 } 1571 1572 /* 1573 * Inject pending events or exceptions for this vcpu. 1574 * 1575 * An event might be pending because the previous #VMEXIT happened 1576 * during event delivery (i.e. ctrl->exitintinfo). 1577 * 1578 * An event might also be pending because an exception was injected 1579 * by the hypervisor (e.g. #PF during instruction emulation). 1580 */ 1581 if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) { 1582 svm_inject_event(ctrl, intinfo); 1583 vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); 1584 ev_state = EIS_EV_INJECTED; 1585 } 1586 1587 /* NMI event has priority over interrupts. */ 1588 if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) { 1589 if (ev_state == EIS_CAN_INJECT) { 1590 /* Can't inject NMI if vcpu is in an intr_shadow. */ 1591 if (ctrl->intr_shadow) { 1592 return (EIS_GI_BLOCK); 1593 } 1594 1595 svm_inject_nmi(sc, vcpu); 1596 ev_state = EIS_EV_INJECTED; 1597 } else { 1598 return (ev_state | EIS_REQ_EXIT); 1599 } 1600 } 1601 1602 if (vm_extint_pending(sc->vm, vcpu)) { 1603 int vector; 1604 1605 if (ev_state != EIS_CAN_INJECT) { 1606 return (ev_state | EIS_REQ_EXIT); 1607 } 1608 1609 /* 1610 * If the guest has disabled interrupts or is in an interrupt 1611 * shadow then we cannot inject the pending interrupt. 1612 */ 1613 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { 1614 return (EIS_GI_BLOCK); 1615 } 1616 1617 /* Ask the legacy pic for a vector to inject */ 1618 vatpic_pending_intr(sc->vm, &vector); 1619 KASSERT(vector >= 0 && vector <= 255, 1620 ("invalid vector %d from INTR", vector)); 1621 1622 svm_inject_irq(sc, vcpu, vector); 1623 vm_extint_clear(sc->vm, vcpu); 1624 vatpic_intr_accepted(sc->vm, vector); 1625 ev_state = EIS_EV_INJECTED; 1626 } 1627 1628 return (ev_state); 1629 } 1630 1631 /* 1632 * Synchronize vLAPIC state and inject any interrupts pending on it. 1633 * 1634 * This is done with host CPU interrupts disabled so notification IPIs will be 1635 * queued on the host APIC and recognized when entering SVM guest context. 1636 */ 1637 static enum event_inject_state 1638 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic, 1639 enum event_inject_state ev_state) 1640 { 1641 struct vmcb_ctrl *ctrl; 1642 struct vmcb_state *state; 1643 int vector; 1644 uint8_t v_tpr; 1645 1646 state = svm_get_vmcb_state(sc, vcpu); 1647 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1648 1649 /* 1650 * The guest can modify the TPR by writing to %cr8. In guest mode the 1651 * CPU reflects this write to V_TPR without hypervisor intervention. 1652 * 1653 * The guest can also modify the TPR by writing to it via the memory 1654 * mapped APIC page. In this case, the write will be emulated by the 1655 * hypervisor. For this reason V_TPR must be updated before every 1656 * VMRUN. 1657 */ 1658 v_tpr = vlapic_get_cr8(vlapic); 1659 KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr)); 1660 if (ctrl->v_tpr != v_tpr) { 1661 ctrl->v_tpr = v_tpr; 1662 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); 1663 } 1664 1665 /* If an event cannot otherwise be injected, we are done for now */ 1666 if (ev_state != EIS_CAN_INJECT) { 1667 return (ev_state); 1668 } 1669 1670 if (!vlapic_pending_intr(vlapic, &vector)) { 1671 return (EIS_CAN_INJECT); 1672 } 1673 KASSERT(vector >= 16 && vector <= 255, 1674 ("invalid vector %d from local APIC", vector)); 1675 1676 /* 1677 * If the guest has disabled interrupts or is in an interrupt shadow 1678 * then we cannot inject the pending interrupt. 1679 */ 1680 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { 1681 return (EIS_GI_BLOCK); 1682 } 1683 1684 svm_inject_irq(sc, vcpu, vector); 1685 vlapic_intr_accepted(vlapic, vector); 1686 return (EIS_EV_INJECTED); 1687 } 1688 1689 /* 1690 * Re-check for events to be injected. 1691 * 1692 * Once host CPU interrupts are disabled, check for the presence of any events 1693 * which require injection processing. If an exit is required upon injection, 1694 * or once the guest becomes interruptable, that will be configured too. 1695 */ 1696 static bool 1697 svm_inject_recheck(struct svm_softc *sc, int vcpu, 1698 enum event_inject_state ev_state) 1699 { 1700 struct vmcb_ctrl *ctrl; 1701 1702 ctrl = svm_get_vmcb_ctrl(sc, vcpu); 1703 1704 if (ev_state == EIS_CAN_INJECT) { 1705 /* 1706 * An active interrupt shadow would preclude us from injecting 1707 * any events picked up during a re-check. 1708 */ 1709 if (ctrl->intr_shadow != 0) { 1710 return (false); 1711 } 1712 1713 if (vm_nmi_pending(sc->vm, vcpu) && 1714 !svm_nmi_blocked(sc, vcpu)) { 1715 /* queued NMI not blocked by NMI-window-exiting */ 1716 return (true); 1717 } 1718 if (vm_extint_pending(sc->vm, vcpu)) { 1719 /* queued ExtINT not blocked by existing injection */ 1720 return (true); 1721 } 1722 } else { 1723 if ((ev_state & EIS_REQ_EXIT) != 0) { 1724 /* 1725 * Use a self-IPI to force an immediate exit after 1726 * event injection has occurred. 1727 */ 1728 poke_cpu(CPU->cpu_id); 1729 } else { 1730 /* 1731 * If any event is being injected, an exit immediately 1732 * upon becoming interruptable again will allow pending 1733 * or newly queued events to be injected in a timely 1734 * manner. 1735 */ 1736 svm_enable_intr_window_exiting(sc, vcpu); 1737 } 1738 } 1739 return (false); 1740 } 1741 1742 1743 static void 1744 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen) 1745 { 1746 struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); 1747 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); 1748 uint8_t flush; 1749 1750 flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), 1751 vcpustate->nptgen != nptgen); 1752 1753 if (flush != VMCB_TLB_FLUSH_NOTHING) { 1754 ctrl->asid = vcpustate->hma_asid.hsa_asid; 1755 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); 1756 } 1757 ctrl->tlb_ctrl = flush; 1758 vcpustate->nptgen = nptgen; 1759 } 1760 1761 static void 1762 flush_asid(struct svm_softc *sc, int vcpuid) 1763 { 1764 struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); 1765 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); 1766 uint8_t flush; 1767 1768 flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), 1769 true); 1770 1771 ASSERT(flush != VMCB_TLB_FLUSH_NOTHING); 1772 ctrl->asid = vcpustate->hma_asid.hsa_asid; 1773 ctrl->tlb_ctrl = flush; 1774 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); 1775 /* 1776 * A potential future optimization: We could choose to update the nptgen 1777 * associated with the vCPU, since any pending nptgen change requiring a 1778 * flush will be satisfied by the one which has just now been queued. 1779 */ 1780 } 1781 1782 static __inline void 1783 disable_gintr(void) 1784 { 1785 __asm __volatile("clgi"); 1786 } 1787 1788 static __inline void 1789 enable_gintr(void) 1790 { 1791 __asm __volatile("stgi"); 1792 } 1793 1794 static __inline void 1795 svm_dr_enter_guest(struct svm_regctx *gctx) 1796 { 1797 1798 /* Save host control debug registers. */ 1799 gctx->host_dr7 = rdr7(); 1800 gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); 1801 1802 /* 1803 * Disable debugging in DR7 and DEBUGCTL to avoid triggering 1804 * exceptions in the host based on the guest DRx values. The 1805 * guest DR6, DR7, and DEBUGCTL are saved/restored in the 1806 * VMCB. 1807 */ 1808 load_dr7(0); 1809 wrmsr(MSR_DEBUGCTLMSR, 0); 1810 1811 /* Save host debug registers. */ 1812 gctx->host_dr0 = rdr0(); 1813 gctx->host_dr1 = rdr1(); 1814 gctx->host_dr2 = rdr2(); 1815 gctx->host_dr3 = rdr3(); 1816 gctx->host_dr6 = rdr6(); 1817 1818 /* Restore guest debug registers. */ 1819 load_dr0(gctx->sctx_dr0); 1820 load_dr1(gctx->sctx_dr1); 1821 load_dr2(gctx->sctx_dr2); 1822 load_dr3(gctx->sctx_dr3); 1823 } 1824 1825 static __inline void 1826 svm_dr_leave_guest(struct svm_regctx *gctx) 1827 { 1828 1829 /* Save guest debug registers. */ 1830 gctx->sctx_dr0 = rdr0(); 1831 gctx->sctx_dr1 = rdr1(); 1832 gctx->sctx_dr2 = rdr2(); 1833 gctx->sctx_dr3 = rdr3(); 1834 1835 /* 1836 * Restore host debug registers. Restore DR7 and DEBUGCTL 1837 * last. 1838 */ 1839 load_dr0(gctx->host_dr0); 1840 load_dr1(gctx->host_dr1); 1841 load_dr2(gctx->host_dr2); 1842 load_dr3(gctx->host_dr3); 1843 load_dr6(gctx->host_dr6); 1844 wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); 1845 load_dr7(gctx->host_dr7); 1846 } 1847 1848 static void 1849 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid) 1850 { 1851 const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true); 1852 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid); 1853 1854 if (ctrl->tsc_offset != offset) { 1855 ctrl->tsc_offset = offset; 1856 svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I); 1857 } 1858 } 1859 1860 1861 /* 1862 * Start vcpu with specified RIP. 1863 */ 1864 static int 1865 svm_vmrun(void *arg, int vcpu, uint64_t rip) 1866 { 1867 struct svm_regctx *gctx; 1868 struct svm_softc *svm_sc; 1869 struct svm_vcpu *vcpustate; 1870 struct vmcb_state *state; 1871 struct vmcb_ctrl *ctrl; 1872 struct vm_exit *vmexit; 1873 struct vlapic *vlapic; 1874 vm_client_t *vmc; 1875 struct vm *vm; 1876 uint64_t vmcb_pa; 1877 int handled; 1878 uint16_t ldt_sel; 1879 1880 svm_sc = arg; 1881 vm = svm_sc->vm; 1882 1883 vcpustate = svm_get_vcpu(svm_sc, vcpu); 1884 state = svm_get_vmcb_state(svm_sc, vcpu); 1885 ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); 1886 vmexit = vm_exitinfo(vm, vcpu); 1887 vlapic = vm_lapic(vm, vcpu); 1888 vmc = vm_get_vmclient(vm, vcpu); 1889 1890 gctx = svm_get_guest_regctx(svm_sc, vcpu); 1891 vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; 1892 1893 if (vcpustate->lastcpu != curcpu) { 1894 /* 1895 * Force new ASID allocation by invalidating the generation. 1896 */ 1897 vcpustate->hma_asid.hsa_gen = 0; 1898 1899 /* 1900 * Invalidate the VMCB state cache by marking all fields dirty. 1901 */ 1902 svm_set_dirty(svm_sc, vcpu, 0xffffffff); 1903 1904 /* 1905 * XXX 1906 * Setting 'vcpustate->lastcpu' here is bit premature because 1907 * we may return from this function without actually executing 1908 * the VMRUN instruction. This could happen if an AST or yield 1909 * condition is pending on the first time through the loop. 1910 * 1911 * This works for now but any new side-effects of vcpu 1912 * migration should take this case into account. 1913 */ 1914 vcpustate->lastcpu = curcpu; 1915 vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); 1916 } 1917 1918 svm_apply_tsc_adjust(svm_sc, vcpu); 1919 1920 svm_msr_guest_enter(svm_sc, vcpu); 1921 1922 VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); 1923 vcpustate->loaded = B_TRUE; 1924 1925 /* Update Guest RIP */ 1926 state->rip = rip; 1927 1928 do { 1929 enum event_inject_state inject_state; 1930 uint64_t nptgen; 1931 1932 /* 1933 * Initial event injection is complex and may involve mutex 1934 * contention, so it must be performed with global interrupts 1935 * still enabled. 1936 */ 1937 inject_state = svm_inject_events(svm_sc, vcpu); 1938 handled = 0; 1939 1940 /* 1941 * Disable global interrupts to guarantee atomicity during 1942 * loading of guest state. This includes not only the state 1943 * loaded by the "vmrun" instruction but also software state 1944 * maintained by the hypervisor: suspended and rendezvous 1945 * state, NPT generation number, vlapic interrupts etc. 1946 */ 1947 disable_gintr(); 1948 1949 /* 1950 * Synchronizing and injecting vlapic state is lock-free and is 1951 * safe (and prudent) to perform with interrupts disabled. 1952 */ 1953 inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic, 1954 inject_state); 1955 1956 /* 1957 * Check for vCPU bail-out conditions. This must be done after 1958 * svm_inject_events() to detect a triple-fault condition. 1959 */ 1960 if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) { 1961 enable_gintr(); 1962 break; 1963 } 1964 1965 if (vcpu_run_state_pending(vm, vcpu)) { 1966 enable_gintr(); 1967 vm_exit_run_state(vm, vcpu, state->rip); 1968 break; 1969 } 1970 1971 /* 1972 * If subsequent activity queued events which require injection 1973 * handling, take another lap to handle them. 1974 */ 1975 if (svm_inject_recheck(svm_sc, vcpu, inject_state)) { 1976 enable_gintr(); 1977 handled = 1; 1978 continue; 1979 } 1980 1981 /* 1982 * #VMEXIT resumes the host with the guest LDTR, so 1983 * save the current LDT selector so it can be restored 1984 * after an exit. The userspace hypervisor probably 1985 * doesn't use a LDT, but save and restore it to be 1986 * safe. 1987 */ 1988 ldt_sel = sldt(); 1989 1990 /* 1991 * Check the vmspace and ASID generations to ensure that the 1992 * vcpu does not use stale TLB mappings. 1993 */ 1994 nptgen = vmc_table_enter(vmc); 1995 check_asid(svm_sc, vcpu, curcpu, nptgen); 1996 1997 ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; 1998 vcpustate->dirty = 0; 1999 2000 /* Launch Virtual Machine. */ 2001 vcpu_ustate_change(vm, vcpu, VU_RUN); 2002 svm_dr_enter_guest(gctx); 2003 svm_launch(vmcb_pa, gctx, get_pcpu()); 2004 svm_dr_leave_guest(gctx); 2005 vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); 2006 2007 /* Restore host LDTR. */ 2008 lldt(ldt_sel); 2009 2010 /* #VMEXIT disables interrupts so re-enable them here. */ 2011 enable_gintr(); 2012 2013 vmc_table_exit(vmc); 2014 2015 /* Update 'nextrip' */ 2016 vcpustate->nextrip = state->rip; 2017 2018 /* Handle #VMEXIT and if required return to user space. */ 2019 handled = svm_vmexit(svm_sc, vcpu, vmexit); 2020 } while (handled); 2021 2022 svm_msr_guest_exit(svm_sc, vcpu); 2023 2024 VERIFY(vcpustate->loaded && curthread->t_preempt != 0); 2025 vcpustate->loaded = B_FALSE; 2026 2027 return (0); 2028 } 2029 2030 static void 2031 svm_vmcleanup(void *arg) 2032 { 2033 struct svm_softc *sc = arg; 2034 2035 vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE); 2036 vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE); 2037 kmem_free(sc, sizeof (*sc)); 2038 } 2039 2040 static uint64_t * 2041 swctx_regptr(struct svm_regctx *regctx, int reg) 2042 { 2043 switch (reg) { 2044 case VM_REG_GUEST_RBX: 2045 return (®ctx->sctx_rbx); 2046 case VM_REG_GUEST_RCX: 2047 return (®ctx->sctx_rcx); 2048 case VM_REG_GUEST_RDX: 2049 return (®ctx->sctx_rdx); 2050 case VM_REG_GUEST_RDI: 2051 return (®ctx->sctx_rdi); 2052 case VM_REG_GUEST_RSI: 2053 return (®ctx->sctx_rsi); 2054 case VM_REG_GUEST_RBP: 2055 return (®ctx->sctx_rbp); 2056 case VM_REG_GUEST_R8: 2057 return (®ctx->sctx_r8); 2058 case VM_REG_GUEST_R9: 2059 return (®ctx->sctx_r9); 2060 case VM_REG_GUEST_R10: 2061 return (®ctx->sctx_r10); 2062 case VM_REG_GUEST_R11: 2063 return (®ctx->sctx_r11); 2064 case VM_REG_GUEST_R12: 2065 return (®ctx->sctx_r12); 2066 case VM_REG_GUEST_R13: 2067 return (®ctx->sctx_r13); 2068 case VM_REG_GUEST_R14: 2069 return (®ctx->sctx_r14); 2070 case VM_REG_GUEST_R15: 2071 return (®ctx->sctx_r15); 2072 case VM_REG_GUEST_DR0: 2073 return (®ctx->sctx_dr0); 2074 case VM_REG_GUEST_DR1: 2075 return (®ctx->sctx_dr1); 2076 case VM_REG_GUEST_DR2: 2077 return (®ctx->sctx_dr2); 2078 case VM_REG_GUEST_DR3: 2079 return (®ctx->sctx_dr3); 2080 default: 2081 return (NULL); 2082 } 2083 } 2084 2085 static int 2086 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) 2087 { 2088 struct svm_softc *sc; 2089 struct vmcb *vmcb; 2090 uint64_t *regp; 2091 uint64_t *fieldp; 2092 struct vmcb_segment *seg; 2093 2094 sc = arg; 2095 vmcb = svm_get_vmcb(sc, vcpu); 2096 2097 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); 2098 if (regp != NULL) { 2099 *val = *regp; 2100 return (0); 2101 } 2102 2103 switch (ident) { 2104 case VM_REG_GUEST_INTR_SHADOW: 2105 *val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0; 2106 break; 2107 2108 case VM_REG_GUEST_CR0: 2109 svm_get_cr0(sc, vcpu, val); 2110 break; 2111 case VM_REG_GUEST_CR2: 2112 case VM_REG_GUEST_CR3: 2113 case VM_REG_GUEST_CR4: 2114 case VM_REG_GUEST_DR6: 2115 case VM_REG_GUEST_DR7: 2116 case VM_REG_GUEST_EFER: 2117 case VM_REG_GUEST_RAX: 2118 case VM_REG_GUEST_RFLAGS: 2119 case VM_REG_GUEST_RIP: 2120 case VM_REG_GUEST_RSP: 2121 fieldp = vmcb_regptr(vmcb, ident, NULL); 2122 *val = *fieldp; 2123 break; 2124 2125 case VM_REG_GUEST_CS: 2126 case VM_REG_GUEST_DS: 2127 case VM_REG_GUEST_ES: 2128 case VM_REG_GUEST_FS: 2129 case VM_REG_GUEST_GS: 2130 case VM_REG_GUEST_SS: 2131 case VM_REG_GUEST_LDTR: 2132 case VM_REG_GUEST_TR: 2133 seg = vmcb_segptr(vmcb, ident); 2134 *val = seg->selector; 2135 break; 2136 2137 case VM_REG_GUEST_GDTR: 2138 case VM_REG_GUEST_IDTR: 2139 /* GDTR and IDTR don't have segment selectors */ 2140 return (EINVAL); 2141 2142 case VM_REG_GUEST_PDPTE0: 2143 case VM_REG_GUEST_PDPTE1: 2144 case VM_REG_GUEST_PDPTE2: 2145 case VM_REG_GUEST_PDPTE3: 2146 /* 2147 * Unlike VMX, where the PDPTEs are explicitly cached as part of 2148 * several well-defined events related to paging (such as 2149 * loading %cr3), SVM walks the PDPEs (their PDPTE) as part of 2150 * nested paging lookups. This makes these registers 2151 * effectively irrelevant on SVM. 2152 * 2153 * Rather than tossing an error, emit zeroed values so casual 2154 * consumers do not need to be as careful about that difference. 2155 */ 2156 *val = 0; 2157 break; 2158 2159 default: 2160 return (EINVAL); 2161 } 2162 2163 return (0); 2164 } 2165 2166 static int 2167 svm_setreg(void *arg, int vcpu, int ident, uint64_t val) 2168 { 2169 struct svm_softc *sc; 2170 struct vmcb *vmcb; 2171 uint64_t *regp; 2172 uint64_t *fieldp; 2173 uint32_t dirty; 2174 struct vmcb_segment *seg; 2175 2176 sc = arg; 2177 vmcb = svm_get_vmcb(sc, vcpu); 2178 2179 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); 2180 if (regp != NULL) { 2181 *regp = val; 2182 return (0); 2183 } 2184 2185 dirty = VMCB_CACHE_NONE; 2186 switch (ident) { 2187 case VM_REG_GUEST_INTR_SHADOW: 2188 vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0; 2189 break; 2190 2191 case VM_REG_GUEST_EFER: 2192 fieldp = vmcb_regptr(vmcb, ident, &dirty); 2193 /* EFER_SVM must always be set when the guest is executing */ 2194 *fieldp = val | EFER_SVM; 2195 dirty |= VMCB_CACHE_CR; 2196 break; 2197 2198 case VM_REG_GUEST_CR0: 2199 svm_set_cr0(sc, vcpu, val, false); 2200 break; 2201 case VM_REG_GUEST_CR2: 2202 case VM_REG_GUEST_CR3: 2203 case VM_REG_GUEST_CR4: 2204 case VM_REG_GUEST_DR6: 2205 case VM_REG_GUEST_DR7: 2206 case VM_REG_GUEST_RAX: 2207 case VM_REG_GUEST_RFLAGS: 2208 case VM_REG_GUEST_RIP: 2209 case VM_REG_GUEST_RSP: 2210 fieldp = vmcb_regptr(vmcb, ident, &dirty); 2211 *fieldp = val; 2212 break; 2213 2214 case VM_REG_GUEST_CS: 2215 case VM_REG_GUEST_DS: 2216 case VM_REG_GUEST_ES: 2217 case VM_REG_GUEST_SS: 2218 case VM_REG_GUEST_FS: 2219 case VM_REG_GUEST_GS: 2220 case VM_REG_GUEST_LDTR: 2221 case VM_REG_GUEST_TR: 2222 dirty |= VMCB_CACHE_SEG; 2223 seg = vmcb_segptr(vmcb, ident); 2224 seg->selector = (uint16_t)val; 2225 break; 2226 2227 case VM_REG_GUEST_GDTR: 2228 case VM_REG_GUEST_IDTR: 2229 /* GDTR and IDTR don't have segment selectors */ 2230 return (EINVAL); 2231 2232 case VM_REG_GUEST_PDPTE0: 2233 case VM_REG_GUEST_PDPTE1: 2234 case VM_REG_GUEST_PDPTE2: 2235 case VM_REG_GUEST_PDPTE3: 2236 /* 2237 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can 2238 * ignore attempts to set them. See handler in svm_getreg() for 2239 * more details. 2240 */ 2241 break; 2242 2243 default: 2244 return (EINVAL); 2245 } 2246 2247 if (dirty != VMCB_CACHE_NONE) { 2248 svm_set_dirty(sc, vcpu, dirty); 2249 } 2250 2251 /* 2252 * XXX deal with CR3 and invalidate TLB entries tagged with the 2253 * vcpu's ASID. This needs to be treated differently depending on 2254 * whether 'running' is true/false. 2255 */ 2256 2257 return (0); 2258 } 2259 2260 static int 2261 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) 2262 { 2263 struct vmcb *vmcb; 2264 struct svm_softc *sc; 2265 struct vmcb_segment *seg; 2266 2267 sc = arg; 2268 vmcb = svm_get_vmcb(sc, vcpu); 2269 2270 switch (reg) { 2271 case VM_REG_GUEST_CS: 2272 case VM_REG_GUEST_DS: 2273 case VM_REG_GUEST_ES: 2274 case VM_REG_GUEST_SS: 2275 case VM_REG_GUEST_FS: 2276 case VM_REG_GUEST_GS: 2277 case VM_REG_GUEST_LDTR: 2278 case VM_REG_GUEST_TR: 2279 svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); 2280 seg = vmcb_segptr(vmcb, reg); 2281 /* 2282 * Map seg_desc access to VMCB attribute format. 2283 * 2284 * SVM uses the 'P' bit in the segment attributes to indicate a 2285 * NULL segment so clear it if the segment is marked unusable. 2286 */ 2287 seg->attrib = VMCB_ACCESS2ATTR(desc->access); 2288 if (SEG_DESC_UNUSABLE(desc->access)) { 2289 seg->attrib &= ~0x80; 2290 } 2291 /* 2292 * Keep CPL synced with the DPL specified for %ss. 2293 * 2294 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD 2295 * (unlike Intel), but accepts such a possible deviation for 2296 * what is otherwise unreasonable behavior for a guest OS, since 2297 * they do the same synchronization. 2298 */ 2299 if (reg == VM_REG_GUEST_SS) { 2300 vmcb->state.cpl = SEG_DESC_DPL(desc->access); 2301 } 2302 break; 2303 2304 case VM_REG_GUEST_GDTR: 2305 case VM_REG_GUEST_IDTR: 2306 svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); 2307 seg = vmcb_segptr(vmcb, reg); 2308 break; 2309 2310 default: 2311 return (EINVAL); 2312 } 2313 2314 ASSERT(seg != NULL); 2315 seg->base = desc->base; 2316 seg->limit = desc->limit; 2317 2318 return (0); 2319 } 2320 2321 static int 2322 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2323 { 2324 struct vmcb *vmcb; 2325 struct svm_softc *sc; 2326 struct vmcb_segment *seg; 2327 2328 sc = arg; 2329 vmcb = svm_get_vmcb(sc, vcpu); 2330 2331 switch (reg) { 2332 case VM_REG_GUEST_DS: 2333 case VM_REG_GUEST_ES: 2334 case VM_REG_GUEST_FS: 2335 case VM_REG_GUEST_GS: 2336 case VM_REG_GUEST_SS: 2337 case VM_REG_GUEST_LDTR: 2338 seg = vmcb_segptr(vmcb, reg); 2339 desc->access = VMCB_ATTR2ACCESS(seg->attrib); 2340 /* 2341 * VT-x uses bit 16 to indicate a segment that has been loaded 2342 * with a NULL selector (aka unusable). The 'desc->access' 2343 * field is interpreted in the VT-x format by the 2344 * processor-independent code. 2345 * 2346 * SVM uses the 'P' bit to convey the same information so 2347 * convert it into the VT-x format. For more details refer to 2348 * section "Segment State in the VMCB" in APMv2. 2349 */ 2350 if ((desc->access & 0x80) == 0) { 2351 /* Unusable segment */ 2352 desc->access |= 0x10000; 2353 } 2354 break; 2355 2356 case VM_REG_GUEST_CS: 2357 case VM_REG_GUEST_TR: 2358 seg = vmcb_segptr(vmcb, reg); 2359 desc->access = VMCB_ATTR2ACCESS(seg->attrib); 2360 break; 2361 2362 case VM_REG_GUEST_GDTR: 2363 case VM_REG_GUEST_IDTR: 2364 seg = vmcb_segptr(vmcb, reg); 2365 /* 2366 * Since there are no access bits associated with the GDTR or 2367 * the IDTR, zero out the field to ensure it does not contain 2368 * garbage which might confuse the consumer. 2369 */ 2370 desc->access = 0; 2371 break; 2372 2373 default: 2374 return (EINVAL); 2375 } 2376 2377 ASSERT(seg != NULL); 2378 desc->base = seg->base; 2379 desc->limit = seg->limit; 2380 return (0); 2381 } 2382 2383 static int 2384 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp) 2385 { 2386 struct svm_softc *sc = arg; 2387 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); 2388 const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL); 2389 2390 if (msrp != NULL) { 2391 *valp = *msrp; 2392 return (0); 2393 } 2394 2395 return (EINVAL); 2396 } 2397 2398 static int 2399 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val) 2400 { 2401 struct svm_softc *sc = arg; 2402 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); 2403 2404 uint32_t dirty = 0; 2405 uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty); 2406 if (msrp == NULL) { 2407 return (EINVAL); 2408 } 2409 switch (msr) { 2410 case MSR_EFER: 2411 /* 2412 * For now, just clone the logic from 2413 * svm_setreg(): 2414 * 2415 * EFER_SVM must always be set when the guest is 2416 * executing 2417 */ 2418 *msrp = val | EFER_SVM; 2419 break; 2420 /* TODO: other necessary MSR masking */ 2421 default: 2422 *msrp = val; 2423 break; 2424 } 2425 if (dirty != 0) { 2426 svm_set_dirty(sc, vcpu, dirty); 2427 } 2428 return (0); 2429 2430 } 2431 2432 static int 2433 svm_setcap(void *arg, int vcpu, int type, int val) 2434 { 2435 struct svm_softc *sc; 2436 int error; 2437 2438 sc = arg; 2439 error = 0; 2440 switch (type) { 2441 case VM_CAP_HALT_EXIT: 2442 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2443 VMCB_INTCPT_HLT, val); 2444 break; 2445 case VM_CAP_PAUSE_EXIT: 2446 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2447 VMCB_INTCPT_PAUSE, val); 2448 break; 2449 default: 2450 error = ENOENT; 2451 break; 2452 } 2453 return (error); 2454 } 2455 2456 static int 2457 svm_getcap(void *arg, int vcpu, int type, int *retval) 2458 { 2459 struct svm_softc *sc; 2460 int error; 2461 2462 sc = arg; 2463 error = 0; 2464 2465 switch (type) { 2466 case VM_CAP_HALT_EXIT: 2467 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2468 VMCB_INTCPT_HLT); 2469 break; 2470 case VM_CAP_PAUSE_EXIT: 2471 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2472 VMCB_INTCPT_PAUSE); 2473 break; 2474 default: 2475 error = ENOENT; 2476 break; 2477 } 2478 return (error); 2479 } 2480 2481 static struct vlapic * 2482 svm_vlapic_init(void *arg, int vcpuid) 2483 { 2484 struct svm_softc *svm_sc; 2485 struct vlapic *vlapic; 2486 2487 svm_sc = arg; 2488 vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP); 2489 vlapic->vm = svm_sc->vm; 2490 vlapic->vcpuid = vcpuid; 2491 vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; 2492 2493 vlapic_init(vlapic); 2494 2495 return (vlapic); 2496 } 2497 2498 static void 2499 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) 2500 { 2501 vlapic_cleanup(vlapic); 2502 kmem_free(vlapic, sizeof (struct vlapic)); 2503 } 2504 2505 static void 2506 svm_savectx(void *arg, int vcpu) 2507 { 2508 struct svm_softc *sc = arg; 2509 2510 if (sc->vcpu[vcpu].loaded) { 2511 svm_msr_guest_exit(sc, vcpu); 2512 } 2513 } 2514 2515 static void 2516 svm_restorectx(void *arg, int vcpu) 2517 { 2518 struct svm_softc *sc = arg; 2519 2520 if (sc->vcpu[vcpu].loaded) { 2521 svm_msr_guest_enter(sc, vcpu); 2522 } 2523 } 2524 2525 struct vmm_ops vmm_ops_amd = { 2526 .init = svm_init, 2527 .cleanup = svm_cleanup, 2528 .resume = svm_restore, 2529 2530 .vminit = svm_vminit, 2531 .vmrun = svm_vmrun, 2532 .vmcleanup = svm_vmcleanup, 2533 .vmgetreg = svm_getreg, 2534 .vmsetreg = svm_setreg, 2535 .vmgetdesc = svm_getdesc, 2536 .vmsetdesc = svm_setdesc, 2537 .vmgetcap = svm_getcap, 2538 .vmsetcap = svm_setcap, 2539 .vlapic_init = svm_vlapic_init, 2540 .vlapic_cleanup = svm_vlapic_cleanup, 2541 2542 .vmsavectx = svm_savectx, 2543 .vmrestorectx = svm_restorectx, 2544 2545 .vmgetmsr = svm_get_msr, 2546 .vmsetmsr = svm_set_msr, 2547 }; 2548