/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/cpu.h>

#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/trap.h>

#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_kernel.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "svm_msr.h"

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Guardrails for supported guest TSC frequencies.
 *
 * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs, and
 * a maximum ratio of (15 * host frequency), which is sufficient to prevent
 * overflowing frequency calculations and give plenty of bandwidth for future
 * CPU frequency increases.
 */
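/*
 * Worked example of the guardrails above (host frequency chosen purely for
 * illustration): on a 2 GHz host, guest TSC frequencies from 500 MHz up to
 * 15 * 2 GHz = 30 GHz are permitted.
 */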
#define	AMD_TSC_MIN_FREQ	500000000
#define	AMD_TSC_MAX_FREQ_RATIO	15

/* SVM features advertised by CPUID.8000000AH:EDX */
static uint32_t svm_feature = 0;

static int disable_npf_assist;

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
static void flush_asid(struct svm_softc *sc, int vcpuid);

static __inline bool
has_flush_by_asid(void)
{
	return ((svm_feature & CPUID_AMD_EDX_FLUSH_ASID) != 0);
}

static __inline bool
has_lbr_virt(void)
{
	return ((svm_feature & CPUID_AMD_EDX_LBR_VIRT) != 0);
}

static __inline bool
has_decode_assist(void)
{
	return ((svm_feature & CPUID_AMD_EDX_DECODE_ASSISTS) != 0);
}

static __inline bool
has_tsc_freq_ctl(void)
{
	return ((svm_feature & CPUID_AMD_EDX_TSC_RATE_MSR) != 0);
}

static int
svm_cleanup(void)
{
	/* This is taken care of by the hma registration */
	return (0);
}

static int
svm_init(void)
{
	/* Grab a (bhyve) local copy of the SVM feature bits */
	struct cpuid_regs regs = {
		.cp_eax = 0x8000000a,
	};
	(void) cpuid_insn(NULL, &regs);
	svm_feature = regs.cp_edx;

	/*
	 * HMA should have already checked for these features, which we refuse
	 * to operate without, but there is no harm in making sure.
	 */
	const uint32_t demand_bits =
	    (CPUID_AMD_EDX_NESTED_PAGING | CPUID_AMD_EDX_NRIPS);
	VERIFY((svm_feature & demand_bits) == demand_bits);

	return (0);
}

static void
svm_restore(void)
{
	/* No-op on illumos */
}

/* Pentium compatible MSRs */
#define	MSR_PENTIUM_START	0
#define	MSR_PENTIUM_END		0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define	MSR_AMD6TH_START	0xC0000000UL
#define	MSR_AMD6TH_END		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define	MSR_AMD7TH_START	0xC0010000UL
#define	MSR_AMD7TH_END		0xC0011FFFUL

/*
 * Get the index and bit position for an MSR in the permission bitmap.
 * Two bits are used for each MSR: the lower bit for read and the higher bit
 * for write.
 */
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	uint32_t base, off;

	*index = -1;
	*bit = (msr % 4) * 2;
	base = 0;

	if (msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}

	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		off = (msr - MSR_AMD6TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		off = (msr - MSR_AMD7TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	return (EINVAL);
}
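
/*
 * Worked example of the calculation above (values shown purely for
 * illustration): MSR_LSTAR (0xC0000082) falls in the AMD 6th-generation
 * range, so off = 0x82 and base = 0x2000, giving
 * index = (0x82 + 0x2000) / 4 = 0x820 and bit = (0x82 % 4) * 2 = 4;
 * bit 4 of byte 0x820 then controls reads and bit 5 controls writes.
 */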

/*
 * Allow the vcpu to read or write the 'msr' without trapping into the
 * hypervisor.
 */
static void
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
	int index, bit, error;

	error = svm_msr_index(msr, &index, &bit);
	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
	    "msr %lx", __func__, bit, msr));

	if (read)
		perm_bitmap[index] &= ~(1UL << bit);

	if (write)
		perm_bitmap[index] &= ~(2UL << bit);
}

static void
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	svm_msr_perm(perm_bitmap, msr, true, true);
}

static void
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	svm_msr_perm(perm_bitmap, msr, true, false);
}

int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}

void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
    int enabled)
{
	struct vmcb_ctrl *ctrl;
	uint32_t oldval;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intercept[idx];

	if (enabled)
		ctrl->intercept[idx] |= bitmask;
	else
		ctrl->intercept[idx] &= ~bitmask;

	if (ctrl->intercept[idx] != oldval) {
		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
	}
}

static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
    uint64_t msrpm_base_pa, uint64_t np_pml4)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	uint32_t mask;
	int n;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	ctrl->iopm_base_pa = iopm_base_pa;
	ctrl->msrpm_base_pa = msrpm_base_pa;

	/* Enable nested paging */
	ctrl->np_ctrl = NP_ENABLE;
	ctrl->n_cr3 = np_pml4;

	/*
	 * Intercept accesses to the control registers that are not shadowed
	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
	 */
	for (n = 0; n < 16; n++) {
		mask = (BIT(n) << 16) | BIT(n);
		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		else
			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
	}

	/*
	 * Selectively intercept writes to %cr0.  This triggers on operations
	 * which would change bits other than TS or MP.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_CR0_WRITE);

	/*
	 * Intercept everything when tracing guest exceptions, otherwise just
	 * intercept the machine check exception.
	 */
	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
		for (n = 0; n < 32; n++) {
			/*
			 * Skip unimplemented vectors in the exception bitmap.
			 */
			if (n == 2 || n == 9) {
				continue;
			}
			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
		}
	} else {
		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
	}

	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_RDPMC);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_FERR_FREEZE);

	/* Enable exit-on-hlt by default */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);

	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);

	/* Intercept privileged invalidation instructions. */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);

	/*
	 * Intercept all virtualization-related instructions.
	 *
	 * From section "Canonicalization and Consistency Checks" in APMv2
	 * the VMRUN intercept bit must be set to pass the consistency check.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
		    VMCB_INTCPT_WBINVD);
	}

	/*
	 * The ASID will be set to a non-zero value just before VMRUN.
	 */
	ctrl->asid = 0;

	/*
	 * Section 15.21.1, Interrupt Masking in EFLAGS
	 * Section 15.21.2, Virtualizing APIC.TPR
	 *
	 * This must be set for %rflag and %cr8 isolation of guest and host.
	 */
	ctrl->v_intr_ctrl |= V_INTR_MASKING;

	/* Enable Last Branch Record aka LBR-virt (if available) */
	if (has_lbr_virt()) {
		ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
	}

	/* EFER_SVM must always be set when the guest is executing */
	state->efer = EFER_SVM;

	/* Set up the PAT to power-on state */
	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	/* Set up DR6/7 to power-on state */
	state->dr6 = DBREG_DR6_RESERVED1;
	state->dr7 = DBREG_DR7_RESERVED1;
}

/*
 * Initialize a virtual machine.
 */
static void *
svm_vminit(struct vm *vm)
{
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpu;
	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
	int i;
	uint16_t maxcpus;

	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);

	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
	if (svm_sc->msr_bitmap == NULL)
		panic("contigmalloc of SVM MSR bitmap failed");
	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
	if (svm_sc->iopm_bitmap == NULL)
		panic("contigmalloc of SVM IO bitmap failed");

	svm_sc->vm = vm;
	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));

	/*
	 * Intercept read and write accesses to all MSRs.
	 */
	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);

	/*
	 * Access to the following MSRs is redirected to the VMCB when the
	 * guest is executing.  Therefore it is safe to allow the guest to
	 * read/write these MSRs directly without hypervisor involvement.
	 */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);

	/*
	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
	 */
	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

	/* Intercept access to all I/O ports. */
	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);

	iopm_pa = vtophys(svm_sc->iopm_bitmap);
	msrpm_pa = vtophys(svm_sc->msr_bitmap);
	pml4_pa = svm_sc->nptp;
	maxcpus = vm_get_maxcpus(svm_sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = svm_get_vcpu(svm_sc, i);
		vcpu->nextrip = ~0;
		vcpu->lastcpu = NOCPU;
		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
		svm_msr_guest_init(svm_sc, i);
	}

	svm_pmu_init(svm_sc);

	return (svm_sc);
}

/*
 * Collateral for a generic SVM VM-exit.
 */
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{
	vme->exitcode = VM_EXITCODE_SVM;
	vme->u.svm.exitcode = code;
	vme->u.svm.exitinfo1 = info1;
	vme->u.svm.exitinfo2 = info2;
}

static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
	struct vmcb_state *state;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
		struct vmcb_segment *seg;

		/*
		 * Section 4.8.1 of APM2: check if the Code Segment has the
		 * Long attribute set in its descriptor.
		 */
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		if (seg->attrib & VMCB_CS_ATTRIB_L)
			return (CPU_MODE_64BIT);
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{
	if ((cr0 & CR0_PG) == 0)
		return (PAGING_MODE_FLAT);
	if ((cr4 & CR4_PAE) == 0)
		return (PAGING_MODE_32);
	if (efer & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = state->cpl;
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	    state->efer);
}

#define	UNHANDLED 0

/*
 * Handle guest I/O intercept.
 */
static int
svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct vm_inout *inout;
	struct vie *vie;
	uint64_t info1;
	struct vm_guest_paging paging;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	inout = &vmexit->u.inout;
	info1 = ctrl->exitinfo1;

	inout->bytes = (info1 >> 4) & 0x7;
	inout->flags = 0;
	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
	inout->port = (uint16_t)(info1 >> 16);
	inout->eax = (uint32_t)(state->rax);

	/*
	 * We'll always need paging and vie info, even if we bail out early
	 * due to missing DecodeAssist.
	 */
	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
	vie = vm_vie_ctx(svm_sc->vm, vcpu);

	if ((inout->flags & INOUT_STR) != 0) {
		/*
		 * The effective segment number in EXITINFO1[12:10] is
		 * populated only if the processor has the DecodeAssist
		 * capability.
		 *
		 * This is not specified explicitly in APMv2 but can be
		 * verified empirically.
		 */
		if (!has_decode_assist()) {
			/*
			 * Without decoding assistance, force the task of
			 * emulating the ins/outs onto userspace.
			 */
			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
			bzero(&vmexit->u.inst_emul,
			    sizeof (vmexit->u.inst_emul));
			vie_init_other(vie, &paging);
			return (UNHANDLED);
		}

		/*
		 * Bits 7-9 encode the address size of ins/outs operations
		 * where the 1/2/4 values correspond to 16/32/64 bit sizes.
		 */
		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
		    inout->addrsize == 8);

		if (inout->flags & INOUT_IN) {
			/*
			 * For INS instructions, %es (encoded as 0) is the
			 * implied segment for the operation.
			 */
			inout->segment = 0;
		} else {
			/*
			 * Bits 10-12 encode the segment for OUTS.
			 * This value follows the standard x86 segment order.
			 */
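			/*
			 * (For reference, that order is: %es = 0, %cs = 1,
			 * %ss = 2, %ds = 3, %fs = 4, %gs = 5.)
			 */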
			inout->segment = (info1 >> 10) & 0x7;
		}
	}

	vmexit->exitcode = VM_EXITCODE_INOUT;
	vie_init_inout(vie, inout, vmexit->inst_length, &paging);

	/* The in/out emulation will handle advancing %rip */
	vmexit->inst_length = 0;

	return (UNHANDLED);
}

static int
npf_fault_type(uint64_t exitinfo1)
{
	if (exitinfo1 & VMCB_NPF_INFO1_W)
		return (PROT_WRITE);
	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
		return (PROT_EXEC);
	else
		return (PROT_READ);
}

static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{
	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
		return (false);
	}

	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
		return (false);
	}

	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
		return (false);
	}

	return (true);
}

static void
svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
    uint64_t gpa)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb *vmcb;
	struct vie *vie;
	struct vm_guest_paging paging;
	struct vmcb_segment *seg;
	char *inst_bytes = NULL;
	uint8_t inst_len = 0;

	vmcb = svm_get_vmcb(svm_sc, vcpu);
	ctrl = &vmcb->ctrl;

	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
	vmexit->u.mmio_emul.gpa = gpa;
	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
	svm_paging_info(vmcb, &paging);

	switch (paging.cpu_mode) {
	case CPU_MODE_REAL:
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		vmexit->u.mmio_emul.cs_base = seg->base;
		vmexit->u.mmio_emul.cs_d = 0;
		break;
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		vmexit->u.mmio_emul.cs_base = seg->base;

		/*
		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
		 */
		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
		    1 : 0;
		break;
	default:
		vmexit->u.mmio_emul.cs_base = 0;
		vmexit->u.mmio_emul.cs_d = 0;
		break;
	}

	/*
	 * Copy the instruction bytes into 'vie' if available.
	 */
	if (has_decode_assist() && !disable_npf_assist) {
		inst_len = ctrl->inst_len;
		inst_bytes = (char *)ctrl->inst_bytes;
	}
	vie = vm_vie_ctx(svm_sc->vm, vcpu);
	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
}

/*
 * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
 * which is live in the guest.  They are visible via the shadow instead.
 */
#define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)

static void
svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
{
	struct vmcb_state *state;
	struct svm_regctx *regctx;
	uint64_t masked, old, diff;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	regctx = svm_get_guest_regctx(svm_sc, vcpu);

	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
	diff = old ^ val;

	/* No further work needed if register contents remain the same */
	if (diff == 0) {
		return;
	}

	/* Flush the TLB if the paging or write-protect bits are changing */
	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
		flush_asid(svm_sc, vcpu);
	}

	/*
	 * If the change in %cr0 is due to a guest action (via interception)
	 * then other CPU state updates may be required.
	 */
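	/*
	 * (Architecturally, EFER.LMA = EFER.LME & CR0.PG, which is why a
	 * guest-initiated toggle of CR0.PG while EFER.LME is set must be
	 * reflected in EFER.LMA below.)
	 */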
	if (guest_write) {
		if ((diff & CR0_PG) != 0) {
			uint64_t efer = state->efer;

			/* Keep the long-mode state in EFER in sync */
			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
				state->efer |= EFER_LMA;
			}
			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
				state->efer &= ~EFER_LMA;
			}
		}
	}

	masked = val & SVM_CR0_MASK;
	regctx->sctx_cr0_shadow = val;
	state->cr0 = masked;
	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);

	if ((masked ^ val) != 0) {
		/*
		 * The guest has set bits in %cr0 which we are masking out and
		 * exposing via shadow.
		 *
		 * We must intercept %cr0 reads in order to make the shadowed
		 * view available to the guest.
		 *
		 * Writes to %cr0 must also be intercepted (unconditionally,
		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
		 * if/when the guest clears those shadowed bits.
		 */
		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
		    BIT(0) | BIT(16));
	} else {
		/*
		 * When no bits remain in %cr0 which require shadowing, the
		 * unconditional intercept of reads/writes to %cr0 can be
		 * disabled.
		 *
		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE)
		 * remains in place so we can be notified of operations which
		 * change bits other than TS or MP.
		 */
		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
		    BIT(0) | BIT(16));
	}
	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
}

static void
svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
{
	struct vmcb *vmcb;
	struct svm_regctx *regctx;

	vmcb = svm_get_vmcb(svm_sc, vcpu);
	regctx = svm_get_guest_regctx(svm_sc, vcpu);

	/*
	 * Include the %cr0 bits which exist only in the shadow along with
	 * those in the running vCPU state.
	 */
	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
}

static void
svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
{
	uint64_t val;
	int err __maybe_unused;

	svm_get_cr0(svm_sc, vcpu, &val);
	err = svm_setreg(svm_sc, vcpu, reg, val);
	ASSERT(err == 0);
}

static void
svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
{
	struct vmcb_state *state;
	uint64_t val;
	int err __maybe_unused;

	state = svm_get_vmcb_state(svm_sc, vcpu);

	err = svm_getreg(svm_sc, vcpu, reg, &val);
	ASSERT(err == 0);

	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
		/* NW without CD is nonsensical */
		vm_inject_gp(svm_sc->vm, vcpu);
		return;
	}
	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
		/* PG requires PE */
		vm_inject_gp(svm_sc->vm, vcpu);
		return;
	}
	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
		/*
		 * When enabling paging, PAE must be enabled if LME is.
		 */
		if ((state->efer & EFER_LME) != 0 &&
		    (state->cr4 & CR4_PAE) == 0) {
			vm_inject_gp(svm_sc->vm, vcpu);
			return;
		}
	}

	svm_set_cr0(svm_sc, vcpu, val, true);
}

static void
svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vie *vie;
	struct vm_guest_paging paging;

	/* Let the instruction emulation (hopefully in-kernel) handle it */
	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
	vie = vm_vie_ctx(svm_sc->vm, vcpu);
	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
	vie_init_other(vie, &paging);

	/* The instruction emulation will handle advancing %rip */
	vmexit->inst_length = 0;
}

static void
svm_update_virqinfo(struct svm_softc *sc, int vcpu)
{
	struct vm *vm;
	struct vlapic *vlapic;
	struct vmcb_ctrl *ctrl;

	vm = sc->vm;
	vlapic = vm_lapic(vm, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	/* Update %cr8 in the emulated vlapic */
	vlapic_set_cr8(vlapic, ctrl->v_tpr);

	/* Virtual interrupt injection is not used. */
	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
}

CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);

/*
 * Store SVM-specific event injection info for later handling.  This depends on
 * the bhyve-internal event definitions matching those in the VMCB, as ensured
 * by the above CTASSERTs.
 */
static void
svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
{
	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));

	/*
	 * If stashing an NMI pending injection, ensure that it bears the
	 * correct vector which exit_intinfo expects.
	 */
	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
		intinfo &= ~VM_INTINFO_MASK_VECTOR;
		intinfo |= IDT_NMI;
	}

	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
}

static void
svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	uint64_t intinfo = ctrl->exitintinfo;

	if (VMCB_EXITINTINFO_VALID(intinfo)) {
		/*
		 * If a #VMEXIT happened during event delivery then record the
		 * event that was being delivered.
		 */
		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);

		svm_stash_intinfo(svm_sc, vcpu, intinfo);
	}
}

static __inline int
vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
{
	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_VINTR));
}

static void
svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
		    ("%s: invalid v_ign_tpr", __func__));
		KASSERT(vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be enabled", __func__));
		return;
	}

	/*
	 * We use V_IRQ in conjunction with the VINTR intercept to trap into
	 * the hypervisor as soon as a virtual interrupt can be delivered.
	 *
	 * Since injected events are not subject to intercept checks we need to
	 * ensure that the V_IRQ is not actually going to be delivered on VM
	 * entry.
	 */
	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);

	ctrl->v_irq |= V_IRQ;
	ctrl->v_intr_prio |= V_IGN_TPR;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

static void
svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
		KASSERT(!vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be disabled", __func__));
		return;
	}

	ctrl->v_irq &= ~V_IRQ;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

/*
 * Once an NMI is injected it blocks delivery of further NMIs until the handler
 * executes an IRET.  The IRET intercept is enabled when an NMI is injected to
 * track when the vcpu is done handling the NMI.
 */
static int
svm_nmi_blocked(struct svm_softc *sc, int vcpu)
{
	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_IRET));
}

static void
svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
	/*
	 * When the IRET intercept is cleared the vcpu will attempt to execute
	 * the "iret" when it runs next.  However, it is possible to inject
	 * another NMI into the vcpu before the "iret" has actually executed.
	 *
	 * For example, if the "iret" encounters a #NPF when accessing the
	 * stack it will trap back into the hypervisor.  If an NMI is pending
	 * for the vcpu it will be injected into the guest.
	 *
	 * XXX this needs to be fixed
	 */
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);

	/*
	 * Set an interrupt shadow to prevent an NMI from being immediately
	 * injected on the next VMRUN.
	 */
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	ctrl->intr_shadow = 1;
}

static void
svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
{
	ASSERT(VM_INTINFO_PENDING(info));

	uint8_t vector = VM_INTINFO_VECTOR(info);
	uint32_t type = VM_INTINFO_TYPE(info);

	/*
	 * Correct behavior depends on bhyve intinfo event types lining up with
	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
	 * above svm_save_exitintinfo() ensure it.
	 */
	switch (type) {
	case VM_INTINFO_NMI:
		/* Ensure vector for injected event matches its type (NMI) */
		vector = IDT_NMI;
		break;
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
		break;
	case VM_INTINFO_HWEXCP:
		if (vector == IDT_NMI) {
			/*
			 * NMIs are expected to be injected with
			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
			 * with the NMI vector.
			 */
			type = VM_INTINFO_NMI;
		}
		VERIFY(vector < 32);
		break;
	default:
		/*
		 * Since there is no strong validation for injected event types
		 * at this point, fall back to software interrupt for those we
		 * do not recognize.
		 */
		type = VM_INTINFO_SWINTR;
		break;
	}

	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
	if (VM_INTINFO_HAS_ERRCODE(info)) {
		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
	}
}
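
/*
 * For reference (the VMCB EVENTINJ layout per the AMD APM, noted here for
 * clarity rather than taken from this file): bits 7:0 hold the vector,
 * bits 10:8 the event type, bit 11 the error-code-valid flag, bit 31 the
 * valid flag, and bits 63:32 the error code.  This is why svm_inject_event()
 * can simply OR the type and vector together and shift the error code up
 * by 32.
 */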

static void
svm_inject_nmi(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	ASSERT(!svm_nmi_blocked(sc, vcpu));

	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
	vm_nmi_clear(sc->vm, vcpu);

	/*
	 * Virtual NMI blocking is now in effect.
	 *
	 * Not only does this block a subsequent NMI injection from taking
	 * place, it also configures an intercept on the IRET so we can track
	 * when the next injection can take place.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
}

static void
svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	ASSERT(vector >= 0 && vector <= 255);

	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
}

#define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL

static vm_msr_result_t
svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
{
	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
	uint64_t lma;
	int error;

	newval &= ~0xFE;	/* clear the Read-As-Zero (RAZ) bits */

	if (newval & EFER_MBZ_BITS) {
		return (VMR_GP);
	}

	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
	const uint64_t changed = state->efer ^ newval;
	if (changed & EFER_LME) {
		if (state->cr0 & CR0_PG) {
			return (VMR_GP);
		}
	}

	/* EFER.LMA = EFER.LME & CR0.PG */
	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
		lma = EFER_LMA;
	} else {
		lma = 0;
	}
	if ((newval & EFER_LMA) != lma) {
		return (VMR_GP);
	}

	if ((newval & EFER_NXE) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
		return (VMR_GP);
	}
	if ((newval & EFER_FFXSR) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
		return (VMR_GP);
	}
	if ((newval & EFER_TCE) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
		return (VMR_GP);
	}

	/*
	 * Until bhyve has proper support for long-mode segment limits, just
	 * toss a #GP at the guest if they attempt to use it.
	 */
	if (newval & EFER_LMSLE) {
		return (VMR_GP);
	}

	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
	VERIFY0(error);
	return (VMR_OK);
}

static int
svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
    bool is_wrmsr)
{
	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
	const uint32_t ecx = ctx->sctx_rcx;
	vm_msr_result_t res;
	uint64_t val = 0;

	if (is_wrmsr) {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;

		if (vlapic_owned_msr(ecx)) {
			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);

			res = vlapic_wrmsr(vlapic, ecx, val);
		} else if (ecx == MSR_EFER) {
			res = svm_write_efer(svm_sc, vcpu, val);
		} else if (svm_pmu_owned_msr(ecx)) {
			res = svm_pmu_wrmsr(svm_sc, vcpu, ecx, val);
		} else {
			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
		}
	} else {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);

		if (vlapic_owned_msr(ecx)) {
			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);

			res = vlapic_rdmsr(vlapic, ecx, &val);
		} else if (svm_pmu_owned_msr(ecx)) {
			res = svm_pmu_rdmsr(svm_sc, vcpu, ecx, &val);
		} else {
			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
		}
	}

	switch (res) {
	case VMR_OK:
		/* Store rdmsr result in the appropriate registers */
		if (!is_wrmsr) {
			state->rax = (uint32_t)val;
			ctx->sctx_rdx = val >> 32;
		}
		return (1);
	case VMR_GP:
		vm_inject_gp(svm_sc->vm, vcpu);
		return (1);
	case VMR_UNHANLDED:
		vmexit->exitcode = is_wrmsr ?
		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
		vmexit->u.msr.code = ecx;
		vmexit->u.msr.wval = val;
		return (0);
	default:
		panic("unexpected msr result %u\n", res);
	}
}

static void
svm_handle_rdpmc(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
	const uint32_t ecx = ctx->sctx_rcx;
	uint64_t val = 0;

	if (svm_pmu_rdpmc(svm_sc, vcpu, ecx, &val)) {
		state->rax = (uint32_t)val;
		ctx->sctx_rdx = val >> 32;
	} else {
		vm_inject_gp(svm_sc->vm, vcpu);
	}
}

/*
 * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
 * that are due to instruction intercepts as well as MSR and IOIO intercepts
 * and exceptions caused by INT3, INTO and BOUND instructions.
 *
 * Return 1 if the nRIP is valid and 0 otherwise.
 */
static int
nrip_valid(uint64_t exitcode)
{
	switch (exitcode) {
	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
	case 0x43:		/* INT3 */
	case 0x44:		/* INTO */
	case 0x45:		/* BOUND */
	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
		return (1);
	default:
		return (0);
	}
}

static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2;
	int handled;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;

	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
	 * in an inconsistent state and can trigger assertions that would
	 * never happen otherwise.
	 */
	if (code == VMCB_EXIT_INVALID) {
		vm_exit_svm(vmexit, code, info1, info2);
		return (0);
	}

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
	    "injection valid bit is set %lx", __func__, ctrl->eventinj));

	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
	    vmexit->inst_length, code, info1, info2));

	svm_update_virqinfo(svm_sc, vcpu);
	svm_save_exitintinfo(svm_sc, vcpu);

	switch (code) {
	case VMCB_EXIT_CR0_READ:
		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
			svm_handle_cr0_read(svm_sc, vcpu,
			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
			handled = 1;
		} else {
			/*
			 * If SMSW is used to read the contents of %cr0, then
			 * the VALID bit will not be set in `info1`, since the
			 * handling is different from the mov-to-reg case.
			 *
			 * Punt to the instruction emulation to handle it.
			 */
			svm_inst_emul_other(svm_sc, vcpu, vmexit);
		}
		break;
	case VMCB_EXIT_CR0_WRITE:
	case VMCB_EXIT_CR0_SEL_WRITE:
		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
			svm_handle_cr0_write(svm_sc, vcpu,
			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
			handled = 1;
		} else {
			/*
			 * Writes to %cr0 without VALID being set in `info1` are
			 * initiated by the LMSW and CLTS instructions.  While
			 * LMSW (like SMSW) sees little use in modern OSes and
			 * bootloaders, CLTS is still used for handling FPU
			 * state transitions.
			 *
			 * Punt to the instruction emulation to handle them.
			 */
			svm_inst_emul_other(svm_sc, vcpu, vmexit);
		}
		break;
	case VMCB_EXIT_IRET:
		/*
		 * Restart execution at "iret" but with the intercept cleared.
		 */
		vmexit->inst_length = 0;
		svm_clear_nmi_blocking(svm_sc, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
		svm_disable_intr_window_exiting(svm_sc, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_INTR:	/* external interrupt */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
		handled = 1;
		break;
	case VMCB_EXIT_NMI:
	case VMCB_EXIT_SMI:
	case VMCB_EXIT_INIT:
		/*
		 * For external NMI/SMI and physical INIT interrupts, simply
		 * continue execution, as those host events will be handled by
		 * the physical CPU.
		 */
		handled = 1;
		break;
	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);

		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
		uint32_t errcode = 0;
		bool reflect = true;
		bool errcode_valid = false;

		switch (idtvec) {
		case IDT_MC:
			/* The host will handle the MCE itself. */
			reflect = false;
			vmm_call_trap(T_MCE);
			break;
		case IDT_PF:
			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
			    info2));
			/* fallthru */
		case IDT_NP:
		case IDT_SS:
		case IDT_GP:
		case IDT_AC:
		case IDT_TS:
			errcode_valid = true;
			errcode = info1;
			break;

		case IDT_DF:
			errcode_valid = true;
			break;

		case IDT_BP:
		case IDT_OF:
		case IDT_BR:
			/*
			 * The 'nrip' field is populated for INT3, INTO and
			 * BOUND exceptions and this also implies that
			 * 'inst_length' is non-zero.
			 *
			 * Reset 'inst_length' to zero so the guest %rip at
			 * event injection is identical to what it was when
			 * the exception originally happened.
			 */
			vmexit->inst_length = 0;
			/* fallthru */
		default:
			errcode_valid = false;
			break;
		}
		VERIFY0(vmexit->inst_length);

		if (reflect) {
			/* Reflect the exception back into the guest */
			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
			    errcode_valid, errcode, false));
		}
		handled = 1;
		break;
	}
	case VMCB_EXIT_MSR:
		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
		break;
	case VMCB_EXIT_RDPMC:
		svm_handle_rdpmc(svm_sc, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_IO:
		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
		break;
	case VMCB_EXIT_SHUTDOWN:
		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_INVLPGA:
		/* privileged invalidation instructions */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_VMRUN:
	case VMCB_EXIT_VMLOAD:
	case VMCB_EXIT_VMSAVE:
	case VMCB_EXIT_STGI:
	case VMCB_EXIT_CLGI:
	case VMCB_EXIT_SKINIT:
		/* privileged vmm instructions */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_INVD:
	case VMCB_EXIT_WBINVD:
		/* ignore exit */
		handled = 1;
		break;
	case VMCB_EXIT_VMMCALL:
		/* No handlers make use of VMMCALL for now */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_CPUID:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
		vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
		handled = 1;
		break;
	case VMCB_EXIT_HLT:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = state->rflags;
		break;
	case VMCB_EXIT_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
		break;
	case VMCB_EXIT_NPF:
		/* EXITINFO2 contains the faulting guest physical address */
		if (info1 & VMCB_NPF_INFO1_RSV) {
			/* nested fault with reserved bits set */
		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = info2;
			vmexit->u.paging.fault_type = npf_fault_type(info1);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
		} else if (svm_npf_emul_fault(info1)) {
			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
		}
		break;
	case VMCB_EXIT_MONITOR:
		vmexit->exitcode = VM_EXITCODE_MONITOR;
		break;
	case VMCB_EXIT_MWAIT:
		vmexit->exitcode = VM_EXITCODE_MWAIT;
		break;
	default:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
	    code);

	if (handled) {
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
		state->rip = vmexit->rip;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic SVM exit.
			 */
			vm_exit_svm(vmexit, code, info1, info2);
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}

/*
 * Inject exceptions, NMIs, and ExtINTs.
 *
 * The logic behind these is complicated and may involve mutex contention, so
 * the injection is performed without the protection of host CPU interrupts
 * being disabled.  This means a racing notification could be "lost",
 * necessitating a later call to svm_inject_recheck() to close that window
 * of opportunity.
 */
static enum event_inject_state
svm_inject_events(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_vcpu *vcpustate;
	uint64_t intinfo;
	enum event_inject_state ev_state;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	vcpustate = svm_get_vcpu(sc, vcpu);
	ev_state = EIS_CAN_INJECT;

	/* Clear any interrupt shadow if guest %rip has changed */
	if (vcpustate->nextrip != state->rip) {
		ctrl->intr_shadow = 0;
	}

	/*
	 * An event is already pending for injection.  This can occur when the
	 * vCPU exits prior to VM entry (like for an AST).
	 */
	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
	}

	/*
	 * Inject pending events or exceptions for this vcpu.
	 *
	 * An event might be pending because the previous #VMEXIT happened
	 * during event delivery (i.e. ctrl->exitintinfo).
	 *
	 * An event might also be pending because an exception was injected
	 * by the hypervisor (e.g. #PF during instruction emulation).
	 */
	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
		svm_inject_event(ctrl, intinfo);
		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
		ev_state = EIS_EV_INJECTED;
	}

	/* NMI event has priority over interrupts. */
	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
		if (ev_state == EIS_CAN_INJECT) {
			/* Can't inject NMI if vcpu is in an intr_shadow. */
			if (ctrl->intr_shadow) {
				return (EIS_GI_BLOCK);
			}

			svm_inject_nmi(sc, vcpu);
			ev_state = EIS_EV_INJECTED;
		} else {
			return (ev_state | EIS_REQ_EXIT);
		}
	}

	if (vm_extint_pending(sc->vm, vcpu)) {
		int vector;

		if (ev_state != EIS_CAN_INJECT) {
			return (ev_state | EIS_REQ_EXIT);
		}

		/*
		 * If the guest has disabled interrupts or is in an interrupt
		 * shadow then we cannot inject the pending interrupt.
		 */
		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
			return (EIS_GI_BLOCK);
		}

		/* Ask the legacy pic for a vector to inject */
		vatpic_pending_intr(sc->vm, &vector);
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from INTR", vector));

		svm_inject_irq(sc, vcpu, vector);
		vm_extint_clear(sc->vm, vcpu);
		vatpic_intr_accepted(sc->vm, vector);
		ev_state = EIS_EV_INJECTED;
	}

	return (ev_state);
}

/*
 * Synchronize vLAPIC state and inject any interrupts pending on it.
 *
 * This is done with host CPU interrupts disabled so notification IPIs will be
 * queued on the host APIC and recognized when entering SVM guest context.
 */
static enum event_inject_state
svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
    enum event_inject_state ev_state)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	int vector;
	uint8_t v_tpr;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	/*
	 * The guest can modify the TPR by writing to %cr8.  In guest mode the
	 * CPU reflects this write to V_TPR without hypervisor intervention.
	 *
	 * The guest can also modify the TPR by writing to it via the memory
	 * mapped APIC page.  In this case, the write will be emulated by the
	 * hypervisor.  For this reason V_TPR must be updated before every
	 * VMRUN.
	 */
	v_tpr = vlapic_get_cr8(vlapic);
	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
	if (ctrl->v_tpr != v_tpr) {
		ctrl->v_tpr = v_tpr;
		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	}

	/* If an event cannot otherwise be injected, we are done for now */
	if (ev_state != EIS_CAN_INJECT) {
		return (ev_state);
	}

	if (!vlapic_pending_intr(vlapic, &vector)) {
		return (EIS_CAN_INJECT);
	}
	KASSERT(vector >= 16 && vector <= 255,
	    ("invalid vector %d from local APIC", vector));

	/*
	 * If the guest has disabled interrupts or is in an interrupt shadow
	 * then we cannot inject the pending interrupt.
	 */
	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
		return (EIS_GI_BLOCK);
	}

	svm_inject_irq(sc, vcpu, vector);
	vlapic_intr_accepted(vlapic, vector);
	return (EIS_EV_INJECTED);
}

/*
 * Re-check for events to be injected.
 *
 * Once host CPU interrupts are disabled, check for the presence of any events
 * which require injection processing.  If an exit is required upon injection,
 * or once the guest becomes interruptable, that will be configured too.
 */
static bool
svm_inject_recheck(struct svm_softc *sc, int vcpu,
    enum event_inject_state ev_state)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if (ev_state == EIS_CAN_INJECT) {
		/*
		 * An active interrupt shadow would preclude us from injecting
		 * any events picked up during a re-check.
		 */
		if (ctrl->intr_shadow != 0) {
			return (false);
		}

		if (vm_nmi_pending(sc->vm, vcpu) &&
		    !svm_nmi_blocked(sc, vcpu)) {
			/* queued NMI not blocked by NMI-window-exiting */
			return (true);
		}
		if (vm_extint_pending(sc->vm, vcpu)) {
			/* queued ExtINT not blocked by existing injection */
			return (true);
		}
	} else {
		if ((ev_state & EIS_REQ_EXIT) != 0) {
			/*
			 * Use a self-IPI to force an immediate exit after
			 * event injection has occurred.
			 */
			poke_cpu(CPU->cpu_id);
		} else {
			/*
			 * If any event is being injected, an exit immediately
			 * upon becoming interruptable again will allow pending
			 * or newly queued events to be injected in a timely
			 * manner.
			 */
			svm_enable_intr_window_exiting(sc, vcpu);
		}
	}
	return (false);
}


static void
check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
{
	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
	uint8_t flush;

	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
	    vcpustate->nptgen != nptgen);

	if (flush != VMCB_TLB_FLUSH_NOTHING) {
		ctrl->asid = vcpustate->hma_asid.hsa_asid;
		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
	}
	ctrl->tlb_ctrl = flush;
	vcpustate->nptgen = nptgen;
}

static void
flush_asid(struct svm_softc *sc, int vcpuid)
{
	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
	uint8_t flush;

	/* HMA ASID updates are expected to be done with interrupts disabled */
	const ulong_t iflag = intr_clear();
	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
	    true);
	intr_restore(iflag);

	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
	ctrl->asid = vcpustate->hma_asid.hsa_asid;
	ctrl->tlb_ctrl = flush;
	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
	/*
	 * A potential future optimization: We could choose to update the
	 * nptgen associated with the vCPU, since any pending nptgen change
	 * requiring a flush will be satisfied by the one which has just now
	 * been queued.
	 */
}

static __inline void
svm_dr_enter_guest(struct svm_regctx *gctx)
{
	/* Save host control debug registers. */
	gctx->host_dr7 = rdr7();
	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);

	/*
	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
	 * exceptions in the host based on the guest DRx values.  The
	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
	 * VMCB.
	 */
	load_dr7(0);
	wrmsr(MSR_DEBUGCTLMSR, 0);

	/* Save host debug registers. */
	gctx->host_dr0 = rdr0();
	gctx->host_dr1 = rdr1();
	gctx->host_dr2 = rdr2();
	gctx->host_dr3 = rdr3();
	gctx->host_dr6 = rdr6();

	/* Restore guest debug registers. */
	load_dr0(gctx->sctx_dr0);
	load_dr1(gctx->sctx_dr1);
	load_dr2(gctx->sctx_dr2);
	load_dr3(gctx->sctx_dr3);
}

static __inline void
svm_dr_leave_guest(struct svm_regctx *gctx)
{
	/* Save guest debug registers. */
	gctx->sctx_dr0 = rdr0();
	gctx->sctx_dr1 = rdr1();
	gctx->sctx_dr2 = rdr2();
	gctx->sctx_dr3 = rdr3();

	/*
	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
	 * last.
	 */
	load_dr0(gctx->host_dr0);
	load_dr1(gctx->host_dr1);
	load_dr2(gctx->host_dr2);
	load_dr3(gctx->host_dr3);
	load_dr6(gctx->host_dr6);
	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
	load_dr7(gctx->host_dr7);
}

/*
 * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU
 * offsets.
 */
static void
svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
{
	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);

	if (ctrl->tsc_offset != offset) {
		ctrl->tsc_offset = offset;
		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
	}
}

/*
 * Start vcpu with specified RIP.
 */
static int
svm_vmrun(void *arg, int vcpu, uint64_t rip)
{
	struct svm_regctx *gctx;
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpustate;
	struct vmcb_state *state;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	vm_client_t *vmc;
	struct vm *vm;
	uint64_t vmcb_pa;
	int handled;
	uint16_t ldt_sel;

	svm_sc = arg;
	vm = svm_sc->vm;

	vcpustate = svm_get_vcpu(svm_sc, vcpu);
	state = svm_get_vmcb_state(svm_sc, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	vlapic = vm_lapic(vm, vcpu);
	vmc = vm_get_vmclient(vm, vcpu);

	gctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;

	if (vcpustate->lastcpu != curcpu) {
		/*
		 * Force new ASID allocation by invalidating the generation.
		 */
		vcpustate->hma_asid.hsa_gen = 0;

		/*
		 * Invalidate the VMCB state cache by marking all fields dirty.
		 */
		svm_set_dirty(svm_sc, vcpu, 0xffffffff);

		/*
		 * XXX
		 * Setting 'vcpustate->lastcpu' here is a bit premature because
		 * we may return from this function without actually executing
		 * the VMRUN instruction.  This could happen if an AST or yield
		 * condition is pending on the first time through the loop.
		 *
		 * This works for now but any new side-effects of vcpu
		 * migration should take this case into account.
		 */
		vcpustate->lastcpu = curcpu;
		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
	}

	svm_apply_tsc_adjust(svm_sc, vcpu);

	svm_msr_guest_enter(svm_sc, vcpu);

	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
	vcpustate->loaded = B_TRUE;

	/* Update Guest RIP */
	state->rip = rip;

	do {
		enum event_inject_state inject_state;
		uint64_t nptgen;

		/*
		 * Initial event injection is complex and may involve mutex
		 * contention, so it must be performed with global interrupts
		 * still enabled.
		 */
		inject_state = svm_inject_events(svm_sc, vcpu);
		handled = 0;

		/*
		 * Disable interrupts while loading VM state and performing
		 * event injection.
		 */
		const ulong_t iflag = intr_clear();

		/*
		 * Synchronizing and injecting vlapic state is lock-free and is
		 * safe (and prudent) to perform with interrupts disabled.
		 */
		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
		    inject_state);

		/*
		 * Check for vCPU bail-out conditions.  This must be done after
		 * svm_inject_events() to detect a triple-fault condition.
		 */
		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
			intr_restore(iflag);
			break;
		}

		if (vcpu_run_state_pending(vm, vcpu)) {
			intr_restore(iflag);
			vm_exit_run_state(vm, vcpu, state->rip);
			break;
		}

		/*
		 * If subsequent activity queued events which require injection
		 * handling, take another lap to handle them.
		 */
1992 */ 1993 if (svm_inject_recheck(svm_sc, vcpu, inject_state)) { 1994 intr_restore(iflag); 1995 handled = 1; 1996 continue; 1997 } 1998 1999 /* 2000 * #VMEXIT resumes the host with the guest LDTR, so 2001 * save the current LDT selector so it can be restored 2002 * after an exit. The userspace hypervisor probably 2003 * doesn't use a LDT, but save and restore it to be 2004 * safe. 2005 */ 2006 ldt_sel = sldt(); 2007 2008 /* 2009 * Check the vmspace and ASID generations to ensure that the 2010 * vcpu does not use stale TLB mappings. 2011 */ 2012 nptgen = vmc_table_enter(vmc); 2013 check_asid(svm_sc, vcpu, curcpu, nptgen); 2014 2015 svm_pmu_enter(svm_sc, vcpu); 2016 vcpu_ustate_change(vm, vcpu, VU_RUN); 2017 svm_dr_enter_guest(gctx); 2018 svm_apply_dirty(svm_sc, vcpu); 2019 2020 /* 2021 * Perform VMRUN to enter guest context. 2022 * 2023 * This is done with the protection of clearing the GIF 2024 * (global interrupt flag) as required by SVM. 2025 */ 2026 hma_svm_gif_disable(); 2027 svm_launch(vmcb_pa, gctx, get_pcpu()); 2028 hma_svm_gif_enable(); 2029 2030 svm_dr_leave_guest(gctx); 2031 vcpu_ustate_change(vm, vcpu, VU_EMU_KERN); 2032 svm_pmu_exit(svm_sc, vcpu); 2033 2034 /* Restore host LDTR. */ 2035 lldt(ldt_sel); 2036 2037 /* 2038 * Re-enable interrupts now that necessary CPU state has been 2039 * restored. Subsequent logic may need to block. 2040 */ 2041 intr_restore(iflag); 2042 2043 vmc_table_exit(vmc); 2044 2045 /* Update 'nextrip' */ 2046 vcpustate->nextrip = state->rip; 2047 2048 /* Handle #VMEXIT and if required return to user space. */ 2049 handled = svm_vmexit(svm_sc, vcpu, vmexit); 2050 } while (handled); 2051 2052 svm_msr_guest_exit(svm_sc, vcpu); 2053 2054 ASSERT(interrupts_enabled()); 2055 VERIFY(vcpustate->loaded && curthread->t_preempt != 0); 2056 vcpustate->loaded = B_FALSE; 2057 2058 return (0); 2059 } 2060 2061 static void 2062 svm_vmcleanup(void *arg) 2063 { 2064 struct svm_softc *sc = arg; 2065 2066 vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE); 2067 vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE); 2068 kmem_free(sc, sizeof (*sc)); 2069 } 2070 2071 static uint64_t * 2072 swctx_regptr(struct svm_regctx *regctx, int reg) 2073 { 2074 switch (reg) { 2075 case VM_REG_GUEST_RBX: 2076 return (®ctx->sctx_rbx); 2077 case VM_REG_GUEST_RCX: 2078 return (®ctx->sctx_rcx); 2079 case VM_REG_GUEST_RDX: 2080 return (®ctx->sctx_rdx); 2081 case VM_REG_GUEST_RDI: 2082 return (®ctx->sctx_rdi); 2083 case VM_REG_GUEST_RSI: 2084 return (®ctx->sctx_rsi); 2085 case VM_REG_GUEST_RBP: 2086 return (®ctx->sctx_rbp); 2087 case VM_REG_GUEST_R8: 2088 return (®ctx->sctx_r8); 2089 case VM_REG_GUEST_R9: 2090 return (®ctx->sctx_r9); 2091 case VM_REG_GUEST_R10: 2092 return (®ctx->sctx_r10); 2093 case VM_REG_GUEST_R11: 2094 return (®ctx->sctx_r11); 2095 case VM_REG_GUEST_R12: 2096 return (®ctx->sctx_r12); 2097 case VM_REG_GUEST_R13: 2098 return (®ctx->sctx_r13); 2099 case VM_REG_GUEST_R14: 2100 return (®ctx->sctx_r14); 2101 case VM_REG_GUEST_R15: 2102 return (®ctx->sctx_r15); 2103 case VM_REG_GUEST_DR0: 2104 return (®ctx->sctx_dr0); 2105 case VM_REG_GUEST_DR1: 2106 return (®ctx->sctx_dr1); 2107 case VM_REG_GUEST_DR2: 2108 return (®ctx->sctx_dr2); 2109 case VM_REG_GUEST_DR3: 2110 return (®ctx->sctx_dr3); 2111 default: 2112 return (NULL); 2113 } 2114 } 2115 2116 static int 2117 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) 2118 { 2119 struct svm_softc *sc; 2120 struct vmcb *vmcb; 2121 uint64_t *regp; 2122 uint64_t *fieldp; 2123 struct vmcb_segment *seg; 2124 2125 sc = arg; 2126 vmcb = 
svm_get_vmcb(sc, vcpu); 2127 2128 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); 2129 if (regp != NULL) { 2130 *val = *regp; 2131 return (0); 2132 } 2133 2134 switch (ident) { 2135 case VM_REG_GUEST_INTR_SHADOW: 2136 *val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0; 2137 break; 2138 2139 case VM_REG_GUEST_CR0: 2140 svm_get_cr0(sc, vcpu, val); 2141 break; 2142 case VM_REG_GUEST_CR2: 2143 case VM_REG_GUEST_CR3: 2144 case VM_REG_GUEST_CR4: 2145 case VM_REG_GUEST_DR6: 2146 case VM_REG_GUEST_DR7: 2147 case VM_REG_GUEST_EFER: 2148 case VM_REG_GUEST_RAX: 2149 case VM_REG_GUEST_RFLAGS: 2150 case VM_REG_GUEST_RIP: 2151 case VM_REG_GUEST_RSP: 2152 fieldp = vmcb_regptr(vmcb, ident, NULL); 2153 *val = *fieldp; 2154 break; 2155 2156 case VM_REG_GUEST_CS: 2157 case VM_REG_GUEST_DS: 2158 case VM_REG_GUEST_ES: 2159 case VM_REG_GUEST_FS: 2160 case VM_REG_GUEST_GS: 2161 case VM_REG_GUEST_SS: 2162 case VM_REG_GUEST_LDTR: 2163 case VM_REG_GUEST_TR: 2164 seg = vmcb_segptr(vmcb, ident); 2165 *val = seg->selector; 2166 break; 2167 2168 case VM_REG_GUEST_GDTR: 2169 case VM_REG_GUEST_IDTR: 2170 /* GDTR and IDTR don't have segment selectors */ 2171 return (EINVAL); 2172 2173 case VM_REG_GUEST_PDPTE0: 2174 case VM_REG_GUEST_PDPTE1: 2175 case VM_REG_GUEST_PDPTE2: 2176 case VM_REG_GUEST_PDPTE3: 2177 /* 2178 * Unlike VMX, where the PDPTEs are explicitly cached as part of 2179 * several well-defined events related to paging (such as 2180 * loading %cr3), SVM walks the PDPEs (their PDPTE) as part of 2181 * nested paging lookups. This makes these registers 2182 * effectively irrelevant on SVM. 2183 * 2184 * Rather than tossing an error, emit zeroed values so casual 2185 * consumers do not need to be as careful about that difference. 2186 */ 2187 *val = 0; 2188 break; 2189 2190 default: 2191 return (EINVAL); 2192 } 2193 2194 return (0); 2195 } 2196 2197 static int 2198 svm_setreg(void *arg, int vcpu, int ident, uint64_t val) 2199 { 2200 struct svm_softc *sc; 2201 struct vmcb *vmcb; 2202 uint64_t *regp; 2203 uint64_t *fieldp; 2204 uint32_t dirty; 2205 struct vmcb_segment *seg; 2206 2207 sc = arg; 2208 vmcb = svm_get_vmcb(sc, vcpu); 2209 2210 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident); 2211 if (regp != NULL) { 2212 *regp = val; 2213 return (0); 2214 } 2215 2216 dirty = VMCB_CACHE_NONE; 2217 switch (ident) { 2218 case VM_REG_GUEST_INTR_SHADOW: 2219 vmcb->ctrl.intr_shadow = (val != 0) ? 
1 : 0; 2220 break; 2221 2222 case VM_REG_GUEST_EFER: 2223 fieldp = vmcb_regptr(vmcb, ident, &dirty); 2224 /* EFER_SVM must always be set when the guest is executing */ 2225 *fieldp = val | EFER_SVM; 2226 dirty |= VMCB_CACHE_CR; 2227 break; 2228 2229 case VM_REG_GUEST_CR0: 2230 svm_set_cr0(sc, vcpu, val, false); 2231 break; 2232 case VM_REG_GUEST_CR2: 2233 case VM_REG_GUEST_CR3: 2234 case VM_REG_GUEST_CR4: 2235 case VM_REG_GUEST_DR6: 2236 case VM_REG_GUEST_DR7: 2237 case VM_REG_GUEST_RAX: 2238 case VM_REG_GUEST_RFLAGS: 2239 case VM_REG_GUEST_RIP: 2240 case VM_REG_GUEST_RSP: 2241 fieldp = vmcb_regptr(vmcb, ident, &dirty); 2242 *fieldp = val; 2243 break; 2244 2245 case VM_REG_GUEST_CS: 2246 case VM_REG_GUEST_DS: 2247 case VM_REG_GUEST_ES: 2248 case VM_REG_GUEST_SS: 2249 case VM_REG_GUEST_FS: 2250 case VM_REG_GUEST_GS: 2251 case VM_REG_GUEST_LDTR: 2252 case VM_REG_GUEST_TR: 2253 dirty |= VMCB_CACHE_SEG; 2254 seg = vmcb_segptr(vmcb, ident); 2255 seg->selector = (uint16_t)val; 2256 break; 2257 2258 case VM_REG_GUEST_GDTR: 2259 case VM_REG_GUEST_IDTR: 2260 /* GDTR and IDTR don't have segment selectors */ 2261 return (EINVAL); 2262 2263 case VM_REG_GUEST_PDPTE0: 2264 case VM_REG_GUEST_PDPTE1: 2265 case VM_REG_GUEST_PDPTE2: 2266 case VM_REG_GUEST_PDPTE3: 2267 /* 2268 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can 2269 * ignore attempts to set them. See handler in svm_getreg() for 2270 * more details. 2271 */ 2272 break; 2273 2274 default: 2275 return (EINVAL); 2276 } 2277 2278 if (dirty != VMCB_CACHE_NONE) { 2279 svm_set_dirty(sc, vcpu, dirty); 2280 } 2281 2282 /* 2283 * XXX deal with CR3 and invalidate TLB entries tagged with the 2284 * vcpu's ASID. This needs to be treated differently depending on 2285 * whether 'running' is true/false. 2286 */ 2287 2288 return (0); 2289 } 2290 2291 static int 2292 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) 2293 { 2294 struct vmcb *vmcb; 2295 struct svm_softc *sc; 2296 struct vmcb_segment *seg; 2297 2298 sc = arg; 2299 vmcb = svm_get_vmcb(sc, vcpu); 2300 2301 switch (reg) { 2302 case VM_REG_GUEST_CS: 2303 case VM_REG_GUEST_DS: 2304 case VM_REG_GUEST_ES: 2305 case VM_REG_GUEST_SS: 2306 case VM_REG_GUEST_FS: 2307 case VM_REG_GUEST_GS: 2308 case VM_REG_GUEST_LDTR: 2309 case VM_REG_GUEST_TR: 2310 svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); 2311 seg = vmcb_segptr(vmcb, reg); 2312 /* 2313 * Map seg_desc access to VMCB attribute format. 2314 * 2315 * SVM uses the 'P' bit in the segment attributes to indicate a 2316 * NULL segment so clear it if the segment is marked unusable. 2317 */ 2318 seg->attrib = VMCB_ACCESS2ATTR(desc->access); 2319 if (SEG_DESC_UNUSABLE(desc->access)) { 2320 seg->attrib &= ~0x80; 2321 } 2322 /* 2323 * Keep CPL synced with the DPL specified for %ss. 2324 * 2325 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD 2326 * (unlike Intel), but accepts such a possible deviation for 2327 * what is otherwise unreasonable behavior for a guest OS, since 2328 * they do the same synchronization. 
2329 */ 2330 if (reg == VM_REG_GUEST_SS) { 2331 vmcb->state.cpl = SEG_DESC_DPL(desc->access); 2332 } 2333 break; 2334 2335 case VM_REG_GUEST_GDTR: 2336 case VM_REG_GUEST_IDTR: 2337 svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); 2338 seg = vmcb_segptr(vmcb, reg); 2339 break; 2340 2341 default: 2342 return (EINVAL); 2343 } 2344 2345 ASSERT(seg != NULL); 2346 seg->base = desc->base; 2347 seg->limit = desc->limit; 2348 2349 return (0); 2350 } 2351 2352 static int 2353 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 2354 { 2355 struct vmcb *vmcb; 2356 struct svm_softc *sc; 2357 struct vmcb_segment *seg; 2358 2359 sc = arg; 2360 vmcb = svm_get_vmcb(sc, vcpu); 2361 2362 switch (reg) { 2363 case VM_REG_GUEST_DS: 2364 case VM_REG_GUEST_ES: 2365 case VM_REG_GUEST_FS: 2366 case VM_REG_GUEST_GS: 2367 case VM_REG_GUEST_SS: 2368 case VM_REG_GUEST_LDTR: 2369 seg = vmcb_segptr(vmcb, reg); 2370 desc->access = VMCB_ATTR2ACCESS(seg->attrib); 2371 /* 2372 * VT-x uses bit 16 to indicate a segment that has been loaded 2373 * with a NULL selector (aka unusable). The 'desc->access' 2374 * field is interpreted in the VT-x format by the 2375 * processor-independent code. 2376 * 2377 * SVM uses the 'P' bit to convey the same information so 2378 * convert it into the VT-x format. For more details refer to 2379 * section "Segment State in the VMCB" in APMv2. 2380 */ 2381 if ((desc->access & 0x80) == 0) { 2382 /* Unusable segment */ 2383 desc->access |= 0x10000; 2384 } 2385 2386 /* 2387 * Just as CPL (in the VMCB) is kept synced to SS when the 2388 * segment is written, so too shall the segment sync from CPL 2389 * when it is read. 2390 */ 2391 if (reg == VM_REG_GUEST_SS) { 2392 desc->access &= 2393 ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT); 2394 desc->access |= 2395 (vmcb->state.cpl & SEG_DESC_DPL_MASK) << 2396 SEG_DESC_DPL_SHIFT; 2397 } 2398 break; 2399 2400 case VM_REG_GUEST_CS: 2401 case VM_REG_GUEST_TR: 2402 seg = vmcb_segptr(vmcb, reg); 2403 desc->access = VMCB_ATTR2ACCESS(seg->attrib); 2404 break; 2405 2406 case VM_REG_GUEST_GDTR: 2407 case VM_REG_GUEST_IDTR: 2408 seg = vmcb_segptr(vmcb, reg); 2409 /* 2410 * Since there are no access bits associated with the GDTR or 2411 * the IDTR, zero out the field to ensure it does not contain 2412 * garbage which might confuse the consumer. 
2413 */ 2414 desc->access = 0; 2415 break; 2416 2417 default: 2418 return (EINVAL); 2419 } 2420 2421 ASSERT(seg != NULL); 2422 desc->base = seg->base; 2423 desc->limit = seg->limit; 2424 return (0); 2425 } 2426 2427 static int 2428 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp) 2429 { 2430 struct svm_softc *sc = arg; 2431 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); 2432 const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL); 2433 2434 if (msrp != NULL) { 2435 *valp = *msrp; 2436 return (0); 2437 } 2438 2439 return (EINVAL); 2440 } 2441 2442 static int 2443 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val) 2444 { 2445 struct svm_softc *sc = arg; 2446 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu); 2447 2448 uint32_t dirty = 0; 2449 uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty); 2450 if (msrp == NULL) { 2451 return (EINVAL); 2452 } 2453 switch (msr) { 2454 case MSR_EFER: 2455 /* 2456 * For now, just clone the logic from 2457 * svm_setreg(): 2458 * 2459 * EFER_SVM must always be set when the guest is 2460 * executing 2461 */ 2462 *msrp = val | EFER_SVM; 2463 break; 2464 /* TODO: other necessary MSR masking */ 2465 default: 2466 *msrp = val; 2467 break; 2468 } 2469 if (dirty != 0) { 2470 svm_set_dirty(sc, vcpu, dirty); 2471 } 2472 return (0); 2473 2474 } 2475 2476 static int 2477 svm_setcap(void *arg, int vcpu, int type, int val) 2478 { 2479 struct svm_softc *sc; 2480 int error; 2481 2482 sc = arg; 2483 error = 0; 2484 switch (type) { 2485 case VM_CAP_HALT_EXIT: 2486 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2487 VMCB_INTCPT_HLT, val); 2488 break; 2489 case VM_CAP_PAUSE_EXIT: 2490 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2491 VMCB_INTCPT_PAUSE, val); 2492 break; 2493 default: 2494 error = ENOENT; 2495 break; 2496 } 2497 return (error); 2498 } 2499 2500 static int 2501 svm_getcap(void *arg, int vcpu, int type, int *retval) 2502 { 2503 struct svm_softc *sc; 2504 int error; 2505 2506 sc = arg; 2507 error = 0; 2508 2509 switch (type) { 2510 case VM_CAP_HALT_EXIT: 2511 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2512 VMCB_INTCPT_HLT); 2513 break; 2514 case VM_CAP_PAUSE_EXIT: 2515 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, 2516 VMCB_INTCPT_PAUSE); 2517 break; 2518 default: 2519 error = ENOENT; 2520 break; 2521 } 2522 return (error); 2523 } 2524 2525 static struct vlapic * 2526 svm_vlapic_init(void *arg, int vcpuid) 2527 { 2528 struct svm_softc *svm_sc; 2529 struct vlapic *vlapic; 2530 2531 svm_sc = arg; 2532 vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP); 2533 vlapic->vm = svm_sc->vm; 2534 vlapic->vcpuid = vcpuid; 2535 vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; 2536 2537 vlapic_init(vlapic); 2538 2539 return (vlapic); 2540 } 2541 2542 static void 2543 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) 2544 { 2545 vlapic_cleanup(vlapic); 2546 kmem_free(vlapic, sizeof (struct vlapic)); 2547 } 2548 2549 static void 2550 svm_pause(void *arg, int vcpu) 2551 { 2552 struct svm_softc *sc = arg; 2553 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); 2554 2555 /* 2556 * If an event is pending injection in the VMCB, stash it in 2557 * exit_intinfo as if it were deferred by an exit from guest context. 2558 */ 2559 const uint64_t intinfo = ctrl->eventinj; 2560 if ((intinfo & VMCB_EVENTINJ_VALID) != 0) { 2561 svm_stash_intinfo(sc, vcpu, intinfo); 2562 ctrl->eventinj = 0; 2563 } 2564 2565 /* 2566 * Now that no event is pending injection, interrupt-window exiting and 2567 * NMI-blocking can be disabled. 
If/when this vCPU is made to run 2568 * again, those conditions will be reinstated when the now-queued events 2569 * are re-injected. 2570 */ 2571 svm_disable_intr_window_exiting(sc, vcpu); 2572 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); 2573 } 2574 2575 static void 2576 svm_savectx(void *arg, int vcpu) 2577 { 2578 struct svm_softc *sc = arg; 2579 2580 /* We should _never_ go off-CPU with the GIF disabled */ 2581 ASSERT(!hma_svm_gif_is_disabled()); 2582 2583 if (sc->vcpu[vcpu].loaded) { 2584 svm_msr_guest_exit(sc, vcpu); 2585 } 2586 } 2587 2588 static void 2589 svm_restorectx(void *arg, int vcpu) 2590 { 2591 struct svm_softc *sc = arg; 2592 2593 if (sc->vcpu[vcpu].loaded) { 2594 svm_msr_guest_enter(sc, vcpu); 2595 } 2596 } 2597 2598 static freqratio_res_t 2599 svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult) 2600 { 2601 /* 2602 * Check whether scaling is needed at all before potentially erroring 2603 * out for other reasons. 2604 */ 2605 if (guest_hz == host_hz) { 2606 return (FR_SCALING_NOT_NEEDED); 2607 } 2608 2609 /* 2610 * Confirm that scaling is available. 2611 */ 2612 if (!has_tsc_freq_ctl()) { 2613 return (FR_SCALING_NOT_SUPPORTED); 2614 } 2615 2616 /* 2617 * Verify the guest_hz is within the supported range. 2618 */ 2619 if ((guest_hz < AMD_TSC_MIN_FREQ) || 2620 (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) { 2621 return (FR_OUT_OF_RANGE); 2622 } 2623 2624 /* Calculate the multiplier. */ 2625 uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz, 2626 AMD_TSCM_FRAC_SIZE); 2627 *mult = m; 2628 2629 return (FR_VALID); 2630 } 2631 2632 struct vmm_ops vmm_ops_amd = { 2633 .init = svm_init, 2634 .cleanup = svm_cleanup, 2635 .resume = svm_restore, 2636 2637 .vminit = svm_vminit, 2638 .vmrun = svm_vmrun, 2639 .vmcleanup = svm_vmcleanup, 2640 .vmgetreg = svm_getreg, 2641 .vmsetreg = svm_setreg, 2642 .vmgetdesc = svm_getdesc, 2643 .vmsetdesc = svm_setdesc, 2644 .vmgetcap = svm_getcap, 2645 .vmsetcap = svm_setcap, 2646 .vlapic_init = svm_vlapic_init, 2647 .vlapic_cleanup = svm_vlapic_cleanup, 2648 .vmpause = svm_pause, 2649 2650 .vmsavectx = svm_savectx, 2651 .vmrestorectx = svm_restorectx, 2652 2653 .vmgetmsr = svm_get_msr, 2654 .vmsetmsr = svm_set_msr, 2655 2656 .vmfreqratio = svm_freq_ratio, 2657 .fr_intsize = AMD_TSCM_INT_SIZE, 2658 .fr_fracsize = AMD_TSCM_FRAC_SIZE, 2659 }; 2660
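/*
 * A rough worked example of the scaling that svm_freq_ratio() sets up for the
 * fr_intsize/fr_fracsize fields above, assuming vmm_calc_freq_multiplier()
 * computes, in effect, (guest_hz << AMD_TSCM_FRAC_SIZE) / host_hz and that
 * AMD_TSCM_FRAC_SIZE matches the 32 fractional bits of AMD's TSC ratio MSR
 * (neither definition is shown in this file):
 *
 *	guest_hz = 2,000,000,000 (2.0 GHz), host_hz = 2,500,000,000 (2.5 GHz)
 *	ratio    = 2.0 / 2.5 = 0.8
 *	mult     = 0.8 * 2^32 ~= 0xCCCCCCCC (integer part 0, all bits fractional)
 *
 * The AMD_TSC_MIN_FREQ and AMD_TSC_MAX_FREQ_RATIO guardrails checked in
 * svm_freq_ratio() keep this fixed-point value well within the 8-bit integer
 * and 32-bit fractional layout of the ratio MSR.
 */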