1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * This file and its contents are supplied under the terms of the 32 * Common Development and Distribution License ("CDDL"), version 1.0. 33 * You may only use this file in accordance with the terms of version 34 * 1.0 of the CDDL. 35 * 36 * A full copy of the text of the CDDL should have accompanied this 37 * source. A copy of the CDDL is also available via the Internet at 38 * http://www.illumos.org/license/CDDL. 39 * 40 * Copyright 2015 Pluribus Networks Inc. 41 * Copyright 2018 Joyent, Inc. 42 * Copyright 2022 Oxide Computer Company 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/kernel.h> 52 #include <sys/module.h> 53 #include <sys/sysctl.h> 54 #include <sys/kmem.h> 55 #include <sys/pcpu.h> 56 #include <sys/mutex.h> 57 #include <sys/proc.h> 58 #include <sys/rwlock.h> 59 #include <sys/sched.h> 60 #include <sys/systm.h> 61 #include <sys/sunddi.h> 62 #include <sys/hma.h> 63 64 #include <machine/md_var.h> 65 #include <x86/psl.h> 66 #include <x86/apicreg.h> 67 68 #include <machine/specialreg.h> 69 #include <machine/vmm.h> 70 #include <machine/vmm_dev.h> 71 #include <machine/vmparam.h> 72 #include <sys/vmm_instruction_emul.h> 73 #include <sys/vmm_vm.h> 74 #include <sys/vmm_gpt.h> 75 #include <sys/vmm_data.h> 76 77 #include "vmm_ioport.h" 78 #include "vmm_host.h" 79 #include "vmm_util.h" 80 #include "vatpic.h" 81 #include "vatpit.h" 82 #include "vhpet.h" 83 #include "vioapic.h" 84 #include "vlapic.h" 85 #include "vpmtmr.h" 86 #include "vrtc.h" 87 #include "vmm_stat.h" 88 #include "vmm_lapic.h" 89 90 #include "io/ppt.h" 91 #include "io/iommu.h" 92 93 struct vlapic; 94 95 /* Flags for vtc_status */ 96 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ 97 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ 98 99 typedef struct vm_thread_ctx { 100 struct vm *vtc_vm; 101 int vtc_vcpuid; 102 uint_t vtc_status; 103 enum vcpu_ustate vtc_ustate; 104 } vm_thread_ctx_t; 105 106 #define VMM_MTRR_VAR_MAX 10 107 #define VMM_MTRR_DEF_MASK \ 108 (MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE) 109 #define VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE) 110 #define VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID) 111 struct vm_mtrr { 112 uint64_t def_type; 113 uint64_t fixed4k[8]; 114 uint64_t fixed16k[2]; 115 uint64_t fixed64k; 116 struct { 117 uint64_t base; 118 uint64_t mask; 119 } var[VMM_MTRR_VAR_MAX]; 120 }; 121 122 /* 123 * Initialization: 124 * (a) allocated when vcpu is created 125 * (i) initialized when vcpu is created and when it is reinitialized 126 * (o) initialized the first time the vcpu is created 127 * (x) initialized before use 128 */ 129 struct vcpu { 130 /* (o) protects state, run_state, hostcpu, sipi_vector */ 131 kmutex_t lock; 132 133 enum vcpu_state state; /* (o) vcpu state */ 134 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ 135 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ 136 kcondvar_t state_cv; /* (o) IDLE-transition cv */ 137 int hostcpu; /* (o) vcpu's current host cpu */ 138 int lastloccpu; /* (o) last host cpu localized to */ 139 int reqidle; /* (i) request vcpu to idle */ 140 struct vlapic *vlapic; /* (i) APIC device model */ 141 enum x2apic_state x2apic_state; /* (i) APIC mode */ 142 uint64_t exit_intinfo; /* (i) events pending at VM exit */ 143 uint64_t exc_pending; /* (i) exception pending */ 144 bool nmi_pending; /* (i) NMI pending */ 145 bool extint_pending; /* (i) INTR pending */ 146 147 uint8_t sipi_vector; /* (i) SIPI vector */ 148 hma_fpu_t *guestfpu; /* (a,i) guest fpu state */ 149 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 150 void *stats; /* (a,i) statistics */ 151 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 152 uint64_t nextrip; /* (x) next instruction to execute */ 153 struct vie *vie_ctx; /* (x) instruction emulation context */ 154 vm_client_t *vmclient; /* (a) VM-system client */ 155 uint64_t tsc_offset; /* (x) offset from host TSC */ 156 struct vm_mtrr mtrr; /* (i) 
vcpu's MTRR */ 157 vcpu_cpuid_config_t cpuid_cfg; /* (x) cpuid configuration */ 158 159 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ 160 hrtime_t ustate_when; /* (i) time of last ustate change */ 161 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */ 162 vm_thread_ctx_t vtc; /* (o) thread state for ctxops */ 163 struct ctxop *ctxop; /* (o) ctxop storage for vcpu */ 164 }; 165 166 #define vcpu_lock(v) mutex_enter(&((v)->lock)) 167 #define vcpu_unlock(v) mutex_exit(&((v)->lock)) 168 #define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock))) 169 170 struct mem_seg { 171 size_t len; 172 bool sysmem; 173 vm_object_t *object; 174 }; 175 #define VM_MAX_MEMSEGS 5 176 177 struct mem_map { 178 vm_paddr_t gpa; 179 size_t len; 180 vm_ooffset_t segoff; 181 int segid; 182 int prot; 183 int flags; 184 }; 185 #define VM_MAX_MEMMAPS 8 186 187 /* 188 * Initialization: 189 * (o) initialized the first time the VM is created 190 * (i) initialized when VM is created and when it is reinitialized 191 * (x) initialized before use 192 */ 193 struct vm { 194 void *cookie; /* (i) cpu-specific data */ 195 void *iommu; /* (x) iommu-specific data */ 196 struct vhpet *vhpet; /* (i) virtual HPET */ 197 struct vioapic *vioapic; /* (i) virtual ioapic */ 198 struct vatpic *vatpic; /* (i) virtual atpic */ 199 struct vatpit *vatpit; /* (i) virtual atpit */ 200 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 201 struct vrtc *vrtc; /* (o) virtual RTC */ 202 volatile cpuset_t active_cpus; /* (i) active vcpus */ 203 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */ 204 int suspend; /* (i) stop VM execution */ 205 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 206 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 207 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 208 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 209 struct vmspace *vmspace; /* (o) guest's address space */ 210 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 211 /* The following describe the vm cpu topology */ 212 uint16_t sockets; /* (o) num of sockets */ 213 uint16_t cores; /* (o) num of cores/socket */ 214 uint16_t threads; /* (o) num of threads/core */ 215 uint16_t maxcpus; /* (o) max pluggable cpus */ 216 217 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ 218 hrtime_t boot_hrtime; /* (i) hrtime at VM boot */ 219 220 struct ioport_config ioports; /* (o) ioport handling */ 221 222 bool mem_transient; /* (o) alloc transient memory */ 223 }; 224 225 static int vmm_initialized; 226 227 228 static void 229 nullop_panic(void) 230 { 231 panic("null vmm operation call"); 232 } 233 234 /* Do not allow use of an un-set `ops` to do anything but panic */ 235 static struct vmm_ops vmm_ops_null = { 236 .init = (vmm_init_func_t)nullop_panic, 237 .cleanup = (vmm_cleanup_func_t)nullop_panic, 238 .resume = (vmm_resume_func_t)nullop_panic, 239 .vminit = (vmi_init_func_t)nullop_panic, 240 .vmrun = (vmi_run_func_t)nullop_panic, 241 .vmcleanup = (vmi_cleanup_func_t)nullop_panic, 242 .vmgetreg = (vmi_get_register_t)nullop_panic, 243 .vmsetreg = (vmi_set_register_t)nullop_panic, 244 .vmgetdesc = (vmi_get_desc_t)nullop_panic, 245 .vmsetdesc = (vmi_set_desc_t)nullop_panic, 246 .vmgetcap = (vmi_get_cap_t)nullop_panic, 247 .vmsetcap = (vmi_set_cap_t)nullop_panic, 248 .vlapic_init = (vmi_vlapic_init)nullop_panic, 249 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, 250 .vmsavectx = (vmi_savectx)nullop_panic, 251 .vmrestorectx = 
(vmi_restorectx)nullop_panic, 252 .vmgetmsr = (vmi_get_msr_t)nullop_panic, 253 .vmsetmsr = (vmi_set_msr_t)nullop_panic, 254 }; 255 256 static struct vmm_ops *ops = &vmm_ops_null; 257 static vmm_pte_ops_t *pte_ops = NULL; 258 259 #define VMM_INIT() ((*ops->init)()) 260 #define VMM_CLEANUP() ((*ops->cleanup)()) 261 #define VMM_RESUME() ((*ops->resume)()) 262 263 #define VMINIT(vm) ((*ops->vminit)(vm)) 264 #define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) 265 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) 266 267 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) 268 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) 269 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc)) 270 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc)) 271 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv)) 272 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val)) 273 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu)) 274 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic)) 275 276 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 277 #define fpu_stop_emulating() clts() 278 279 SDT_PROVIDER_DEFINE(vmm); 280 281 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 282 NULL); 283 284 /* 285 * Halt the guest if all vcpus are executing a HLT instruction with 286 * interrupts disabled. 287 */ 288 static int halt_detection_enabled = 1; 289 290 /* Trap into hypervisor on all guest exceptions and reflect them back */ 291 static int trace_guest_exceptions; 292 293 static void vm_free_memmap(struct vm *vm, int ident); 294 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 295 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); 296 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); 297 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); 298 299 static void vmm_savectx(void *); 300 static void vmm_restorectx(void *); 301 static const struct ctxop_template vmm_ctxop_tpl = { 302 .ct_rev = CTXOP_TPL_REV, 303 .ct_save = vmm_savectx, 304 .ct_restore = vmm_restorectx, 305 }; 306 307 #ifdef KTR 308 static const char * 309 vcpu_state2str(enum vcpu_state state) 310 { 311 312 switch (state) { 313 case VCPU_IDLE: 314 return ("idle"); 315 case VCPU_FROZEN: 316 return ("frozen"); 317 case VCPU_RUNNING: 318 return ("running"); 319 case VCPU_SLEEPING: 320 return ("sleeping"); 321 default: 322 return ("unknown"); 323 } 324 } 325 #endif 326 327 static void 328 vcpu_cleanup(struct vm *vm, int i, bool destroy) 329 { 330 struct vcpu *vcpu = &vm->vcpu[i]; 331 332 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 333 if (destroy) { 334 vmm_stat_free(vcpu->stats); 335 336 vcpu_cpuid_cleanup(&vcpu->cpuid_cfg); 337 338 hma_fpu_free(vcpu->guestfpu); 339 vcpu->guestfpu = NULL; 340 341 vie_free(vcpu->vie_ctx); 342 vcpu->vie_ctx = NULL; 343 344 vmc_destroy(vcpu->vmclient); 345 vcpu->vmclient = NULL; 346 347 ctxop_free(vcpu->ctxop); 348 mutex_destroy(&vcpu->lock); 349 } 350 } 351 352 static void 353 vcpu_init(struct vm *vm, int vcpu_id, bool create) 354 { 355 struct vcpu *vcpu; 356 357 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, 358 ("vcpu_init: invalid vcpu %d", vcpu_id)); 359 360 vcpu = &vm->vcpu[vcpu_id]; 361 362 if (create) { 363 mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL); 364 365 vcpu->state = VCPU_IDLE; 366 vcpu->hostcpu = NOCPU; 367 vcpu->lastloccpu = NOCPU; 368 
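/* Resources allocated here persist across vCPU reinit; they are freed only when the vCPU is destroyed (see vcpu_cleanup). */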
vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); 369 vcpu->stats = vmm_stat_alloc(); 370 vcpu->vie_ctx = vie_alloc(); 371 vcpu_cpuid_init(&vcpu->cpuid_cfg); 372 373 vcpu->ustate = VU_INIT; 374 vcpu->ustate_when = gethrtime(); 375 376 vcpu->vtc.vtc_vm = vm; 377 vcpu->vtc.vtc_vcpuid = vcpu_id; 378 vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc); 379 } else { 380 vie_reset(vcpu->vie_ctx); 381 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); 382 if (vcpu->ustate != VU_INIT) { 383 vcpu_ustate_change(vm, vcpu_id, VU_INIT); 384 } 385 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 386 } 387 388 vcpu->run_state = VRS_HALT; 389 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); 390 (void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); 391 vcpu->reqidle = 0; 392 vcpu->exit_intinfo = 0; 393 vcpu->nmi_pending = false; 394 vcpu->extint_pending = false; 395 vcpu->exc_pending = 0; 396 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 397 (void) hma_fpu_init(vcpu->guestfpu); 398 vmm_stat_init(vcpu->stats); 399 vcpu->tsc_offset = 0; 400 } 401 402 int 403 vcpu_trace_exceptions(struct vm *vm, int vcpuid) 404 { 405 406 return (trace_guest_exceptions); 407 } 408 409 struct vm_exit * 410 vm_exitinfo(struct vm *vm, int cpuid) 411 { 412 struct vcpu *vcpu; 413 414 if (cpuid < 0 || cpuid >= vm->maxcpus) 415 panic("vm_exitinfo: invalid cpuid %d", cpuid); 416 417 vcpu = &vm->vcpu[cpuid]; 418 419 return (&vcpu->exitinfo); 420 } 421 422 struct vie * 423 vm_vie_ctx(struct vm *vm, int cpuid) 424 { 425 if (cpuid < 0 || cpuid >= vm->maxcpus) 426 panic("vm_vie_ctx: invalid cpuid %d", cpuid); 427 428 return (vm->vcpu[cpuid].vie_ctx); 429 } 430 431 static int 432 vmm_init(void) 433 { 434 vmm_host_state_init(); 435 436 if (vmm_is_intel()) { 437 ops = &vmm_ops_intel; 438 pte_ops = &ept_pte_ops; 439 } else if (vmm_is_svm()) { 440 ops = &vmm_ops_amd; 441 pte_ops = &rvi_pte_ops; 442 } else { 443 return (ENXIO); 444 } 445 446 return (VMM_INIT()); 447 } 448 449 int 450 vmm_mod_load() 451 { 452 int error; 453 454 VERIFY(vmm_initialized == 0); 455 456 error = vmm_init(); 457 if (error == 0) 458 vmm_initialized = 1; 459 460 return (error); 461 } 462 463 int 464 vmm_mod_unload() 465 { 466 int error; 467 468 VERIFY(vmm_initialized == 1); 469 470 error = VMM_CLEANUP(); 471 if (error) 472 return (error); 473 vmm_initialized = 0; 474 475 return (0); 476 } 477 478 /* 479 * Create a test IOMMU domain to see if the host system has necessary hardware 480 * and drivers to do so. 481 */ 482 bool 483 vmm_check_iommu(void) 484 { 485 void *domain; 486 const size_t arb_test_sz = (1UL << 32); 487 488 domain = iommu_create_domain(arb_test_sz); 489 if (domain == NULL) { 490 return (false); 491 } 492 iommu_destroy_domain(domain); 493 return (true); 494 } 495 496 static void 497 vm_init(struct vm *vm, bool create) 498 { 499 int i; 500 501 vm->cookie = VMINIT(vm); 502 vm->iommu = NULL; 503 vm->vioapic = vioapic_init(vm); 504 vm->vhpet = vhpet_init(vm); 505 vm->vatpic = vatpic_init(vm); 506 vm->vatpit = vatpit_init(vm); 507 vm->vpmtmr = vpmtmr_init(vm); 508 if (create) 509 vm->vrtc = vrtc_init(vm); 510 511 vm_inout_init(vm, &vm->ioports); 512 513 CPU_ZERO(&vm->active_cpus); 514 CPU_ZERO(&vm->debug_cpus); 515 516 vm->suspend = 0; 517 CPU_ZERO(&vm->suspended_cpus); 518 519 for (i = 0; i < vm->maxcpus; i++) 520 vcpu_init(vm, i, create); 521 522 /* 523 * Configure the VM-wide TSC offset so that the call to vm_init() 524 * represents the boot time (when the TSC(s) read 0). 
Each vCPU will 525 * have its own offset from this, which is altered if/when the guest 526 * writes to MSR_TSC. 527 * 528 * The TSC offsetting math is all unsigned, using overflow for negative 529 * offsets. A reading of the TSC is negated to form the boot offset. 530 */ 531 const uint64_t boot_tsc = rdtsc_offset(); 532 vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc); 533 534 /* Convert the boot TSC reading to hrtime */ 535 vm->boot_hrtime = (hrtime_t)boot_tsc; 536 scalehrtime(&vm->boot_hrtime); 537 } 538 539 /* 540 * The default CPU topology is a single thread per package. 541 */ 542 uint_t cores_per_package = 1; 543 uint_t threads_per_core = 1; 544 545 /* 546 * Debugging tunable to enable dirty-page-tracking. 547 * (Remains off by default for now) 548 */ 549 bool gpt_track_dirty = false; 550 551 int 552 vm_create(uint64_t flags, struct vm **retvm) 553 { 554 struct vm *vm; 555 struct vmspace *vmspace; 556 557 /* 558 * If vmm.ko could not be successfully initialized then don't attempt 559 * to create the virtual machine. 560 */ 561 if (!vmm_initialized) 562 return (ENXIO); 563 564 vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); 565 if (vmspace == NULL) 566 return (ENOMEM); 567 568 vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP); 569 570 vm->vmspace = vmspace; 571 vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; 572 for (uint_t i = 0; i < VM_MAXCPU; i++) { 573 vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); 574 } 575 576 vm->sockets = 1; 577 vm->cores = cores_per_package; /* XXX backwards compatibility */ 578 vm->threads = threads_per_core; /* XXX backwards compatibility */ 579 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 580 581 vm_init(vm, true); 582 583 *retvm = vm; 584 return (0); 585 } 586 587 void 588 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 589 uint16_t *threads, uint16_t *maxcpus) 590 { 591 *sockets = vm->sockets; 592 *cores = vm->cores; 593 *threads = vm->threads; 594 *maxcpus = vm->maxcpus; 595 } 596 597 uint16_t 598 vm_get_maxcpus(struct vm *vm) 599 { 600 return (vm->maxcpus); 601 } 602 603 int 604 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 605 uint16_t threads, uint16_t maxcpus) 606 { 607 if (maxcpus != 0) 608 return (EINVAL); /* XXX remove when supported */ 609 if ((sockets * cores * threads) > vm->maxcpus) 610 return (EINVAL); 611 /* XXX need to check sockets * cores * threads == vCPU, how? */ 612 vm->sockets = sockets; 613 vm->cores = cores; 614 vm->threads = threads; 615 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 616 return (0); 617 } 618 619 static void 620 vm_cleanup(struct vm *vm, bool destroy) 621 { 622 struct mem_map *mm; 623 int i; 624 625 ppt_unassign_all(vm); 626 627 if (vm->iommu != NULL) 628 iommu_destroy_domain(vm->iommu); 629 630 /* 631 * Devices which attach their own ioport hooks should be cleaned up 632 * first so they can tear down those registrations. 633 */ 634 vpmtmr_cleanup(vm->vpmtmr); 635 636 vm_inout_cleanup(vm, &vm->ioports); 637 638 if (destroy) 639 vrtc_cleanup(vm->vrtc); 640 else 641 vrtc_reset(vm->vrtc); 642 643 vatpit_cleanup(vm->vatpit); 644 vhpet_cleanup(vm->vhpet); 645 vatpic_cleanup(vm->vatpic); 646 vioapic_cleanup(vm->vioapic); 647 648 for (i = 0; i < vm->maxcpus; i++) 649 vcpu_cleanup(vm, i, destroy); 650 651 VMCLEANUP(vm->cookie); 652 653 /* 654 * System memory is removed from the guest address space only when 655 * the VM is destroyed. This is because the mapping remains the same 656 * across VM reset.
657 * 658 * Device memory can be relocated by the guest (e.g. using PCI BARs) 659 * so those mappings are removed on a VM reset. 660 */ 661 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 662 mm = &vm->mem_maps[i]; 663 if (destroy || !sysmem_mapping(vm, mm)) { 664 vm_free_memmap(vm, i); 665 } else { 666 /* 667 * We need to reset the IOMMU flag so this mapping can 668 * be reused when a VM is rebooted. Since the IOMMU 669 * domain has already been destroyed we can just reset 670 * the flag here. 671 */ 672 mm->flags &= ~VM_MEMMAP_F_IOMMU; 673 } 674 } 675 676 if (destroy) { 677 for (i = 0; i < VM_MAX_MEMSEGS; i++) 678 vm_free_memseg(vm, i); 679 680 vmspace_destroy(vm->vmspace); 681 vm->vmspace = NULL; 682 } 683 } 684 685 void 686 vm_destroy(struct vm *vm) 687 { 688 vm_cleanup(vm, true); 689 kmem_free(vm, sizeof (*vm)); 690 } 691 692 int 693 vm_reinit(struct vm *vm, uint64_t flags) 694 { 695 /* A virtual machine can be reset only if all vcpus are suspended. */ 696 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) { 697 if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) { 698 return (EBUSY); 699 } 700 701 /* 702 * Force the VM (and all its vCPUs) into a suspended state. 703 * This should be quick and easy, since the vm_reinit() call is 704 * made while holding the VM write lock, which requires holding 705 * all of the vCPUs in the VCPU_FROZEN state. 706 */ 707 (void) atomic_cmpset_int((uint_t *)&vm->suspend, 0, 708 VM_SUSPEND_RESET); 709 for (uint_t i = 0; i < vm->maxcpus; i++) { 710 struct vcpu *vcpu = &vm->vcpu[i]; 711 712 if (CPU_ISSET(i, &vm->suspended_cpus) || 713 !CPU_ISSET(i, &vm->active_cpus)) { 714 continue; 715 } 716 717 vcpu_lock(vcpu); 718 VERIFY3U(vcpu->state, ==, VCPU_FROZEN); 719 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 720 vcpu_unlock(vcpu); 721 } 722 723 VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus)); 724 } 725 726 vm_cleanup(vm, false); 727 vm_init(vm, false); 728 return (0); 729 } 730 731 int 732 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 733 { 734 vm_object_t *obj; 735 736 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 737 return (ENOMEM); 738 else 739 return (0); 740 } 741 742 int 743 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 744 { 745 return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); 746 } 747 748 /* 749 * Return 'true' if 'gpa' is allocated in the guest address space. 750 * 751 * This function is called in the context of a running vcpu which acts as 752 * an implicit lock on 'vm->mem_maps[]'. 
753 */ 754 bool 755 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 756 { 757 struct mem_map *mm; 758 int i; 759 760 #ifdef INVARIANTS 761 int hostcpu, state; 762 state = vcpu_get_state(vm, vcpuid, &hostcpu); 763 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 764 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 765 #endif 766 767 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 768 mm = &vm->mem_maps[i]; 769 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 770 return (true); /* 'gpa' is sysmem or devmem */ 771 } 772 773 if (ppt_is_mmio(vm, gpa)) 774 return (true); /* 'gpa' is pci passthru mmio */ 775 776 return (false); 777 } 778 779 int 780 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 781 { 782 struct mem_seg *seg; 783 vm_object_t *obj; 784 785 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 786 return (EINVAL); 787 788 if (len == 0 || (len & PAGE_MASK)) 789 return (EINVAL); 790 791 seg = &vm->mem_segs[ident]; 792 if (seg->object != NULL) { 793 if (seg->len == len && seg->sysmem == sysmem) 794 return (EEXIST); 795 else 796 return (EINVAL); 797 } 798 799 obj = vm_object_mem_allocate(len, vm->mem_transient); 800 if (obj == NULL) 801 return (ENOMEM); 802 803 seg->len = len; 804 seg->object = obj; 805 seg->sysmem = sysmem; 806 return (0); 807 } 808 809 int 810 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 811 vm_object_t **objptr) 812 { 813 struct mem_seg *seg; 814 815 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 816 return (EINVAL); 817 818 seg = &vm->mem_segs[ident]; 819 if (len) 820 *len = seg->len; 821 if (sysmem) 822 *sysmem = seg->sysmem; 823 if (objptr) 824 *objptr = seg->object; 825 return (0); 826 } 827 828 void 829 vm_free_memseg(struct vm *vm, int ident) 830 { 831 struct mem_seg *seg; 832 833 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 834 ("%s: invalid memseg ident %d", __func__, ident)); 835 836 seg = &vm->mem_segs[ident]; 837 if (seg->object != NULL) { 838 vm_object_release(seg->object); 839 bzero(seg, sizeof (struct mem_seg)); 840 } 841 } 842 843 int 844 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 845 size_t len, int prot, int flags) 846 { 847 struct mem_seg *seg; 848 struct mem_map *m, *map; 849 vm_ooffset_t last; 850 int i, error; 851 852 if (prot == 0 || (prot & ~(PROT_ALL)) != 0) 853 return (EINVAL); 854 855 if (flags & ~VM_MEMMAP_F_WIRED) 856 return (EINVAL); 857 858 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 859 return (EINVAL); 860 861 seg = &vm->mem_segs[segid]; 862 if (seg->object == NULL) 863 return (EINVAL); 864 865 last = first + len; 866 if (first < 0 || first >= last || last > seg->len) 867 return (EINVAL); 868 869 if ((gpa | first | last) & PAGE_MASK) 870 return (EINVAL); 871 872 map = NULL; 873 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 874 m = &vm->mem_maps[i]; 875 if (m->len == 0) { 876 map = m; 877 break; 878 } 879 } 880 881 if (map == NULL) 882 return (ENOSPC); 883 884 error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); 885 if (error != 0) 886 return (EFAULT); 887 888 vm_object_reference(seg->object); 889 890 if ((flags & VM_MEMMAP_F_WIRED) != 0) { 891 error = vmspace_populate(vm->vmspace, gpa, gpa + len); 892 if (error != 0) { 893 VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len)); 894 return (EFAULT); 895 } 896 } 897 898 map->gpa = gpa; 899 map->len = len; 900 map->segoff = first; 901 map->segid = segid; 902 map->prot = prot; 903 map->flags = flags; 904 return (0); 905 } 906 907 int 908 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t 
len) 909 { 910 struct mem_map *m; 911 int i; 912 913 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 914 m = &vm->mem_maps[i]; 915 if (m->gpa == gpa && m->len == len && 916 (m->flags & VM_MEMMAP_F_IOMMU) == 0) { 917 vm_free_memmap(vm, i); 918 return (0); 919 } 920 } 921 922 return (EINVAL); 923 } 924 925 int 926 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 927 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 928 { 929 struct mem_map *mm, *mmnext; 930 int i; 931 932 mmnext = NULL; 933 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 934 mm = &vm->mem_maps[i]; 935 if (mm->len == 0 || mm->gpa < *gpa) 936 continue; 937 if (mmnext == NULL || mm->gpa < mmnext->gpa) 938 mmnext = mm; 939 } 940 941 if (mmnext != NULL) { 942 *gpa = mmnext->gpa; 943 if (segid) 944 *segid = mmnext->segid; 945 if (segoff) 946 *segoff = mmnext->segoff; 947 if (len) 948 *len = mmnext->len; 949 if (prot) 950 *prot = mmnext->prot; 951 if (flags) 952 *flags = mmnext->flags; 953 return (0); 954 } else { 955 return (ENOENT); 956 } 957 } 958 959 static void 960 vm_free_memmap(struct vm *vm, int ident) 961 { 962 struct mem_map *mm; 963 int error; 964 965 mm = &vm->mem_maps[ident]; 966 if (mm->len) { 967 error = vmspace_unmap(vm->vmspace, mm->gpa, 968 mm->gpa + mm->len); 969 KASSERT(error == 0, ("%s: vmspace_unmap error %d", 970 __func__, error)); 971 bzero(mm, sizeof (struct mem_map)); 972 } 973 } 974 975 static __inline bool 976 sysmem_mapping(struct vm *vm, struct mem_map *mm) 977 { 978 979 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 980 return (true); 981 else 982 return (false); 983 } 984 985 vm_paddr_t 986 vmm_sysmem_maxaddr(struct vm *vm) 987 { 988 struct mem_map *mm; 989 vm_paddr_t maxaddr; 990 int i; 991 992 maxaddr = 0; 993 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 994 mm = &vm->mem_maps[i]; 995 if (sysmem_mapping(vm, mm)) { 996 if (maxaddr < mm->gpa + mm->len) 997 maxaddr = mm->gpa + mm->len; 998 } 999 } 1000 return (maxaddr); 1001 } 1002 1003 static void 1004 vm_iommu_modify(struct vm *vm, bool map) 1005 { 1006 int i, sz; 1007 vm_paddr_t gpa, hpa; 1008 struct mem_map *mm; 1009 vm_client_t *vmc; 1010 1011 sz = PAGE_SIZE; 1012 vmc = vmspace_client_alloc(vm->vmspace); 1013 1014 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 1015 mm = &vm->mem_maps[i]; 1016 if (!sysmem_mapping(vm, mm)) 1017 continue; 1018 1019 if (map) { 1020 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 1021 ("iommu map found invalid memmap %lx/%lx/%x", 1022 mm->gpa, mm->len, mm->flags)); 1023 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 1024 continue; 1025 mm->flags |= VM_MEMMAP_F_IOMMU; 1026 } else { 1027 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 1028 continue; 1029 mm->flags &= ~VM_MEMMAP_F_IOMMU; 1030 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 1031 ("iommu unmap found invalid memmap %lx/%lx/%x", 1032 mm->gpa, mm->len, mm->flags)); 1033 } 1034 1035 gpa = mm->gpa; 1036 while (gpa < mm->gpa + mm->len) { 1037 vm_page_t *vmp; 1038 1039 vmp = vmc_hold(vmc, gpa, PROT_WRITE); 1040 ASSERT(vmp != NULL); 1041 hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); 1042 (void) vmp_release(vmp); 1043 1044 /* 1045 * When originally ported from FreeBSD, the logic for 1046 * adding memory to the guest domain would 1047 * simultaneously remove it from the host domain. The 1048 * justification for that is not clear, and FreeBSD has 1049 * subsequently changed the behavior to not remove the 1050 * memory from the host domain. 
1051 * 1052 * Leaving the guest memory in the host domain for the 1053 * life of the VM is necessary to make it available for 1054 * DMA, such as through viona in the TX path. 1055 */ 1056 if (map) { 1057 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 1058 } else { 1059 iommu_remove_mapping(vm->iommu, gpa, sz); 1060 } 1061 1062 gpa += PAGE_SIZE; 1063 } 1064 } 1065 vmc_destroy(vmc); 1066 1067 /* 1068 * Invalidate the cached translations associated with the domain 1069 * from which pages were removed. 1070 */ 1071 iommu_invalidate_tlb(vm->iommu); 1072 } 1073 1074 int 1075 vm_unassign_pptdev(struct vm *vm, int pptfd) 1076 { 1077 int error; 1078 1079 error = ppt_unassign_device(vm, pptfd); 1080 if (error) 1081 return (error); 1082 1083 if (ppt_assigned_devices(vm) == 0) 1084 vm_iommu_modify(vm, false); 1085 1086 return (0); 1087 } 1088 1089 int 1090 vm_assign_pptdev(struct vm *vm, int pptfd) 1091 { 1092 int error; 1093 vm_paddr_t maxaddr; 1094 1095 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 1096 if (ppt_assigned_devices(vm) == 0) { 1097 KASSERT(vm->iommu == NULL, 1098 ("vm_assign_pptdev: iommu must be NULL")); 1099 maxaddr = vmm_sysmem_maxaddr(vm); 1100 vm->iommu = iommu_create_domain(maxaddr); 1101 if (vm->iommu == NULL) 1102 return (ENXIO); 1103 vm_iommu_modify(vm, true); 1104 } 1105 1106 error = ppt_assign_device(vm, pptfd); 1107 return (error); 1108 } 1109 1110 int 1111 vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval) 1112 { 1113 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1114 return (EINVAL); 1115 1116 if (reg >= VM_REG_LAST) 1117 return (EINVAL); 1118 1119 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1120 switch (reg) { 1121 case VM_REG_GUEST_XCR0: 1122 *retval = vcpu->guest_xcr0; 1123 return (0); 1124 default: 1125 return (VMGETREG(vm->cookie, vcpuid, reg, retval)); 1126 } 1127 } 1128 1129 int 1130 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 1131 { 1132 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1133 return (EINVAL); 1134 1135 if (reg >= VM_REG_LAST) 1136 return (EINVAL); 1137 1138 int error; 1139 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1140 switch (reg) { 1141 case VM_REG_GUEST_RIP: 1142 error = VMSETREG(vm->cookie, vcpuid, reg, val); 1143 if (error == 0) { 1144 vcpu->nextrip = val; 1145 } 1146 return (error); 1147 case VM_REG_GUEST_XCR0: 1148 if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) { 1149 return (EINVAL); 1150 } 1151 vcpu->guest_xcr0 = val; 1152 return (0); 1153 default: 1154 return (VMSETREG(vm->cookie, vcpuid, reg, val)); 1155 } 1156 } 1157 1158 static bool 1159 is_descriptor_table(int reg) 1160 { 1161 switch (reg) { 1162 case VM_REG_GUEST_IDTR: 1163 case VM_REG_GUEST_GDTR: 1164 return (true); 1165 default: 1166 return (false); 1167 } 1168 } 1169 1170 static bool 1171 is_segment_register(int reg) 1172 { 1173 switch (reg) { 1174 case VM_REG_GUEST_ES: 1175 case VM_REG_GUEST_CS: 1176 case VM_REG_GUEST_SS: 1177 case VM_REG_GUEST_DS: 1178 case VM_REG_GUEST_FS: 1179 case VM_REG_GUEST_GS: 1180 case VM_REG_GUEST_TR: 1181 case VM_REG_GUEST_LDTR: 1182 return (true); 1183 default: 1184 return (false); 1185 } 1186 } 1187 1188 int 1189 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) 1190 { 1191 1192 if (vcpu < 0 || vcpu >= vm->maxcpus) 1193 return (EINVAL); 1194 1195 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1196 return (EINVAL); 1197 1198 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 1199 } 1200 1201 int 1202 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) 
1203 { 1204 if (vcpu < 0 || vcpu >= vm->maxcpus) 1205 return (EINVAL); 1206 1207 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1208 return (EINVAL); 1209 1210 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 1211 } 1212 1213 static int 1214 translate_hma_xsave_result(hma_fpu_xsave_result_t res) 1215 { 1216 switch (res) { 1217 case HFXR_OK: 1218 return (0); 1219 case HFXR_NO_SPACE: 1220 return (ENOSPC); 1221 case HFXR_BAD_ALIGN: 1222 case HFXR_UNSUP_FMT: 1223 case HFXR_UNSUP_FEAT: 1224 case HFXR_INVALID_DATA: 1225 return (EINVAL); 1226 default: 1227 panic("unexpected xsave result"); 1228 } 1229 } 1230 1231 int 1232 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1233 { 1234 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1235 return (EINVAL); 1236 1237 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1238 hma_fpu_xsave_result_t res; 1239 1240 res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); 1241 return (translate_hma_xsave_result(res)); 1242 } 1243 1244 int 1245 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1246 { 1247 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1248 return (EINVAL); 1249 1250 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1251 hma_fpu_xsave_result_t res; 1252 1253 res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); 1254 return (translate_hma_xsave_result(res)); 1255 } 1256 1257 int 1258 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) 1259 { 1260 struct vcpu *vcpu; 1261 1262 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1263 return (EINVAL); 1264 } 1265 1266 vcpu = &vm->vcpu[vcpuid]; 1267 1268 vcpu_lock(vcpu); 1269 *state = vcpu->run_state; 1270 *sipi_vec = vcpu->sipi_vector; 1271 vcpu_unlock(vcpu); 1272 1273 return (0); 1274 } 1275 1276 int 1277 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) 1278 { 1279 struct vcpu *vcpu; 1280 1281 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1282 return (EINVAL); 1283 } 1284 if (!VRS_IS_VALID(state)) { 1285 return (EINVAL); 1286 } 1287 1288 vcpu = &vm->vcpu[vcpuid]; 1289 1290 vcpu_lock(vcpu); 1291 vcpu->run_state = state; 1292 vcpu->sipi_vector = sipi_vec; 1293 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1294 vcpu_unlock(vcpu); 1295 1296 return (0); 1297 } 1298 1299 void 1300 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) 1301 { 1302 vmspace_t *vms = vm_get_vmspace(vm); 1303 vmspace_track_dirty(vms, gpa, len, bitmap); 1304 } 1305 1306 static void 1307 restore_guest_fpustate(struct vcpu *vcpu) 1308 { 1309 /* Save host FPU and restore guest FPU */ 1310 fpu_stop_emulating(); 1311 hma_fpu_start_guest(vcpu->guestfpu); 1312 1313 /* restore guest XCR0 if XSAVE is enabled in the host */ 1314 if (rcr4() & CR4_XSAVE) 1315 load_xcr(0, vcpu->guest_xcr0); 1316 1317 /* 1318 * The FPU is now "dirty" with the guest's state so turn on emulation 1319 * to trap any access to the FPU by the host. 1320 */ 1321 fpu_start_emulating(); 1322 } 1323 1324 static void 1325 save_guest_fpustate(struct vcpu *vcpu) 1326 { 1327 1328 if ((rcr0() & CR0_TS) == 0) 1329 panic("fpu emulation not enabled in host!"); 1330 1331 /* save guest XCR0 and restore host XCR0 */ 1332 if (rcr4() & CR4_XSAVE) { 1333 vcpu->guest_xcr0 = rxcr(0); 1334 load_xcr(0, vmm_get_host_xcr0()); 1335 } 1336 1337 /* save guest FPU and restore host FPU */ 1338 fpu_stop_emulating(); 1339 hma_fpu_stop_guest(vcpu->guestfpu); 1340 /* 1341 * When the host state has been restored, we should not re-enable 1342 * CR0.TS on illumos for eager FPU. 
1343 */ 1344 } 1345 1346 static int 1347 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1348 bool from_idle) 1349 { 1350 struct vcpu *vcpu; 1351 int error; 1352 1353 vcpu = &vm->vcpu[vcpuid]; 1354 vcpu_assert_locked(vcpu); 1355 1356 /* 1357 * State transitions from the vmmdev_ioctl() must always begin from 1358 * the VCPU_IDLE state. This guarantees that there is only a single 1359 * ioctl() operating on a vcpu at any point. 1360 */ 1361 if (from_idle) { 1362 while (vcpu->state != VCPU_IDLE) { 1363 vcpu->reqidle = 1; 1364 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1365 cv_wait(&vcpu->state_cv, &vcpu->lock); 1366 } 1367 } else { 1368 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1369 "vcpu idle state")); 1370 } 1371 1372 if (vcpu->state == VCPU_RUNNING) { 1373 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1374 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1375 } else { 1376 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1377 "vcpu that is not running", vcpu->hostcpu)); 1378 } 1379 1380 /* 1381 * The following state transitions are allowed: 1382 * IDLE -> FROZEN -> IDLE 1383 * FROZEN -> RUNNING -> FROZEN 1384 * FROZEN -> SLEEPING -> FROZEN 1385 */ 1386 switch (vcpu->state) { 1387 case VCPU_IDLE: 1388 case VCPU_RUNNING: 1389 case VCPU_SLEEPING: 1390 error = (newstate != VCPU_FROZEN); 1391 break; 1392 case VCPU_FROZEN: 1393 error = (newstate == VCPU_FROZEN); 1394 break; 1395 default: 1396 error = 1; 1397 break; 1398 } 1399 1400 if (error) 1401 return (EBUSY); 1402 1403 vcpu->state = newstate; 1404 if (newstate == VCPU_RUNNING) 1405 vcpu->hostcpu = curcpu; 1406 else 1407 vcpu->hostcpu = NOCPU; 1408 1409 if (newstate == VCPU_IDLE) { 1410 cv_broadcast(&vcpu->state_cv); 1411 } 1412 1413 return (0); 1414 } 1415 1416 static void 1417 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1418 { 1419 int error; 1420 1421 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1422 panic("Error %d setting state to %d\n", error, newstate); 1423 } 1424 1425 static void 1426 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1427 { 1428 int error; 1429 1430 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1431 panic("Error %d setting state to %d", error, newstate); 1432 } 1433 1434 /* 1435 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1436 */ 1437 static int 1438 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) 1439 { 1440 struct vcpu *vcpu; 1441 int vcpu_halted, vm_halted; 1442 bool userspace_exit = false; 1443 1444 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1445 1446 vcpu = &vm->vcpu[vcpuid]; 1447 vcpu_halted = 0; 1448 vm_halted = 0; 1449 1450 vcpu_lock(vcpu); 1451 while (1) { 1452 /* 1453 * Do a final check for pending interrupts (including NMI and 1454 * INIT) before putting this thread to sleep. 1455 */ 1456 if (vm_nmi_pending(vm, vcpuid)) 1457 break; 1458 if (vcpu_run_state_pending(vm, vcpuid)) 1459 break; 1460 if (!intr_disabled) { 1461 if (vm_extint_pending(vm, vcpuid) || 1462 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1463 break; 1464 } 1465 } 1466 1467 /* 1468 * Also check for software events which would cause a wake-up. 1469 * This will set the appropriate exitcode directly, rather than 1470 * requiring a trip through VM_RUN(). 
1471 */ 1472 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1473 userspace_exit = true; 1474 break; 1475 } 1476 1477 /* 1478 * Some Linux guests implement "halt" by having all vcpus 1479 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1480 * track of the vcpus that have entered this state. When all 1481 * vcpus enter the halted state the virtual machine is halted. 1482 */ 1483 if (intr_disabled) { 1484 if (!vcpu_halted && halt_detection_enabled) { 1485 vcpu_halted = 1; 1486 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1487 } 1488 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1489 vm_halted = 1; 1490 break; 1491 } 1492 } 1493 1494 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1495 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1496 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1497 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1498 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1499 } 1500 1501 if (vcpu_halted) 1502 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1503 1504 vcpu_unlock(vcpu); 1505 1506 if (vm_halted) { 1507 (void) vm_suspend(vm, VM_SUSPEND_HALT); 1508 } 1509 1510 return (userspace_exit ? -1 : 0); 1511 } 1512 1513 static int 1514 vm_handle_paging(struct vm *vm, int vcpuid) 1515 { 1516 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1517 vm_client_t *vmc = vcpu->vmclient; 1518 struct vm_exit *vme = &vcpu->exitinfo; 1519 const int ftype = vme->u.paging.fault_type; 1520 1521 ASSERT0(vme->inst_length); 1522 ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC); 1523 1524 if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) { 1525 /* 1526 * If the fault cannot be serviced, kick it out to userspace for 1527 * handling (or more likely, halting the instance). 1528 */ 1529 return (-1); 1530 } 1531 1532 return (0); 1533 } 1534 1535 int 1536 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, 1537 int rsize) 1538 { 1539 int err = ESRCH; 1540 1541 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1542 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1543 1544 err = vlapic_mmio_read(vlapic, gpa, rval, rsize); 1545 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1546 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); 1547 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1548 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); 1549 } 1550 1551 return (err); 1552 } 1553 1554 int 1555 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, 1556 int wsize) 1557 { 1558 int err = ESRCH; 1559 1560 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1561 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1562 1563 err = vlapic_mmio_write(vlapic, gpa, wval, wsize); 1564 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1565 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); 1566 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1567 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); 1568 } 1569 1570 return (err); 1571 } 1572 1573 static int 1574 vm_handle_mmio_emul(struct vm *vm, int vcpuid) 1575 { 1576 struct vie *vie; 1577 struct vcpu *vcpu; 1578 struct vm_exit *vme; 1579 uint64_t inst_addr; 1580 int error, fault, cs_d; 1581 1582 vcpu = &vm->vcpu[vcpuid]; 1583 vme = &vcpu->exitinfo; 1584 vie = vcpu->vie_ctx; 1585 1586 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1587 __func__, vme->inst_length)); 1588 1589 inst_addr = vme->rip + vme->u.mmio_emul.cs_base; 1590 cs_d = vme->u.mmio_emul.cs_d; 1591 
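/* Emulation proceeds in stages: fetch the instruction bytes (if not already present), decode them, verify the decoded GLA against the exit state, and then emulate the access, deferring to userspace when decode or emulation cannot be completed in-kernel. */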
1592 /* Fetch the faulting instruction */ 1593 if (vie_needs_fetch(vie)) { 1594 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, 1595 &fault); 1596 if (error != 0) { 1597 return (error); 1598 } else if (fault) { 1599 /* 1600 * If a fault during instruction fetch was encountered, 1601 * it will have asserted that the appropriate exception 1602 * be injected at next entry. 1603 * No further work is required. 1604 */ 1605 return (0); 1606 } 1607 } 1608 1609 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1610 /* Dump (unrecognized) instruction bytes in userspace */ 1611 vie_fallback_exitinfo(vie, vme); 1612 return (-1); 1613 } 1614 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && 1615 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { 1616 /* Decoded GLA does not match GLA from VM exit state */ 1617 vie_fallback_exitinfo(vie, vme); 1618 return (-1); 1619 } 1620 1621 repeat: 1622 error = vie_emulate_mmio(vie, vm, vcpuid); 1623 if (error < 0) { 1624 /* 1625 * MMIO not handled by any of the in-kernel-emulated devices, so 1626 * make a trip out to userspace for it. 1627 */ 1628 vie_exitinfo(vie, vme); 1629 } else if (error == EAGAIN) { 1630 /* 1631 * Continue emulating the rep-prefixed instruction, which has 1632 * not completed its iterations. 1633 * 1634 * In case this can be emulated in-kernel and has a high 1635 * repetition count (causing a tight spin), it should be 1636 * deferential to yield conditions. 1637 */ 1638 if (!vcpu_should_yield(vm, vcpuid)) { 1639 goto repeat; 1640 } else { 1641 /* 1642 * Defer to the contending load by making a trip to 1643 * userspace with a no-op (BOGUS) exit reason. 1644 */ 1645 vie_reset(vie); 1646 vme->exitcode = VM_EXITCODE_BOGUS; 1647 return (-1); 1648 } 1649 } else if (error == 0) { 1650 /* Update %rip now that instruction has been emulated */ 1651 vie_advance_pc(vie, &vcpu->nextrip); 1652 } 1653 return (error); 1654 } 1655 1656 static int 1657 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) 1658 { 1659 struct vcpu *vcpu; 1660 struct vie *vie; 1661 int err; 1662 1663 vcpu = &vm->vcpu[vcpuid]; 1664 vie = vcpu->vie_ctx; 1665 1666 repeat: 1667 err = vie_emulate_inout(vie, vm, vcpuid); 1668 1669 if (err < 0) { 1670 /* 1671 * In/out not handled by any of the in-kernel-emulated devices, 1672 * so make a trip out to userspace for it. 1673 */ 1674 vie_exitinfo(vie, vme); 1675 return (err); 1676 } else if (err == EAGAIN) { 1677 /* 1678 * Continue emulating the rep-prefixed ins/outs, which has not 1679 * completed its iterations. 1680 * 1681 * In case this can be emulated in-kernel and has a high 1682 * repetition count (causing a tight spin), it should be 1683 * deferential to yield conditions. 1684 */ 1685 if (!vcpu_should_yield(vm, vcpuid)) { 1686 goto repeat; 1687 } else { 1688 /* 1689 * Defer to the contending load by making a trip to 1690 * userspace with a no-op (BOGUS) exit reason. 1691 */ 1692 vie_reset(vie); 1693 vme->exitcode = VM_EXITCODE_BOGUS; 1694 return (-1); 1695 } 1696 } else if (err != 0) { 1697 /* Emulation failure. Bail all the way out to userspace. 
*/ 1698 vme->exitcode = VM_EXITCODE_INST_EMUL; 1699 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 1700 return (-1); 1701 } 1702 1703 vie_advance_pc(vie, &vcpu->nextrip); 1704 return (0); 1705 } 1706 1707 static int 1708 vm_handle_inst_emul(struct vm *vm, int vcpuid) 1709 { 1710 struct vie *vie; 1711 struct vcpu *vcpu; 1712 struct vm_exit *vme; 1713 uint64_t cs_base; 1714 int error, fault, cs_d; 1715 1716 vcpu = &vm->vcpu[vcpuid]; 1717 vme = &vcpu->exitinfo; 1718 vie = vcpu->vie_ctx; 1719 1720 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d); 1721 1722 /* Fetch the faulting instruction */ 1723 ASSERT(vie_needs_fetch(vie)); 1724 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base, 1725 &fault); 1726 if (error != 0) { 1727 return (error); 1728 } else if (fault) { 1729 /* 1730 * If a fault during instruction fetch was encountered, it will 1731 * have asserted that the appropriate exception be injected at 1732 * next entry. No further work is required. 1733 */ 1734 return (0); 1735 } 1736 1737 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1738 /* Dump (unrecognized) instruction bytes in userspace */ 1739 vie_fallback_exitinfo(vie, vme); 1740 return (-1); 1741 } 1742 1743 error = vie_emulate_other(vie, vm, vcpuid); 1744 if (error != 0) { 1745 /* 1746 * Instruction emulation was unable to complete successfully, so 1747 * kick it out to userspace for handling. 1748 */ 1749 vie_fallback_exitinfo(vie, vme); 1750 } else { 1751 /* Update %rip now that instruction has been emulated */ 1752 vie_advance_pc(vie, &vcpu->nextrip); 1753 } 1754 return (error); 1755 } 1756 1757 static int 1758 vm_handle_suspend(struct vm *vm, int vcpuid) 1759 { 1760 int i; 1761 struct vcpu *vcpu; 1762 1763 vcpu = &vm->vcpu[vcpuid]; 1764 1765 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 1766 1767 /* 1768 * Wait until all 'active_cpus' have suspended themselves. 1769 */ 1770 vcpu_lock(vcpu); 1771 vcpu_ustate_change(vm, vcpuid, VU_INIT); 1772 while (1) { 1773 int rc; 1774 1775 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1776 break; 1777 } 1778 1779 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1780 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz, 1781 TR_CLOCK_TICK); 1782 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1783 1784 /* 1785 * If the userspace process driving the instance is killed, any 1786 * vCPUs yet to be marked suspended (because they are not 1787 * VM_RUN-ing in the kernel presently) will never reach that 1788 * state. 1789 * 1790 * To avoid vm_handle_suspend() getting stuck in the kernel 1791 * waiting for those vCPUs, offer a bail-out even though it 1792 * means returning without all vCPUs in a suspended state. 1793 */ 1794 if (rc <= 0) { 1795 if ((curproc->p_flag & SEXITING) != 0) { 1796 break; 1797 } 1798 } 1799 } 1800 vcpu_unlock(vcpu); 1801 1802 /* 1803 * Wakeup the other sleeping vcpus and return to userspace.
1804 */ 1805 for (i = 0; i < vm->maxcpus; i++) { 1806 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1807 vcpu_notify_event(vm, i); 1808 } 1809 } 1810 1811 return (-1); 1812 } 1813 1814 static int 1815 vm_handle_reqidle(struct vm *vm, int vcpuid) 1816 { 1817 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1818 1819 vcpu_lock(vcpu); 1820 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1821 vcpu->reqidle = 0; 1822 vcpu_unlock(vcpu); 1823 return (-1); 1824 } 1825 1826 static int 1827 vm_handle_run_state(struct vm *vm, int vcpuid) 1828 { 1829 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1830 bool handled = false; 1831 1832 vcpu_lock(vcpu); 1833 while (1) { 1834 if ((vcpu->run_state & VRS_PEND_INIT) != 0) { 1835 vcpu_unlock(vcpu); 1836 VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); 1837 vcpu_lock(vcpu); 1838 1839 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); 1840 vcpu->run_state |= VRS_INIT; 1841 } 1842 1843 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == 1844 (VRS_INIT | VRS_PEND_SIPI)) { 1845 const uint8_t vector = vcpu->sipi_vector; 1846 1847 vcpu_unlock(vcpu); 1848 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); 1849 vcpu_lock(vcpu); 1850 1851 vcpu->run_state &= ~VRS_PEND_SIPI; 1852 vcpu->run_state |= VRS_RUN; 1853 } 1854 1855 /* 1856 * If the vCPU is now in the running state, there is no need to 1857 * wait for anything prior to re-entry. 1858 */ 1859 if ((vcpu->run_state & VRS_RUN) != 0) { 1860 handled = true; 1861 break; 1862 } 1863 1864 /* 1865 * Also check for software events which would cause a wake-up. 1866 * This will set the appropriate exitcode directly, rather than 1867 * requiring a trip through VM_RUN(). 1868 */ 1869 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1870 break; 1871 } 1872 1873 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1874 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1875 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1876 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1877 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1878 } 1879 vcpu_unlock(vcpu); 1880 1881 return (handled ? 0 : -1); 1882 } 1883 1884 static int 1885 vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val) 1886 { 1887 switch (num) { 1888 case MSR_MTRRcap: 1889 *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; 1890 break; 1891 case MSR_MTRRdefType: 1892 *val = mtrr->def_type; 1893 break; 1894 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1895 *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; 1896 break; 1897 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1898 *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; 1899 break; 1900 case MSR_MTRR64kBase: 1901 *val = mtrr->fixed64k; 1902 break; 1903 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1904 uint_t offset = num - MSR_MTRRVarBase; 1905 if (offset % 2 == 0) { 1906 *val = mtrr->var[offset / 2].base; 1907 } else { 1908 *val = mtrr->var[offset / 2].mask; 1909 } 1910 break; 1911 } 1912 default: 1913 return (-1); 1914 } 1915 1916 return (0); 1917 } 1918 1919 static int 1920 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val) 1921 { 1922 switch (num) { 1923 case MSR_MTRRcap: 1924 /* MTRRCAP is read only */ 1925 return (-1); 1926 case MSR_MTRRdefType: 1927 if (val & ~VMM_MTRR_DEF_MASK) { 1928 /* generate #GP on writes to reserved fields */ 1929 return (-1); 1930 } 1931 mtrr->def_type = val; 1932 break; 1933 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1934 mtrr->fixed4k[num - MSR_MTRR4kBase] = val; 1935 break; 1936 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 1937 mtrr->fixed16k[num - MSR_MTRR16kBase] = val; 1938 break; 1939 case MSR_MTRR64kBase: 1940 mtrr->fixed64k = val; 1941 break; 1942 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1943 uint_t offset = num - MSR_MTRRVarBase; 1944 if (offset % 2 == 0) { 1945 if (val & ~VMM_MTRR_PHYSBASE_MASK) { 1946 /* generate #GP on writes to reserved fields */ 1947 return (-1); 1948 } 1949 mtrr->var[offset / 2].base = val; 1950 } else { 1951 if (val & ~VMM_MTRR_PHYSMASK_MASK) { 1952 /* generate #GP on writes to reserved fields */ 1953 return (-1); 1954 } 1955 mtrr->var[offset / 2].mask = val; 1956 } 1957 break; 1958 } 1959 default: 1960 return (-1); 1961 } 1962 1963 return (0); 1964 } 1965 1966 static bool 1967 is_mtrr_msr(uint32_t msr) 1968 { 1969 switch (msr) { 1970 case MSR_MTRRcap: 1971 case MSR_MTRRdefType: 1972 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1973 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1974 case MSR_MTRR64kBase: 1975 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 1976 return (true); 1977 default: 1978 return (false); 1979 } 1980 } 1981 1982 static int 1983 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1984 { 1985 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1986 const uint32_t code = vme->u.msr.code; 1987 uint64_t val = 0; 1988 1989 switch (code) { 1990 case MSR_MCG_CAP: 1991 case MSR_MCG_STATUS: 1992 val = 0; 1993 break; 1994 1995 case MSR_MTRRcap: 1996 case MSR_MTRRdefType: 1997 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1998 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1999 case MSR_MTRR64kBase: 2000 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 2001 if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0) 2002 vm_inject_gp(vm, vcpuid); 2003 break; 2004 2005 case MSR_TSC: 2006 /* 2007 * In all likelihood, this should always be handled in guest 2008 * context by VMX/SVM rather than taking an exit. (Both VMX and 2009 * SVM pass through read-only access to MSR_TSC to the guest.) 2010 * 2011 * No physical offset is requested of vcpu_tsc_offset() since 2012 * rdtsc_offset() takes care of that instead. 2013 */ 2014 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); 2015 break; 2016 2017 default: 2018 /* 2019 * Anything not handled at this point will be kicked out to 2020 * userspace for attempted processing there. 2021 */ 2022 return (-1); 2023 } 2024 2025 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2026 val & 0xffffffff)); 2027 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 2028 val >> 32)); 2029 return (0); 2030 } 2031 2032 static int 2033 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 2034 { 2035 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2036 const uint32_t code = vme->u.msr.code; 2037 const uint64_t val = vme->u.msr.wval; 2038 2039 switch (code) { 2040 case MSR_MCG_CAP: 2041 case MSR_MCG_STATUS: 2042 /* Ignore writes */ 2043 break; 2044 2045 case MSR_MTRRcap: 2046 case MSR_MTRRdefType: 2047 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 2048 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 2049 case MSR_MTRR64kBase: 2050 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 2051 if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0) 2052 vm_inject_gp(vm, vcpuid); 2053 break; 2054 2055 case MSR_TSC: 2056 /* 2057 * The effect of writing the TSC MSR is that a subsequent read 2058 * of the TSC would report that value written (plus any time 2059 * elapsed between the write and the read). 
The guest TSC value 2060 * is calculated from a global offset for the guest (which 2061 * effectively makes its TSC read 0 at guest boot) and a 2062 * per-vCPU offset to handle these writes to the MSR. 2063 * 2064 * To calculate that per-vCPU offset, we can work backwards from 2065 * the guest value at the time of write: 2066 * 2067 * value = host TSC + VM boot offset + vCPU offset 2068 * 2069 * so therefore: 2070 * 2071 * value - host TSC - VM boot offset = vCPU offset 2072 */ 2073 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 2074 break; 2075 2076 default: 2077 /* 2078 * Anything not handled at this point will be kicked out to 2079 * userspace for attempted processing there. 2080 */ 2081 return (-1); 2082 } 2083 2084 return (0); 2085 } 2086 2087 int 2088 vm_suspend(struct vm *vm, enum vm_suspend_how how) 2089 { 2090 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 2091 return (EINVAL); 2092 2093 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 2094 return (EALREADY); 2095 } 2096 2097 /* 2098 * Notify all active vcpus that they are now suspended. 2099 */ 2100 for (uint_t i = 0; i < vm->maxcpus; i++) { 2101 struct vcpu *vcpu = &vm->vcpu[i]; 2102 2103 vcpu_lock(vcpu); 2104 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 2105 /* 2106 * Any vCPUs not actively running or in HLT can be 2107 * marked as suspended immediately. 2108 */ 2109 if (CPU_ISSET(i, &vm->active_cpus)) { 2110 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 2111 } 2112 } else { 2113 /* 2114 * Those which are running or in HLT will pick up the 2115 * suspended state after notification. 2116 */ 2117 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2118 } 2119 vcpu_unlock(vcpu); 2120 } 2121 return (0); 2122 } 2123 2124 void 2125 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2126 { 2127 struct vm_exit *vmexit; 2128 2129 vmexit = vm_exitinfo(vm, vcpuid); 2130 vmexit->rip = rip; 2131 vmexit->inst_length = 0; 2132 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2133 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2134 } 2135 2136 /* 2137 * Some vmm resources, such as the lapic, may have CPU-specific resources 2138 * allocated to them which would benefit from migration onto the host CPU which 2139 * is processing the vcpu state. 2140 */ 2141 static void 2142 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2143 { 2144 /* 2145 * Localizing cyclic resources requires acquisition of cpu_lock, and 2146 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2147 */ 2148 VERIFY(curthread->t_preempt == 0); 2149 2150 /* 2151 * Do not bother with localization if this vCPU is about to return to 2152 * the host CPU it was last localized to. 2153 */ 2154 if (vcpu->lastloccpu == curcpu) 2155 return; 2156 2157 /* 2158 * Localize system-wide resources to the primary boot vCPU. While any 2159 * of the other vCPUs may access them, it keeps the potential interrupt 2160 * footprint constrained to CPUs involved with this instance. 
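* The per-vCPU vlapic, by contrast, is localized below for every vCPU.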
2161 */ 2162 if (vcpu == &vm->vcpu[0]) { 2163 vhpet_localize_resources(vm->vhpet); 2164 vrtc_localize_resources(vm->vrtc); 2165 vatpit_localize_resources(vm->vatpit); 2166 } 2167 2168 vlapic_localize_resources(vcpu->vlapic); 2169 2170 vcpu->lastloccpu = curcpu; 2171 } 2172 2173 static void 2174 vmm_savectx(void *arg) 2175 { 2176 vm_thread_ctx_t *vtc = arg; 2177 struct vm *vm = vtc->vtc_vm; 2178 const int vcpuid = vtc->vtc_vcpuid; 2179 2180 if (ops->vmsavectx != NULL) { 2181 ops->vmsavectx(vm->cookie, vcpuid); 2182 } 2183 2184 /* 2185 * Account for going off-cpu, unless the vCPU is idled, where being 2186 * off-cpu is the explicit point. 2187 */ 2188 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2189 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2190 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2191 } 2192 2193 /* 2194 * If the CPU holds the restored guest FPU state, save it and restore 2195 * the host FPU state before this thread goes off-cpu. 2196 */ 2197 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2198 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2199 2200 save_guest_fpustate(vcpu); 2201 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2202 } 2203 } 2204 2205 static void 2206 vmm_restorectx(void *arg) 2207 { 2208 vm_thread_ctx_t *vtc = arg; 2209 struct vm *vm = vtc->vtc_vm; 2210 const int vcpuid = vtc->vtc_vcpuid; 2211 2212 /* Complete microstate accounting for vCPU being off-cpu */ 2213 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2214 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2215 } 2216 2217 /* 2218 * When coming back on-cpu, only restore the guest FPU status if the 2219 * thread is in a context marked as requiring it. This should be rare, 2220 * occurring only when a future logic error results in a voluntary 2221 * sleep during the VMRUN critical section. 2222 * 2223 * The common case will result in elision of the guest FPU state 2224 * restoration, deferring that action until it is clearly necessary 2225 * during vm_run. 
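 *
 * An illustrative sketch of the flag lifecycle across one pass through
 * vm_run() (the code here and in vm_run()/vmm_savectx() is the authority):
 *	vm_run:		restore_guest_fpustate(), VTCS_FPU_RESTORED set,
 *			then VTCS_FPU_CTX_CRITICAL set around VMRUN()
 *	vmm_savectx:	if VTCS_FPU_RESTORED is set, save the guest FPU
 *			and clear the flag
 *	vmm_restorectx:	only while VTCS_FPU_CTX_CRITICAL, restore the
 *			guest FPU and set VTCS_FPU_RESTORED again
 *	vm_run (exit):	VTCS_FPU_CTX_CRITICAL cleared, and the final
 *			vmm_savectx() call saves any restored guest FPU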
2226 */ 2227 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2228 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2229 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2230 2231 restore_guest_fpustate(vcpu); 2232 vtc->vtc_status |= VTCS_FPU_RESTORED; 2233 } 2234 2235 if (ops->vmrestorectx != NULL) { 2236 ops->vmrestorectx(vm->cookie, vcpuid); 2237 } 2238 2239 } 2240 2241 static int 2242 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2243 struct vm_exit *vme) 2244 { 2245 struct vcpu *vcpu; 2246 struct vie *vie; 2247 int err; 2248 2249 vcpu = &vm->vcpu[vcpuid]; 2250 vie = vcpu->vie_ctx; 2251 err = 0; 2252 2253 switch (entry->cmd) { 2254 case VEC_DEFAULT: 2255 return (0); 2256 case VEC_DISCARD_INSTR: 2257 vie_reset(vie); 2258 return (0); 2259 case VEC_FULFILL_MMIO: 2260 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2261 if (err == 0) { 2262 err = vie_emulate_mmio(vie, vm, vcpuid); 2263 if (err == 0) { 2264 vie_advance_pc(vie, &vcpu->nextrip); 2265 } else if (err < 0) { 2266 vie_exitinfo(vie, vme); 2267 } else if (err == EAGAIN) { 2268 /* 2269 * Clear the instruction emulation state in 2270 * order to re-enter VM context and continue 2271 * this 'rep <instruction>' 2272 */ 2273 vie_reset(vie); 2274 err = 0; 2275 } 2276 } 2277 break; 2278 case VEC_FULFILL_INOUT: 2279 err = vie_fulfill_inout(vie, &entry->u.inout); 2280 if (err == 0) { 2281 err = vie_emulate_inout(vie, vm, vcpuid); 2282 if (err == 0) { 2283 vie_advance_pc(vie, &vcpu->nextrip); 2284 } else if (err < 0) { 2285 vie_exitinfo(vie, vme); 2286 } else if (err == EAGAIN) { 2287 /* 2288 * Clear the instruction emulation state in 2289 * order to re-enter VM context and continue 2290 * this 'rep ins/outs' 2291 */ 2292 vie_reset(vie); 2293 err = 0; 2294 } 2295 } 2296 break; 2297 default: 2298 return (EINVAL); 2299 } 2300 return (err); 2301 } 2302 2303 static int 2304 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2305 { 2306 struct vie *vie; 2307 2308 vie = vm->vcpu[vcpuid].vie_ctx; 2309 2310 if (vie_pending(vie)) { 2311 /* 2312 * Userspace has not fulfilled the pending needs of the 2313 * instruction emulation, so bail back out. 2314 */ 2315 vie_exitinfo(vie, vme); 2316 return (-1); 2317 } 2318 2319 return (0); 2320 } 2321 2322 int 2323 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2324 { 2325 int error; 2326 struct vcpu *vcpu; 2327 struct vm_exit *vme; 2328 bool intr_disabled; 2329 int affinity_type = CPU_CURRENT; 2330 2331 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2332 return (EINVAL); 2333 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2334 return (EINVAL); 2335 2336 vcpu = &vm->vcpu[vcpuid]; 2337 vme = &vcpu->exitinfo; 2338 2339 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2340 2341 vcpu->vtc.vtc_status = 0; 2342 ctxop_attach(curthread, vcpu->ctxop); 2343 2344 error = vm_entry_actions(vm, vcpuid, entry, vme); 2345 if (error != 0) { 2346 goto exit; 2347 } 2348 2349 restart: 2350 error = vm_loop_checks(vm, vcpuid, vme); 2351 if (error != 0) { 2352 goto exit; 2353 } 2354 2355 thread_affinity_set(curthread, affinity_type); 2356 /* 2357 * Resource localization should happen after the CPU affinity for the 2358 * thread has been set to ensure that access from restricted contexts, 2359 * such as VMX-accelerated APIC operations, can occur without inducing 2360 * cyclic cross-calls. 2361 * 2362 * This must be done prior to disabling kpreempt via critical_enter(). 
2363 */ 2364 vm_localize_resources(vm, vcpu); 2365 affinity_type = CPU_CURRENT; 2366 critical_enter(); 2367 2368 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2369 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2370 2371 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2372 restore_guest_fpustate(vcpu); 2373 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2374 } 2375 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2376 2377 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2378 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2379 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2380 2381 /* 2382 * Once clear of the delicate contexts comprising the VM_RUN handler, 2383 * thread CPU affinity can be loosened while other processing occurs. 2384 */ 2385 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2386 thread_affinity_clear(curthread); 2387 critical_exit(); 2388 2389 if (error != 0) { 2390 /* Communicate out any error from VMRUN() above */ 2391 goto exit; 2392 } 2393 2394 vcpu->nextrip = vme->rip + vme->inst_length; 2395 switch (vme->exitcode) { 2396 case VM_EXITCODE_REQIDLE: 2397 error = vm_handle_reqidle(vm, vcpuid); 2398 break; 2399 case VM_EXITCODE_RUN_STATE: 2400 error = vm_handle_run_state(vm, vcpuid); 2401 break; 2402 case VM_EXITCODE_SUSPENDED: 2403 error = vm_handle_suspend(vm, vcpuid); 2404 break; 2405 case VM_EXITCODE_IOAPIC_EOI: 2406 vioapic_process_eoi(vm, vcpuid, 2407 vme->u.ioapic_eoi.vector); 2408 break; 2409 case VM_EXITCODE_HLT: 2410 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2411 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2412 break; 2413 case VM_EXITCODE_PAGING: 2414 error = vm_handle_paging(vm, vcpuid); 2415 break; 2416 case VM_EXITCODE_MMIO_EMUL: 2417 error = vm_handle_mmio_emul(vm, vcpuid); 2418 break; 2419 case VM_EXITCODE_INOUT: 2420 error = vm_handle_inout(vm, vcpuid, vme); 2421 break; 2422 case VM_EXITCODE_INST_EMUL: 2423 error = vm_handle_inst_emul(vm, vcpuid); 2424 break; 2425 case VM_EXITCODE_MONITOR: 2426 case VM_EXITCODE_MWAIT: 2427 case VM_EXITCODE_VMINSN: 2428 vm_inject_ud(vm, vcpuid); 2429 break; 2430 case VM_EXITCODE_RDMSR: 2431 error = vm_handle_rdmsr(vm, vcpuid, vme); 2432 break; 2433 case VM_EXITCODE_WRMSR: 2434 error = vm_handle_wrmsr(vm, vcpuid, vme); 2435 break; 2436 case VM_EXITCODE_HT: 2437 affinity_type = CPU_BEST; 2438 break; 2439 case VM_EXITCODE_MTRAP: 2440 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2441 error = -1; 2442 break; 2443 default: 2444 /* handled in userland */ 2445 error = -1; 2446 break; 2447 } 2448 2449 if (error == 0) { 2450 /* VM exit conditions handled in-kernel, continue running */ 2451 goto restart; 2452 } 2453 2454 exit: 2455 kpreempt_disable(); 2456 ctxop_detach(curthread, vcpu->ctxop); 2457 /* Make sure all of the needed vCPU context state is saved */ 2458 vmm_savectx(&vcpu->vtc); 2459 kpreempt_enable(); 2460 2461 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2462 return (error); 2463 } 2464 2465 int 2466 vm_restart_instruction(void *arg, int vcpuid) 2467 { 2468 struct vm *vm; 2469 struct vcpu *vcpu; 2470 enum vcpu_state state; 2471 uint64_t rip; 2472 int error; 2473 2474 vm = arg; 2475 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2476 return (EINVAL); 2477 2478 vcpu = &vm->vcpu[vcpuid]; 2479 state = vcpu_get_state(vm, vcpuid, NULL); 2480 if (state == VCPU_RUNNING) { 2481 /* 2482 * When a vcpu is "running" the next instruction is determined 2483 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
2484 * Thus setting 'inst_length' to zero will cause the current 2485 * instruction to be restarted. 2486 */ 2487 vcpu->exitinfo.inst_length = 0; 2488 } else if (state == VCPU_FROZEN) { 2489 /* 2490 * When a vcpu is "frozen" it is outside the critical section 2491 * around VMRUN() and 'nextrip' points to the next instruction. 2492 * Thus instruction restart is achieved by setting 'nextrip' 2493 * to the vcpu's %rip. 2494 */ 2495 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2496 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2497 vcpu->nextrip = rip; 2498 } else { 2499 panic("%s: invalid state %d", __func__, state); 2500 } 2501 return (0); 2502 } 2503 2504 int 2505 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2506 { 2507 struct vcpu *vcpu; 2508 2509 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2510 return (EINVAL); 2511 2512 vcpu = &vm->vcpu[vcpuid]; 2513 2514 if (VM_INTINFO_PENDING(info)) { 2515 const uint32_t type = VM_INTINFO_TYPE(info); 2516 const uint8_t vector = VM_INTINFO_VECTOR(info); 2517 2518 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2519 return (EINVAL); 2520 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2521 return (EINVAL); 2522 if (info & VM_INTINFO_MASK_RSVD) 2523 return (EINVAL); 2524 } else { 2525 info = 0; 2526 } 2527 vcpu->exit_intinfo = info; 2528 return (0); 2529 } 2530 2531 enum exc_class { 2532 EXC_BENIGN, 2533 EXC_CONTRIBUTORY, 2534 EXC_PAGEFAULT 2535 }; 2536 2537 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2538 2539 static enum exc_class 2540 exception_class(uint64_t info) 2541 { 2542 ASSERT(VM_INTINFO_PENDING(info)); 2543 2544 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2545 switch (VM_INTINFO_TYPE(info)) { 2546 case VM_INTINFO_HWINTR: 2547 case VM_INTINFO_SWINTR: 2548 case VM_INTINFO_NMI: 2549 return (EXC_BENIGN); 2550 default: 2551 /* 2552 * Hardware exception. 2553 * 2554 * SVM and VT-x use identical type values to represent NMI, 2555 * hardware interrupt and software interrupt. 2556 * 2557 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2558 * for exceptions except #BP and #OF. #BP and #OF use a type 2559 * value of '5' or '6'. Therefore we don't check for explicit 2560 * values of 'type' to classify 'intinfo' into a hardware 2561 * exception. 2562 */ 2563 break; 2564 } 2565 2566 switch (VM_INTINFO_VECTOR(info)) { 2567 case IDT_PF: 2568 case IDT_VE: 2569 return (EXC_PAGEFAULT); 2570 case IDT_DE: 2571 case IDT_TS: 2572 case IDT_NP: 2573 case IDT_SS: 2574 case IDT_GP: 2575 return (EXC_CONTRIBUTORY); 2576 default: 2577 return (EXC_BENIGN); 2578 } 2579 } 2580 2581 /* 2582 * Fetch event pending injection into the guest, if one exists. 2583 * 2584 * Returns true if an event is to be injected (which is placed in `retinfo`). 2585 */ 2586 bool 2587 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2588 { 2589 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2590 const uint64_t info1 = vcpu->exit_intinfo; 2591 vcpu->exit_intinfo = 0; 2592 const uint64_t info2 = vcpu->exc_pending; 2593 vcpu->exc_pending = 0; 2594 2595 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2596 /* 2597 * If an exception occurs while attempting to call the 2598 * double-fault handler the processor enters shutdown mode 2599 * (aka triple fault). 
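 *
 * A few illustrative outcomes of the checks below (not exhaustive):
 *	info1 = #DF, info2 = any exception -> triple fault, VM suspended
 *	info1 = #PF, info2 = #GP           -> merged into a single #DF
 *	info1 = #GP, info2 = #PF           -> handled serially: the #PF is
 *	                                      delivered now and the #GP is
 *	                                      re-queued in exit_intinfo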
2600 */
2601 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2602 VM_INTINFO_VECTOR(info1) == IDT_DF) {
2603 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2604 *retinfo = 0;
2605 return (false);
2606 }
2607 /*
2608 * "Conditions for Generating a Double Fault"
2609 * Intel SDM, Vol3, Table 6-5
2610 */
2611 const enum exc_class exc1 = exception_class(info1);
2612 const enum exc_class exc2 = exception_class(info2);
2613 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2614 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2615 /* Convert nested fault into a double fault. */
2616 *retinfo =
2617 VM_INTINFO_VALID |
2618 VM_INTINFO_DEL_ERRCODE |
2619 VM_INTINFO_HWEXCP |
2620 IDT_DF;
2621 } else {
2622 /* Handle exceptions serially */
2623 vcpu->exit_intinfo = info1;
2624 *retinfo = info2;
2625 }
2626 return (true);
2627 } else if (VM_INTINFO_PENDING(info1)) {
2628 *retinfo = info1;
2629 return (true);
2630 } else if (VM_INTINFO_PENDING(info2)) {
2631 *retinfo = info2;
2632 return (true);
2633 }
2634
2635 return (false);
2636 }
2637
2638 int
2639 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2640 {
2641 struct vcpu *vcpu;
2642
2643 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2644 return (EINVAL);
2645
2646 vcpu = &vm->vcpu[vcpuid];
2647 *info1 = vcpu->exit_intinfo;
2648 *info2 = vcpu->exc_pending;
2649 return (0);
2650 }
2651
2652 int
2653 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2654 bool errcode_valid, uint32_t errcode, bool restart_instruction)
2655 {
2656 struct vcpu *vcpu;
2657 uint64_t regval;
2658 int error;
2659
2660 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2661 return (EINVAL);
2662
2663 if (vector >= 32)
2664 return (EINVAL);
2665
2666 /*
2667 * NMIs are to be injected via their own specialized path using
2668 * vm_inject_nmi().
2669 */
2670 if (vector == IDT_NMI) {
2671 return (EINVAL);
2672 }
2673
2674 /*
2675 * A double fault exception should never be injected directly into
2676 * the guest. It is a derived exception that results from specific
2677 * combinations of nested faults.
2678 */
2679 if (vector == IDT_DF) {
2680 return (EINVAL);
2681 }
2682
2683 vcpu = &vm->vcpu[vcpuid];
2684
2685 if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2686 /* Unable to inject exception due to one already pending */
2687 return (EBUSY);
2688 }
2689
2690 if (errcode_valid) {
2691 /*
2692 * Exceptions don't deliver an error code in real mode.
2693 */
2694 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2695 VERIFY0(error);
2696 if ((regval & CR0_PE) == 0) {
2697 errcode_valid = false;
2698 }
2699 }
2700
2701 /*
2702 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2703 *
2704 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2705 * one instruction or incurs an exception.
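 *
 * (Illustrative aside: the pending-exception value assembled further
 * below would, for a protected-mode #GP with error code 0, look like
 *	VM_INTINFO_VALID | VM_INTINFO_HWEXCP | IDT_GP |
 *	    VM_INTINFO_DEL_ERRCODE | ((uint64_t)0 << VM_INTINFO_SHIFT_ERRCODE)
 * per the encoding used in this function.)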
2706 */ 2707 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2708 VERIFY0(error); 2709 2710 if (restart_instruction) { 2711 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2712 } 2713 2714 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2715 if (errcode_valid) { 2716 val |= VM_INTINFO_DEL_ERRCODE; 2717 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2718 } 2719 vcpu->exc_pending = val; 2720 return (0); 2721 } 2722 2723 void 2724 vm_inject_ud(struct vm *vm, int vcpuid) 2725 { 2726 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2727 } 2728 2729 void 2730 vm_inject_gp(struct vm *vm, int vcpuid) 2731 { 2732 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2733 } 2734 2735 void 2736 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2737 { 2738 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2739 } 2740 2741 void 2742 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2743 { 2744 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2745 } 2746 2747 void 2748 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2749 { 2750 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2751 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2752 } 2753 2754 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2755 2756 int 2757 vm_inject_nmi(struct vm *vm, int vcpuid) 2758 { 2759 struct vcpu *vcpu; 2760 2761 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2762 return (EINVAL); 2763 2764 vcpu = &vm->vcpu[vcpuid]; 2765 2766 vcpu->nmi_pending = true; 2767 vcpu_notify_event(vm, vcpuid); 2768 return (0); 2769 } 2770 2771 bool 2772 vm_nmi_pending(struct vm *vm, int vcpuid) 2773 { 2774 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2775 2776 return (vcpu->nmi_pending); 2777 } 2778 2779 void 2780 vm_nmi_clear(struct vm *vm, int vcpuid) 2781 { 2782 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2783 2784 ASSERT(vcpu->nmi_pending); 2785 2786 vcpu->nmi_pending = false; 2787 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2788 } 2789 2790 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2791 2792 int 2793 vm_inject_extint(struct vm *vm, int vcpuid) 2794 { 2795 struct vcpu *vcpu; 2796 2797 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2798 return (EINVAL); 2799 2800 vcpu = &vm->vcpu[vcpuid]; 2801 2802 vcpu->extint_pending = true; 2803 vcpu_notify_event(vm, vcpuid); 2804 return (0); 2805 } 2806 2807 bool 2808 vm_extint_pending(struct vm *vm, int vcpuid) 2809 { 2810 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2811 2812 return (vcpu->extint_pending); 2813 } 2814 2815 void 2816 vm_extint_clear(struct vm *vm, int vcpuid) 2817 { 2818 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2819 2820 ASSERT(vcpu->extint_pending); 2821 2822 vcpu->extint_pending = false; 2823 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2824 } 2825 2826 int 2827 vm_inject_init(struct vm *vm, int vcpuid) 2828 { 2829 struct vcpu *vcpu; 2830 2831 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2832 return (EINVAL); 2833 2834 vcpu = &vm->vcpu[vcpuid]; 2835 vcpu_lock(vcpu); 2836 vcpu->run_state |= VRS_PEND_INIT; 2837 /* 2838 * As part of queuing the INIT request, clear any pending SIPI. It 2839 * would not otherwise survive across the reset of the vCPU when it 2840 * undergoes the requested INIT. We would not want it to linger when it 2841 * could be mistaken as a subsequent (after the INIT) SIPI request. 
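 *
 * Sketch of the expected INIT/SIPI startup flow (illustrative only; the
 * actual state transitions are applied via vm_handle_run_state()):
 *	vm_inject_init():	VRS_PEND_INIT set, any VRS_PEND_SIPI cleared
 *	INIT processed:		vCPU reset and left waiting in VRS_INIT
 *	vm_inject_sipi(vec):	VRS_PEND_SIPI set, sipi_vector = vec
 *	SIPI processed:		vcpu_vector_sipi() starts the vCPU with
 *				%cs = vec << 8 (base vec << 12) and %rip = 0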
2842 */
2843 vcpu->run_state &= ~VRS_PEND_SIPI;
2844 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2845
2846 vcpu_unlock(vcpu);
2847 return (0);
2848 }
2849
2850 int
2851 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2852 {
2853 struct vcpu *vcpu;
2854
2855 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2856 return (EINVAL);
2857
2858 vcpu = &vm->vcpu[vcpuid];
2859 vcpu_lock(vcpu);
2860 vcpu->run_state |= VRS_PEND_SIPI;
2861 vcpu->sipi_vector = vector;
2862 /* SIPI is only actionable if the CPU is waiting in INIT state */
2863 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2864 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2865 }
2866 vcpu_unlock(vcpu);
2867 return (0);
2868 }
2869
2870 bool
2871 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2872 {
2873 struct vcpu *vcpu;
2874
2875 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2876 vcpu = &vm->vcpu[vcpuid];
2877
2878 /* Of interest: vCPU not in running state or with pending INIT */
2879 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2880 }
2881
2882 int
2883 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2884 {
2885 struct seg_desc desc;
2886 const enum vm_reg_name clear_regs[] = {
2887 VM_REG_GUEST_CR2,
2888 VM_REG_GUEST_CR3,
2889 VM_REG_GUEST_CR4,
2890 VM_REG_GUEST_RAX,
2891 VM_REG_GUEST_RBX,
2892 VM_REG_GUEST_RCX,
2893 VM_REG_GUEST_RSI,
2894 VM_REG_GUEST_RDI,
2895 VM_REG_GUEST_RBP,
2896 VM_REG_GUEST_RSP,
2897 VM_REG_GUEST_R8,
2898 VM_REG_GUEST_R9,
2899 VM_REG_GUEST_R10,
2900 VM_REG_GUEST_R11,
2901 VM_REG_GUEST_R12,
2902 VM_REG_GUEST_R13,
2903 VM_REG_GUEST_R14,
2904 VM_REG_GUEST_R15,
2905 VM_REG_GUEST_DR0,
2906 VM_REG_GUEST_DR1,
2907 VM_REG_GUEST_DR2,
2908 VM_REG_GUEST_DR3,
2909 VM_REG_GUEST_EFER,
2910 };
2911 const enum vm_reg_name data_segs[] = {
2912 VM_REG_GUEST_SS,
2913 VM_REG_GUEST_DS,
2914 VM_REG_GUEST_ES,
2915 VM_REG_GUEST_FS,
2916 VM_REG_GUEST_GS,
2917 };
2918 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2919
2920 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2921 return (EINVAL);
2922
2923 for (uint_t i = 0; i < nitems(clear_regs); i++) {
2924 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2925 }
2926
2927 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2928 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2929 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2930
2931 /*
2932 * The prescribed contents of %rdx differ slightly between the Intel and
2933 * AMD architectural definitions. The former expects the Extended Model
2934 * in bits 16-19 where the latter expects all the Family, Model, and
2935 * Stepping to be there. Common boot ROMs appear to disregard this
2936 * anyway, so we stick with a compromise value similar to what is
2937 * spelled out in the Intel SDM.
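 * (The 0x600 written below decodes as Family 6, Model 0, Stepping 0
 * under the conventional Family/Model/Stepping layout of bits 11:0.)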
2938 */
2939 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2940
2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2942 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2943
2944 /* CS: Present, R/W, Accessed */
2945 desc.access = 0x0093;
2946 desc.base = 0xffff0000;
2947 desc.limit = 0xffff;
2948 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2949 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2950
2951 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2952 desc.access = 0x0093;
2953 desc.base = 0;
2954 desc.limit = 0xffff;
2955 for (uint_t i = 0; i < nitems(data_segs); i++) {
2956 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2957 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2958 }
2959
2960 /* GDTR, IDTR */
2961 desc.base = 0;
2962 desc.limit = 0xffff;
2963 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2964 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2965
2966 /* LDTR: Present, LDT */
2967 desc.access = 0x0082;
2968 desc.base = 0;
2969 desc.limit = 0xffff;
2970 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2971 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2972
2973 /* TR: Present, 32-bit TSS */
2974 desc.access = 0x008b;
2975 desc.base = 0;
2976 desc.limit = 0xffff;
2977 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2978 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2979
2980 vlapic_reset(vm_lapic(vm, vcpuid));
2981
2982 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2983
2984 vcpu->exit_intinfo = 0;
2985 vcpu->exc_pending = 0;
2986 vcpu->nmi_pending = false;
2987 vcpu->extint_pending = 0;
2988
2989 /*
2990 * A CPU reset caused by power-on or system reset clears more state than
2991 * one which is triggered from an INIT IPI.
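 * For example, in the code below %xcr0, the guest FPU contents, and the
 * MTRR state are only reinitialized on a full reset; an INIT-triggered
 * reset leaves them untouched.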
2992 */ 2993 if (!init_only) { 2994 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2995 (void) hma_fpu_init(vcpu->guestfpu); 2996 2997 /* XXX: clear MSRs and other pieces */ 2998 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 2999 } 3000 3001 return (0); 3002 } 3003 3004 static int 3005 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3006 { 3007 struct seg_desc desc; 3008 3009 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3010 return (EINVAL); 3011 3012 /* CS: Present, R/W, Accessed */ 3013 desc.access = 0x0093; 3014 desc.base = (uint64_t)vector << 12; 3015 desc.limit = 0xffff; 3016 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3017 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3018 (uint64_t)vector << 8)); 3019 3020 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3021 3022 return (0); 3023 } 3024 3025 int 3026 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3027 { 3028 if (vcpu < 0 || vcpu >= vm->maxcpus) 3029 return (EINVAL); 3030 3031 if (type < 0 || type >= VM_CAP_MAX) 3032 return (EINVAL); 3033 3034 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3035 } 3036 3037 int 3038 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3039 { 3040 if (vcpu < 0 || vcpu >= vm->maxcpus) 3041 return (EINVAL); 3042 3043 if (type < 0 || type >= VM_CAP_MAX) 3044 return (EINVAL); 3045 3046 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3047 } 3048 3049 vcpu_cpuid_config_t * 3050 vm_cpuid_config(struct vm *vm, int vcpuid) 3051 { 3052 ASSERT3S(vcpuid, >=, 0); 3053 ASSERT3S(vcpuid, <, VM_MAXCPU); 3054 3055 return (&vm->vcpu[vcpuid].cpuid_cfg); 3056 } 3057 3058 struct vlapic * 3059 vm_lapic(struct vm *vm, int cpu) 3060 { 3061 ASSERT3S(cpu, >=, 0); 3062 ASSERT3S(cpu, <, VM_MAXCPU); 3063 3064 return (vm->vcpu[cpu].vlapic); 3065 } 3066 3067 struct vioapic * 3068 vm_ioapic(struct vm *vm) 3069 { 3070 3071 return (vm->vioapic); 3072 } 3073 3074 struct vhpet * 3075 vm_hpet(struct vm *vm) 3076 { 3077 3078 return (vm->vhpet); 3079 } 3080 3081 void * 3082 vm_iommu_domain(struct vm *vm) 3083 { 3084 3085 return (vm->iommu); 3086 } 3087 3088 int 3089 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3090 bool from_idle) 3091 { 3092 int error; 3093 struct vcpu *vcpu; 3094 3095 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3096 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3097 3098 vcpu = &vm->vcpu[vcpuid]; 3099 3100 vcpu_lock(vcpu); 3101 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3102 vcpu_unlock(vcpu); 3103 3104 return (error); 3105 } 3106 3107 enum vcpu_state 3108 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3109 { 3110 struct vcpu *vcpu; 3111 enum vcpu_state state; 3112 3113 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3114 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3115 3116 vcpu = &vm->vcpu[vcpuid]; 3117 3118 vcpu_lock(vcpu); 3119 state = vcpu->state; 3120 if (hostcpu != NULL) 3121 *hostcpu = vcpu->hostcpu; 3122 vcpu_unlock(vcpu); 3123 3124 return (state); 3125 } 3126 3127 uint64_t 3128 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3129 { 3130 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3131 3132 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3133 3134 if (phys_adj) { 3135 /* Include any offset for the current physical CPU too */ 3136 extern hrtime_t tsc_gethrtime_tick_delta(void); 3137 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3138 } 3139 3140 return (vcpu_off); 3141 } 3142 3143 /* Normalize hrtime against the boot time for a VM */ 3144 hrtime_t 3145 
vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3146 { 3147 /* To avoid underflow/overflow UB, perform math as unsigned */ 3148 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3149 } 3150 3151 /* Denormalize hrtime against the boot time for a VM */ 3152 hrtime_t 3153 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3154 { 3155 /* To avoid underflow/overflow UB, perform math as unsigned */ 3156 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3157 } 3158 3159 int 3160 vm_activate_cpu(struct vm *vm, int vcpuid) 3161 { 3162 3163 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3164 return (EINVAL); 3165 3166 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3167 return (EBUSY); 3168 3169 if (vm->suspend != 0) { 3170 return (EBUSY); 3171 } 3172 3173 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3174 3175 /* 3176 * It is possible that this vCPU was undergoing activation at the same 3177 * time that the VM was being suspended. If that happens to be the 3178 * case, it should reflect the suspended state immediately. 3179 */ 3180 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3181 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3182 } 3183 3184 return (0); 3185 } 3186 3187 int 3188 vm_suspend_cpu(struct vm *vm, int vcpuid) 3189 { 3190 int i; 3191 3192 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3193 return (EINVAL); 3194 3195 if (vcpuid == -1) { 3196 vm->debug_cpus = vm->active_cpus; 3197 for (i = 0; i < vm->maxcpus; i++) { 3198 if (CPU_ISSET(i, &vm->active_cpus)) 3199 vcpu_notify_event(vm, i); 3200 } 3201 } else { 3202 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3203 return (EINVAL); 3204 3205 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3206 vcpu_notify_event(vm, vcpuid); 3207 } 3208 return (0); 3209 } 3210 3211 int 3212 vm_resume_cpu(struct vm *vm, int vcpuid) 3213 { 3214 3215 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3216 return (EINVAL); 3217 3218 if (vcpuid == -1) { 3219 CPU_ZERO(&vm->debug_cpus); 3220 } else { 3221 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3222 return (EINVAL); 3223 3224 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3225 } 3226 return (0); 3227 } 3228 3229 static bool 3230 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3231 uint64_t entry_rip) 3232 { 3233 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3234 struct vm_exit *vme = &vcpu->exitinfo; 3235 bool bail = false; 3236 3237 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3238 3239 if (vm->suspend) { 3240 if (on_entry) { 3241 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3242 vm->suspend < VM_SUSPEND_LAST); 3243 3244 vme->exitcode = VM_EXITCODE_SUSPENDED; 3245 vme->u.suspended.how = vm->suspend; 3246 } else { 3247 /* 3248 * Handling VM suspend is complicated, so if that 3249 * condition is detected outside of VM-entry itself, 3250 * just emit a BOGUS exitcode so we take a lap to pick 3251 * up the event during an entry and are directed into 3252 * the vm_handle_suspend() logic. 3253 */ 3254 vme->exitcode = VM_EXITCODE_BOGUS; 3255 } 3256 bail = true; 3257 } 3258 if (vcpu->reqidle) { 3259 vme->exitcode = VM_EXITCODE_REQIDLE; 3260 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3261 3262 if (!on_entry) { 3263 /* 3264 * A reqidle request detected outside of VM-entry can be 3265 * handled directly by clearing the request (and taking 3266 * a lap to userspace). 
3267 */
3268 vcpu_assert_locked(vcpu);
3269 vcpu->reqidle = 0;
3270 }
3271 bail = true;
3272 }
3273 if (vcpu_should_yield(vm, vcpuid)) {
3274 vme->exitcode = VM_EXITCODE_BOGUS;
3275 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3276 bail = true;
3277 }
3278 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3279 vme->exitcode = VM_EXITCODE_DEBUG;
3280 bail = true;
3281 }
3282
3283 if (bail) {
3284 if (on_entry) {
3285 /*
3286 * If bailing out during VM-entry, the current %rip must
3287 * be recorded in the exitinfo.
3288 */
3289 vme->rip = entry_rip;
3290 }
3291 vme->inst_length = 0;
3292 }
3293 return (bail);
3294 }
3295
3296 static bool
3297 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3298 {
3299 /*
3300 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3301 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3302 * structure, and we would only modify the exitcode.
3303 */
3304 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3305 }
3306
3307 bool
3308 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3309 {
3310 /*
3311 * Bail-out checks done as part of VM entry require an updated %rip to
3312 * populate the vm_exit struct if any of the conditions of interest are
3313 * matched in the check.
3314 */
3315 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3316 }
3317
3318 cpuset_t
3319 vm_active_cpus(struct vm *vm)
3320 {
3321
3322 return (vm->active_cpus);
3323 }
3324
3325 cpuset_t
3326 vm_debug_cpus(struct vm *vm)
3327 {
3328
3329 return (vm->debug_cpus);
3330 }
3331
3332 cpuset_t
3333 vm_suspended_cpus(struct vm *vm)
3334 {
3335
3336 return (vm->suspended_cpus);
3337 }
3338
3339 void *
3340 vcpu_stats(struct vm *vm, int vcpuid)
3341 {
3342
3343 return (vm->vcpu[vcpuid].stats);
3344 }
3345
3346 int
3347 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3348 {
3349 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3350 return (EINVAL);
3351
3352 *state = vm->vcpu[vcpuid].x2apic_state;
3353
3354 return (0);
3355 }
3356
3357 int
3358 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3359 {
3360 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3361 return (EINVAL);
3362
3363 if (state >= X2APIC_STATE_LAST)
3364 return (EINVAL);
3365
3366 vm->vcpu[vcpuid].x2apic_state = state;
3367
3368 vlapic_set_x2apic_state(vm, vcpuid, state);
3369
3370 return (0);
3371 }
3372
3373 /*
3374 * This function is called to ensure that a vcpu "sees" a pending event
3375 * as soon as possible:
3376 * - If the vcpu thread is sleeping then it is woken up.
3377 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3378 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3379 */
3380 static void
3381 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3382 {
3383 int hostcpu;
3384
3385 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3386
3387 hostcpu = vcpu->hostcpu;
3388 if (vcpu->state == VCPU_RUNNING) {
3389 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3390 if (hostcpu != curcpu) {
3391 if (ntype == VCPU_NOTIFY_APIC) {
3392 vlapic_post_intr(vcpu->vlapic, hostcpu);
3393 } else {
3394 poke_cpu(hostcpu);
3395 }
3396 } else {
3397 /*
3398 * If the 'vcpu' is running on 'curcpu' then it must
3399 * be sending a notification to itself (e.g. SELF_IPI).
3400 * The pending event will be picked up when the vcpu
3401 * transitions back to guest context.
3402 */ 3403 } 3404 } else { 3405 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3406 "with hostcpu %d", vcpu->state, hostcpu)); 3407 if (vcpu->state == VCPU_SLEEPING) { 3408 cv_signal(&vcpu->vcpu_cv); 3409 } 3410 } 3411 } 3412 3413 void 3414 vcpu_notify_event(struct vm *vm, int vcpuid) 3415 { 3416 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3417 3418 vcpu_lock(vcpu); 3419 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3420 vcpu_unlock(vcpu); 3421 } 3422 3423 void 3424 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3425 { 3426 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3427 3428 if (ntype == VCPU_NOTIFY_NONE) { 3429 return; 3430 } 3431 3432 vcpu_lock(vcpu); 3433 vcpu_notify_event_locked(vcpu, ntype); 3434 vcpu_unlock(vcpu); 3435 } 3436 3437 void 3438 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3439 { 3440 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3441 hrtime_t now = gethrtime(); 3442 3443 ASSERT3U(ustate, !=, vcpu->ustate); 3444 ASSERT3S(ustate, <, VU_MAX); 3445 ASSERT3S(ustate, >=, VU_INIT); 3446 3447 hrtime_t delta = now - vcpu->ustate_when; 3448 vcpu->ustate_total[vcpu->ustate] += delta; 3449 3450 membar_producer(); 3451 3452 vcpu->ustate_when = now; 3453 vcpu->ustate = ustate; 3454 } 3455 3456 struct vmspace * 3457 vm_get_vmspace(struct vm *vm) 3458 { 3459 3460 return (vm->vmspace); 3461 } 3462 3463 struct vm_client * 3464 vm_get_vmclient(struct vm *vm, int vcpuid) 3465 { 3466 return (vm->vcpu[vcpuid].vmclient); 3467 } 3468 3469 int 3470 vm_apicid2vcpuid(struct vm *vm, int apicid) 3471 { 3472 /* 3473 * XXX apic id is assumed to be numerically identical to vcpu id 3474 */ 3475 return (apicid); 3476 } 3477 3478 struct vatpic * 3479 vm_atpic(struct vm *vm) 3480 { 3481 return (vm->vatpic); 3482 } 3483 3484 struct vatpit * 3485 vm_atpit(struct vm *vm) 3486 { 3487 return (vm->vatpit); 3488 } 3489 3490 struct vpmtmr * 3491 vm_pmtmr(struct vm *vm) 3492 { 3493 3494 return (vm->vpmtmr); 3495 } 3496 3497 struct vrtc * 3498 vm_rtc(struct vm *vm) 3499 { 3500 3501 return (vm->vrtc); 3502 } 3503 3504 enum vm_reg_name 3505 vm_segment_name(int seg) 3506 { 3507 static enum vm_reg_name seg_names[] = { 3508 VM_REG_GUEST_ES, 3509 VM_REG_GUEST_CS, 3510 VM_REG_GUEST_SS, 3511 VM_REG_GUEST_DS, 3512 VM_REG_GUEST_FS, 3513 VM_REG_GUEST_GS 3514 }; 3515 3516 KASSERT(seg >= 0 && seg < nitems(seg_names), 3517 ("%s: invalid segment encoding %d", __func__, seg)); 3518 return (seg_names[seg]); 3519 } 3520 3521 void 3522 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3523 uint_t num_copyinfo) 3524 { 3525 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3526 if (copyinfo[idx].cookie != NULL) { 3527 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3528 } 3529 } 3530 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3531 } 3532 3533 int 3534 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3535 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3536 uint_t num_copyinfo, int *fault) 3537 { 3538 uint_t idx, nused; 3539 size_t n, off, remaining; 3540 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3541 3542 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3543 3544 nused = 0; 3545 remaining = len; 3546 while (remaining > 0) { 3547 uint64_t gpa; 3548 int error; 3549 3550 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3551 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3552 if (error || *fault) 3553 return (error); 3554 off = gpa & PAGEOFFSET; 3555 n 
= min(remaining, PAGESIZE - off); 3556 copyinfo[nused].gpa = gpa; 3557 copyinfo[nused].len = n; 3558 remaining -= n; 3559 gla += n; 3560 nused++; 3561 } 3562 3563 for (idx = 0; idx < nused; idx++) { 3564 vm_page_t *vmp; 3565 caddr_t hva; 3566 3567 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3568 if (vmp == NULL) { 3569 break; 3570 } 3571 if ((prot & PROT_WRITE) != 0) { 3572 hva = (caddr_t)vmp_get_writable(vmp); 3573 } else { 3574 hva = (caddr_t)vmp_get_readable(vmp); 3575 } 3576 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3577 copyinfo[idx].cookie = vmp; 3578 copyinfo[idx].prot = prot; 3579 } 3580 3581 if (idx != nused) { 3582 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3583 return (EFAULT); 3584 } else { 3585 *fault = 0; 3586 return (0); 3587 } 3588 } 3589 3590 void 3591 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3592 size_t len) 3593 { 3594 char *dst; 3595 int idx; 3596 3597 dst = kaddr; 3598 idx = 0; 3599 while (len > 0) { 3600 ASSERT(copyinfo[idx].prot & PROT_READ); 3601 3602 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3603 len -= copyinfo[idx].len; 3604 dst += copyinfo[idx].len; 3605 idx++; 3606 } 3607 } 3608 3609 void 3610 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3611 struct vm_copyinfo *copyinfo, size_t len) 3612 { 3613 const char *src; 3614 int idx; 3615 3616 src = kaddr; 3617 idx = 0; 3618 while (len > 0) { 3619 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3620 3621 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3622 len -= copyinfo[idx].len; 3623 src += copyinfo[idx].len; 3624 idx++; 3625 } 3626 } 3627 3628 /* 3629 * Return the amount of in-use and wired memory for the VM. Since 3630 * these are global stats, only return the values with for vCPU 0 3631 */ 3632 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3633 3634 static void 3635 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3636 { 3637 if (vcpu == 0) { 3638 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3639 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3640 } 3641 } 3642 3643 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3644 3645 int 3646 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3647 uint8_t bytes, uint32_t *val) 3648 { 3649 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3650 } 3651 3652 /* 3653 * bhyve-internal interfaces to attach or detach IO port handlers. 3654 * Must be called with VM write lock held for safety. 3655 */ 3656 int 3657 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3658 void **cookie) 3659 { 3660 int err; 3661 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3662 if (err == 0) { 3663 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3664 } 3665 return (err); 3666 } 3667 int 3668 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3669 void **old_arg) 3670 { 3671 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3672 int err; 3673 3674 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3675 if (err == 0) { 3676 *cookie = NULL; 3677 } 3678 return (err); 3679 } 3680 3681 /* 3682 * External driver interfaces to attach or detach IO port handlers. 3683 * Must be called with VM write lock held for safety. 
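 *
 * A minimal usage sketch (hypothetical driver, hypothetical port 0x511):
 *	void *cookie;
 *	if (vm_ioport_hook(vm, 0x511, my_handler, my_arg, &cookie) == 0) {
 *		... port 0x511 is now serviced by my_handler() ...
 *		vm_ioport_unhook(vm, &cookie);
 *	}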
3684 */ 3685 int 3686 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3687 void *arg, void **cookie) 3688 { 3689 int err; 3690 3691 if (port == 0) { 3692 return (EINVAL); 3693 } 3694 3695 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3696 if (err == 0) { 3697 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3698 } 3699 return (err); 3700 } 3701 void 3702 vm_ioport_unhook(struct vm *vm, void **cookie) 3703 { 3704 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3705 ioport_handler_t old_func; 3706 void *old_arg; 3707 int err; 3708 3709 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3710 3711 /* ioport-hook-using drivers are expected to be well-behaved */ 3712 VERIFY0(err); 3713 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3714 3715 *cookie = NULL; 3716 } 3717 3718 int 3719 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3720 { 3721 struct vm *vm = ksp->ks_private; 3722 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3723 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3724 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3725 3726 ASSERT3U(vcpuid, <, VM_MAXCPU); 3727 3728 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3729 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3730 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3731 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3732 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3733 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3734 3735 return (0); 3736 } 3737 3738 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3739 3740 static inline bool 3741 vmm_data_is_cpu_specific(uint16_t data_class) 3742 { 3743 switch (data_class) { 3744 case VDC_REGISTER: 3745 case VDC_MSR: 3746 case VDC_FPU: 3747 case VDC_LAPIC: 3748 return (true); 3749 default: 3750 return (false); 3751 } 3752 } 3753 3754 static int 3755 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) 3756 { 3757 const vmm_data_version_entry_t **vdpp, *vdp; 3758 3759 ASSERT(resp != NULL); 3760 ASSERT(req->vdr_result_len != NULL); 3761 3762 SET_FOREACH(vdpp, vmm_data_version_entries) { 3763 vdp = *vdpp; 3764 if (vdp->vdve_class == req->vdr_class && 3765 vdp->vdve_version == req->vdr_version) { 3766 /* 3767 * Enforce any data length expectation expressed by the 3768 * provider for this data. 3769 */ 3770 if (vdp->vdve_len_expect != 0 && 3771 vdp->vdve_len_expect > req->vdr_len) { 3772 *req->vdr_result_len = vdp->vdve_len_expect; 3773 return (ENOSPC); 3774 } 3775 *resp = vdp; 3776 return (0); 3777 } 3778 } 3779 return (EINVAL); 3780 } 3781 3782 static void * 3783 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3784 { 3785 switch (req->vdr_class) { 3786 /* per-cpu data/devices */ 3787 case VDC_LAPIC: 3788 return (vm_lapic(vm, vcpuid)); 3789 case VDC_VMM_ARCH: 3790 return (vm); 3791 3792 case VDC_FPU: 3793 case VDC_REGISTER: 3794 case VDC_MSR: 3795 /* 3796 * These have per-CPU handling which is dispatched outside 3797 * vmm_data_version_entries listing. 
3798 */ 3799 return (NULL); 3800 3801 /* system-wide data/devices */ 3802 case VDC_IOAPIC: 3803 return (vm->vioapic); 3804 case VDC_ATPIT: 3805 return (vm->vatpit); 3806 case VDC_ATPIC: 3807 return (vm->vatpic); 3808 case VDC_HPET: 3809 return (vm->vhpet); 3810 case VDC_PM_TIMER: 3811 return (vm->vpmtmr); 3812 case VDC_RTC: 3813 return (vm->vrtc); 3814 3815 default: 3816 /* The data class will have been validated by now */ 3817 panic("Unexpected class %u", req->vdr_class); 3818 } 3819 } 3820 3821 const uint32_t arch_msr_iter[] = { 3822 MSR_EFER, 3823 3824 /* 3825 * While gsbase and fsbase are accessible via the MSR accessors, they 3826 * are not included in MSR iteration since they are covered by the 3827 * segment descriptor interface too. 3828 */ 3829 MSR_KGSBASE, 3830 3831 MSR_STAR, 3832 MSR_LSTAR, 3833 MSR_CSTAR, 3834 MSR_SF_MASK, 3835 3836 MSR_SYSENTER_CS_MSR, 3837 MSR_SYSENTER_ESP_MSR, 3838 MSR_SYSENTER_EIP_MSR, 3839 MSR_PAT, 3840 }; 3841 const uint32_t generic_msr_iter[] = { 3842 MSR_TSC, 3843 MSR_MTRRcap, 3844 MSR_MTRRdefType, 3845 3846 MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, 3847 MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, 3848 MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, 3849 3850 MSR_MTRR16kBase, MSR_MTRR16kBase + 1, 3851 3852 MSR_MTRR64kBase, 3853 }; 3854 3855 static int 3856 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3857 { 3858 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3859 VERIFY3U(req->vdr_version, ==, 1); 3860 3861 const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) 3862 + (VMM_MTRR_VAR_MAX * 2); 3863 const uint32_t output_len = 3864 num_msrs * sizeof (struct vdi_field_entry_v1); 3865 *req->vdr_result_len = output_len; 3866 3867 if (req->vdr_len < output_len) { 3868 return (ENOSPC); 3869 } 3870 3871 struct vdi_field_entry_v1 *entryp = req->vdr_data; 3872 for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { 3873 const uint32_t msr = arch_msr_iter[i]; 3874 uint64_t val = 0; 3875 3876 int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); 3877 /* All of these MSRs are expected to work */ 3878 VERIFY0(err); 3879 entryp->vfe_ident = msr; 3880 entryp->vfe_value = val; 3881 } 3882 3883 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3884 for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { 3885 const uint32_t msr = generic_msr_iter[i]; 3886 3887 entryp->vfe_ident = msr; 3888 switch (msr) { 3889 case MSR_TSC: 3890 /* 3891 * Communicate this as the difference from the VM-wide 3892 * offset of the boot time. 3893 */ 3894 entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; 3895 break; 3896 case MSR_MTRRcap: 3897 case MSR_MTRRdefType: 3898 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 3899 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 3900 case MSR_MTRR64kBase: { 3901 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3902 VERIFY0(err); 3903 break; 3904 } 3905 default: 3906 panic("unexpected msr export %x", msr); 3907 } 3908 } 3909 /* Copy the variable MTRRs */ 3910 for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { 3911 const uint32_t msr = MSR_MTRRVarBase + i; 3912 3913 entryp->vfe_ident = msr; 3914 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3915 VERIFY0(err); 3916 } 3917 return (0); 3918 } 3919 3920 static int 3921 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3922 { 3923 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3924 VERIFY3U(req->vdr_version, ==, 1); 3925 3926 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 3927 const uint_t entry_count = 3928 req->vdr_len / sizeof (struct vdi_field_entry_v1); 3929 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3930 3931 /* 3932 * First make sure that all of the MSRs can be manipulated. 3933 * For now, this check is done by going though the getmsr handler 3934 */ 3935 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3936 const uint32_t msr = entryp->vfe_ident; 3937 uint64_t val; 3938 int err = 0; 3939 3940 switch (msr) { 3941 case MSR_TSC: 3942 break; 3943 default: 3944 if (is_mtrr_msr(msr)) { 3945 err = vm_rdmtrr(mtrr, msr, &val); 3946 } else { 3947 err = ops->vmgetmsr(vm->cookie, vcpuid, msr, 3948 &val); 3949 } 3950 break; 3951 } 3952 if (err != 0) { 3953 return (err); 3954 } 3955 } 3956 3957 /* 3958 * Fairly confident that all of the 'set' operations are at least 3959 * targeting valid MSRs, continue on. 3960 */ 3961 entryp = req->vdr_data; 3962 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3963 const uint32_t msr = entryp->vfe_ident; 3964 const uint64_t val = entryp->vfe_value; 3965 int err = 0; 3966 3967 switch (msr) { 3968 case MSR_TSC: 3969 vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; 3970 break; 3971 default: 3972 if (is_mtrr_msr(msr)) { 3973 if (msr == MSR_MTRRcap) { 3974 /* 3975 * MTRRcap is read-only. If the current 3976 * value matches the incoming one, 3977 * consider it a success 3978 */ 3979 uint64_t comp; 3980 err = vm_rdmtrr(mtrr, msr, &comp); 3981 if (err != 0 || comp != val) { 3982 err = EINVAL; 3983 } 3984 } else { 3985 err = vm_wrmtrr(mtrr, msr, val); 3986 } 3987 } else { 3988 err = ops->vmsetmsr(vm->cookie, vcpuid, msr, 3989 val); 3990 } 3991 break; 3992 } 3993 if (err != 0) { 3994 return (err); 3995 } 3996 } 3997 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 3998 3999 return (0); 4000 } 4001 4002 static const vmm_data_version_entry_t msr_v1 = { 4003 .vdve_class = VDC_MSR, 4004 .vdve_version = 1, 4005 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4006 /* Requires backend-specific dispatch */ 4007 .vdve_readf = NULL, 4008 .vdve_writef = NULL, 4009 }; 4010 VMM_DATA_VERSION(msr_v1); 4011 4012 static const uint32_t vmm_arch_v1_fields[] = { 4013 VAI_TSC_BOOT_OFFSET, 4014 VAI_BOOT_HRTIME, 4015 VAI_TSC_FREQ, 4016 }; 4017 4018 static bool 4019 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) 4020 { 4021 ASSERT(valp != NULL); 4022 4023 switch (ident) { 4024 case VAI_TSC_BOOT_OFFSET: 4025 *valp = vm->boot_tsc_offset; 4026 return (true); 4027 case VAI_BOOT_HRTIME: 4028 *valp = vm->boot_hrtime; 4029 return (true); 4030 case VAI_TSC_FREQ: 4031 /* 4032 * Since the system TSC calibration is not public, just derive 4033 * it from the scaling functions available. 
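 * (unscalehrtime(NANOSEC) converts one second of hrtime into TSC ticks,
 * which is effectively the TSC frequency in Hz.)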
4034 */ 4035 *valp = unscalehrtime(NANOSEC); 4036 return (true); 4037 default: 4038 break; 4039 } 4040 return (false); 4041 } 4042 4043 static int 4044 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) 4045 { 4046 struct vm *vm = arg; 4047 4048 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4049 VERIFY3U(req->vdr_version, ==, 1); 4050 4051 struct vdi_field_entry_v1 *entryp = req->vdr_data; 4052 4053 /* Specific fields requested */ 4054 if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { 4055 const uint_t count = 4056 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4057 4058 for (uint_t i = 0; i < count; i++, entryp++) { 4059 if (!vmm_read_arch_field(vm, entryp->vfe_ident, 4060 &entryp->vfe_value)) { 4061 return (EINVAL); 4062 } 4063 } 4064 *req->vdr_result_len = 4065 count * sizeof (struct vdi_field_entry_v1); 4066 return (0); 4067 } 4068 4069 /* Emit all of the possible values */ 4070 const uint32_t total_size = nitems(vmm_arch_v1_fields) * 4071 sizeof (struct vdi_field_entry_v1); 4072 *req->vdr_result_len = total_size; 4073 if (req->vdr_len < total_size) { 4074 return (ENOSPC); 4075 } 4076 for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { 4077 entryp->vfe_ident = vmm_arch_v1_fields[i]; 4078 VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, 4079 &entryp->vfe_value)); 4080 } 4081 return (0); 4082 } 4083 4084 static int 4085 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) 4086 { 4087 struct vm *vm = arg; 4088 4089 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4090 VERIFY3U(req->vdr_version, ==, 1); 4091 4092 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 4093 const uint_t entry_count = 4094 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4095 4096 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4097 const uint64_t val = entryp->vfe_value; 4098 4099 switch (entryp->vfe_ident) { 4100 case VAI_TSC_BOOT_OFFSET: 4101 vm->boot_tsc_offset = val; 4102 break; 4103 case VAI_BOOT_HRTIME: 4104 vm->boot_hrtime = val; 4105 break; 4106 case VAI_TSC_FREQ: 4107 /* Guest TSC frequency not (currently) adjustable */ 4108 return (EPERM); 4109 default: 4110 return (EINVAL); 4111 } 4112 } 4113 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4114 return (0); 4115 } 4116 4117 static const vmm_data_version_entry_t vmm_arch_v1 = { 4118 .vdve_class = VDC_VMM_ARCH, 4119 .vdve_version = 1, 4120 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4121 .vdve_readf = vmm_data_read_vmm_arch, 4122 .vdve_writef = vmm_data_write_vmm_arch, 4123 }; 4124 VMM_DATA_VERSION(vmm_arch_v1); 4125 4126 static int 4127 vmm_data_read_versions(void *arg, const vmm_data_req_t *req) 4128 { 4129 VERIFY3U(req->vdr_class, ==, VDC_VERSION); 4130 VERIFY3U(req->vdr_version, ==, 1); 4131 4132 const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * 4133 sizeof (struct vdi_version_entry_v1); 4134 4135 /* Make sure there is room for all of the entries */ 4136 *req->vdr_result_len = total_size; 4137 if (req->vdr_len < *req->vdr_result_len) { 4138 return (ENOSPC); 4139 } 4140 4141 struct vdi_version_entry_v1 *entryp = req->vdr_data; 4142 const vmm_data_version_entry_t **vdpp; 4143 SET_FOREACH(vdpp, vmm_data_version_entries) { 4144 const vmm_data_version_entry_t *vdp = *vdpp; 4145 4146 entryp->vve_class = vdp->vdve_class; 4147 entryp->vve_version = vdp->vdve_version; 4148 entryp->vve_len_expect = vdp->vdve_len_expect; 4149 entryp->vve_len_per_item = vdp->vdve_len_per_item; 4150 entryp++; 4151 } 4152 return (0); 4153 } 4154 4155 static int 4156 
vmm_data_write_versions(void *arg, const vmm_data_req_t *req) 4157 { 4158 /* Writing to the version information makes no sense */ 4159 return (EPERM); 4160 } 4161 4162 static const vmm_data_version_entry_t versions_v1 = { 4163 .vdve_class = VDC_VERSION, 4164 .vdve_version = 1, 4165 .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), 4166 .vdve_readf = vmm_data_read_versions, 4167 .vdve_writef = vmm_data_write_versions, 4168 }; 4169 VMM_DATA_VERSION(versions_v1); 4170 4171 int 4172 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4173 { 4174 int err = 0; 4175 4176 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4177 if (vcpuid >= VM_MAXCPU) { 4178 return (EINVAL); 4179 } 4180 } 4181 4182 const vmm_data_version_entry_t *entry = NULL; 4183 err = vmm_data_find(req, &entry); 4184 if (err != 0) { 4185 return (err); 4186 } 4187 ASSERT(entry != NULL); 4188 4189 void *datap = vmm_data_from_class(req, vm, vcpuid); 4190 if (datap != NULL) { 4191 err = entry->vdve_readf(datap, req); 4192 4193 /* 4194 * Successful reads of fixed-length data should populate the 4195 * length of that result. 4196 */ 4197 if (err == 0 && entry->vdve_len_expect != 0) { 4198 *req->vdr_result_len = entry->vdve_len_expect; 4199 } 4200 } else { 4201 switch (req->vdr_class) { 4202 case VDC_MSR: 4203 err = vmm_data_read_msrs(vm, vcpuid, req); 4204 break; 4205 case VDC_FPU: 4206 /* TODO: wire up to xsave export via hma_fpu iface */ 4207 err = EINVAL; 4208 break; 4209 case VDC_REGISTER: 4210 default: 4211 err = EINVAL; 4212 break; 4213 } 4214 } 4215 4216 return (err); 4217 } 4218 4219 int 4220 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4221 { 4222 int err = 0; 4223 4224 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4225 if (vcpuid >= VM_MAXCPU) { 4226 return (EINVAL); 4227 } 4228 } 4229 4230 const vmm_data_version_entry_t *entry = NULL; 4231 err = vmm_data_find(req, &entry); 4232 if (err != 0) { 4233 return (err); 4234 } 4235 ASSERT(entry != NULL); 4236 4237 void *datap = vmm_data_from_class(req, vm, vcpuid); 4238 if (datap != NULL) { 4239 err = entry->vdve_writef(datap, req); 4240 /* 4241 * Successful writes of fixed-length data should populate the 4242 * length of that result. 4243 */ 4244 if (err == 0 && entry->vdve_len_expect != 0) { 4245 *req->vdr_result_len = entry->vdve_len_expect; 4246 } 4247 } else { 4248 switch (req->vdr_class) { 4249 case VDC_MSR: 4250 err = vmm_data_write_msrs(vm, vcpuid, req); 4251 break; 4252 case VDC_FPU: 4253 /* TODO: wire up to xsave import via hma_fpu iface */ 4254 err = EINVAL; 4255 break; 4256 case VDC_REGISTER: 4257 default: 4258 err = EINVAL; 4259 break; 4260 } 4261 } 4262 4263 return (err); 4264 } 4265
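
/*
 * Illustrative (non-normative) sketch of driving the vmm-data interface
 * above: a caller wanting the version inventory could fill out a request
 * along these lines and hand it to vmm_data_read().  The ioctl plumbing
 * which normally constructs the vmm_data_req_t lives in the vmm driver
 * and is not shown here.
 *
 *	struct vdi_version_entry_v1 vers[16];
 *	uint32_t result_len = 0;
 *	vmm_data_req_t req = {
 *		.vdr_class = VDC_VERSION,
 *		.vdr_version = 1,
 *		.vdr_len = sizeof (vers),
 *		.vdr_data = vers,
 *		.vdr_result_len = &result_len,
 *	};
 *	int err = vmm_data_read(vm, 0, &req);
 *
 * An ENOSPC return indicates the supplied buffer was too small, with
 * result_len updated to the space required.
 */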