/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_data.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
	uint64_t	exc_pending;	/* (i) exception pending */
	bool		nmi_pending;	/* (i) NMI pending */
	bool		extint_pending;	/* (i) INTR pending */

	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
	vcpu_cpuid_config_t cpuid_cfg;	/* (x) cpuid configuration */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */

	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */

	struct ioport_config ioports;		/* (o) ioport handling */

	bool		mem_transient;		/* (o) alloc transient memory */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init		= (vmm_init_func_t)nullop_panic,
	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
	.resume		= (vmm_resume_func_t)nullop_panic,
	.vminit		= (vmi_init_func_t)nullop_panic,
	.vmrun		= (vmi_run_func_t)nullop_panic,
	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg	= (vmi_get_register_t)nullop_panic,
	.vmsetreg	= (vmi_set_register_t)nullop_panic,
	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx	= (vmi_savectx)nullop_panic,
	.vmrestorectx	= (vmi_restorectx)nullop_panic,
	.vmgetmsr	= (vmi_get_msr_t)nullop_panic,
	.vmsetmsr	= (vmi_set_msr_t)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)		((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= vmm_savectx,
	.ct_restore	= vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();
		vcpu_cpuid_init(&vcpu->cpuid_cfg);

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has the necessary
 * hardware and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets.  A reading of the TSC is negated to form the boot offset.
	 */
	const uint64_t boot_tsc = rdtsc_offset();
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);

	/* Convert the boot TSC reading to hrtime */
	vm->boot_hrtime = (hrtime_t)boot_tsc;
	scalehrtime(&vm->boot_hrtime);
}
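/*
 * Illustrative note (added commentary, not from the original source): the
 * offset arithmetic above is modulo 2^64, so negating the boot TSC reading T
 * yields boot_tsc_offset == 2^64 - T.  A later guest TSC value computed as
 * host_tsc + boot_tsc_offset therefore wraps around to host_tsc - T, i.e. a
 * counter that read (approximately) zero at the moment vm_init() ran.
 */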
/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted. Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}

int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			/*
			 * When originally ported from FreeBSD, the logic for
			 * adding memory to the guest domain would
			 * simultaneously remove it from the host domain. The
			 * justification for that is not clear, and FreeBSD has
			 * subsequently changed the behavior to not remove the
			 * memory from the host domain.
			 *
			 * Leaving the guest memory in the host domain for the
			 * life of the VM is necessary to make it available for
			 * DMA, such as through viona in the TX path.
			 */
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_XCR0:
		*retval = vcpu->guest_xcr0;
		return (0);
	default:
		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
	}
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	int error;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_RIP:
		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error == 0) {
			vcpu->nextrip = val;
		}
		return (error);
	case VM_REG_GUEST_XCR0:
		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
			return (EINVAL);
		}
		vcpu->guest_xcr0 = val;
		return (0);
	default:
		return (VMSETREG(vm->cookie, vcpuid, reg, val));
	}
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}
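/*
 * Illustrative note (added commentary, not from the original source):
 * fpu_start_emulating() sets CR0.TS, so any host FPU use while the guest
 * state is loaded raises #NM and is caught instead of silently clobbering
 * guest registers; fpu_stop_emulating() (clts) clears TS again once the
 * guest state has been put away by save_guest_fpustate() below.
 */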
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	int rv, ftype;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == PROT_READ ||
	    ftype == PROT_WRITE || ftype == PROT_EXEC,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);

	if (rv != 0)
		return (EFAULT);
	return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;
	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.
			 * No further work is required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which has not
		 * completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure.  Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry. No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}
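/*
 * Illustrative note (added commentary, not from the original source): the
 * variable-range MTRR MSRs are interleaved as PHYSBASEn/PHYSMASKn pairs, so
 * MSR_MTRRVarBase + 2*n corresponds to var[n].base and
 * MSR_MTRRVarBase + 2*n + 1 to var[n].mask; the offset % 2 and offset / 2
 * arithmetic in vm_rdmtrr() above and vm_wrmtrr() below relies on that
 * layout.
 */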
static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static bool
is_mtrr_msr(uint32_t msr)
{
	switch (msr) {
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		return (true);
	default:
		return (false);
	}
}

static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit. (Both VMX and
		 * SVM pass through read-only access to MSR_TSC to the guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read).  The guest TSC value
		 * is calculated from a global offset for the guest (which
		 * effectively makes its TSC read 0 at guest boot) and a
		 * per-vCPU offset to handle these writes to the MSR.
		 *
		 * To calculate that per-vCPU offset, we can work backwards
		 * from the guest value at the time of write:
		 *
		 *	value = host TSC + VM boot offset + vCPU offset
		 *
		 * so therefore:
		 *
		 *	value - host TSC - VM boot offset = vCPU offset
		 */
		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	return (0);
}
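/*
 * Illustrative note (added commentary, not from the original source):
 * assuming vcpu_tsc_offset() returns the combined VM boot and per-vCPU
 * offsets, as the surrounding comments suggest, a read serviced by
 * vm_handle_rdmsr() yields host TSC + VM boot offset + vCPU offset.  With
 * the identity above, that sum equals the value the guest last wrote plus
 * however many host cycles have elapsed since the write.
 */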
2162 */ 2163 if (vcpu == &vm->vcpu[0]) { 2164 vhpet_localize_resources(vm->vhpet); 2165 vrtc_localize_resources(vm->vrtc); 2166 vatpit_localize_resources(vm->vatpit); 2167 } 2168 2169 vlapic_localize_resources(vcpu->vlapic); 2170 2171 vcpu->lastloccpu = curcpu; 2172 } 2173 2174 static void 2175 vmm_savectx(void *arg) 2176 { 2177 vm_thread_ctx_t *vtc = arg; 2178 struct vm *vm = vtc->vtc_vm; 2179 const int vcpuid = vtc->vtc_vcpuid; 2180 2181 if (ops->vmsavectx != NULL) { 2182 ops->vmsavectx(vm->cookie, vcpuid); 2183 } 2184 2185 /* 2186 * Account for going off-cpu, unless the vCPU is idled, where being 2187 * off-cpu is the explicit point. 2188 */ 2189 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2190 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2191 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2192 } 2193 2194 /* 2195 * If the CPU holds the restored guest FPU state, save it and restore 2196 * the host FPU state before this thread goes off-cpu. 2197 */ 2198 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2199 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2200 2201 save_guest_fpustate(vcpu); 2202 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2203 } 2204 } 2205 2206 static void 2207 vmm_restorectx(void *arg) 2208 { 2209 vm_thread_ctx_t *vtc = arg; 2210 struct vm *vm = vtc->vtc_vm; 2211 const int vcpuid = vtc->vtc_vcpuid; 2212 2213 /* Complete microstate accounting for vCPU being off-cpu */ 2214 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2215 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2216 } 2217 2218 /* 2219 * When coming back on-cpu, only restore the guest FPU status if the 2220 * thread is in a context marked as requiring it. This should be rare, 2221 * occurring only when a future logic error results in a voluntary 2222 * sleep during the VMRUN critical section. 2223 * 2224 * The common case will result in elision of the guest FPU state 2225 * restoration, deferring that action until it is clearly necessary 2226 * during vm_run. 
2227 */ 2228 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2229 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2230 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2231 2232 restore_guest_fpustate(vcpu); 2233 vtc->vtc_status |= VTCS_FPU_RESTORED; 2234 } 2235 2236 if (ops->vmrestorectx != NULL) { 2237 ops->vmrestorectx(vm->cookie, vcpuid); 2238 } 2239 2240 } 2241 2242 static int 2243 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2244 struct vm_exit *vme) 2245 { 2246 struct vcpu *vcpu; 2247 struct vie *vie; 2248 int err; 2249 2250 vcpu = &vm->vcpu[vcpuid]; 2251 vie = vcpu->vie_ctx; 2252 err = 0; 2253 2254 switch (entry->cmd) { 2255 case VEC_DEFAULT: 2256 return (0); 2257 case VEC_DISCARD_INSTR: 2258 vie_reset(vie); 2259 return (0); 2260 case VEC_FULFILL_MMIO: 2261 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2262 if (err == 0) { 2263 err = vie_emulate_mmio(vie, vm, vcpuid); 2264 if (err == 0) { 2265 vie_advance_pc(vie, &vcpu->nextrip); 2266 } else if (err < 0) { 2267 vie_exitinfo(vie, vme); 2268 } else if (err == EAGAIN) { 2269 /* 2270 * Clear the instruction emulation state in 2271 * order to re-enter VM context and continue 2272 * this 'rep <instruction>' 2273 */ 2274 vie_reset(vie); 2275 err = 0; 2276 } 2277 } 2278 break; 2279 case VEC_FULFILL_INOUT: 2280 err = vie_fulfill_inout(vie, &entry->u.inout); 2281 if (err == 0) { 2282 err = vie_emulate_inout(vie, vm, vcpuid); 2283 if (err == 0) { 2284 vie_advance_pc(vie, &vcpu->nextrip); 2285 } else if (err < 0) { 2286 vie_exitinfo(vie, vme); 2287 } else if (err == EAGAIN) { 2288 /* 2289 * Clear the instruction emulation state in 2290 * order to re-enter VM context and continue 2291 * this 'rep ins/outs' 2292 */ 2293 vie_reset(vie); 2294 err = 0; 2295 } 2296 } 2297 break; 2298 default: 2299 return (EINVAL); 2300 } 2301 return (err); 2302 } 2303 2304 static int 2305 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2306 { 2307 struct vie *vie; 2308 2309 vie = vm->vcpu[vcpuid].vie_ctx; 2310 2311 if (vie_pending(vie)) { 2312 /* 2313 * Userspace has not fulfilled the pending needs of the 2314 * instruction emulation, so bail back out. 2315 */ 2316 vie_exitinfo(vie, vme); 2317 return (-1); 2318 } 2319 2320 return (0); 2321 } 2322 2323 int 2324 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2325 { 2326 int error; 2327 struct vcpu *vcpu; 2328 struct vm_exit *vme; 2329 bool intr_disabled; 2330 int affinity_type = CPU_CURRENT; 2331 2332 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2333 return (EINVAL); 2334 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2335 return (EINVAL); 2336 2337 vcpu = &vm->vcpu[vcpuid]; 2338 vme = &vcpu->exitinfo; 2339 2340 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2341 2342 vcpu->vtc.vtc_status = 0; 2343 ctxop_attach(curthread, vcpu->ctxop); 2344 2345 error = vm_entry_actions(vm, vcpuid, entry, vme); 2346 if (error != 0) { 2347 goto exit; 2348 } 2349 2350 restart: 2351 error = vm_loop_checks(vm, vcpuid, vme); 2352 if (error != 0) { 2353 goto exit; 2354 } 2355 2356 thread_affinity_set(curthread, affinity_type); 2357 /* 2358 * Resource localization should happen after the CPU affinity for the 2359 * thread has been set to ensure that access from restricted contexts, 2360 * such as VMX-accelerated APIC operations, can occur without inducing 2361 * cyclic cross-calls. 2362 * 2363 * This must be done prior to disabling kpreempt via critical_enter(). 
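 *
 * Put another way, the sequence used below is deliberate:
 *
 *	thread_affinity_set(curthread, ...);	(pin to a host CPU first)
 *	vm_localize_resources(vm, vcpu);	(may need cpu_lock)
 *	critical_enter();			(only now disable kpreempt)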
2364 */ 2365 vm_localize_resources(vm, vcpu); 2366 affinity_type = CPU_CURRENT; 2367 critical_enter(); 2368 2369 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2370 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2371 2372 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2373 restore_guest_fpustate(vcpu); 2374 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2375 } 2376 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2377 2378 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2379 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2380 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2381 2382 /* 2383 * Once clear of the delicate contexts comprising the VM_RUN handler, 2384 * thread CPU affinity can be loosened while other processing occurs. 2385 */ 2386 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2387 thread_affinity_clear(curthread); 2388 critical_exit(); 2389 2390 if (error != 0) { 2391 /* Communicate out any error from VMRUN() above */ 2392 goto exit; 2393 } 2394 2395 vcpu->nextrip = vme->rip + vme->inst_length; 2396 switch (vme->exitcode) { 2397 case VM_EXITCODE_REQIDLE: 2398 error = vm_handle_reqidle(vm, vcpuid); 2399 break; 2400 case VM_EXITCODE_RUN_STATE: 2401 error = vm_handle_run_state(vm, vcpuid); 2402 break; 2403 case VM_EXITCODE_SUSPENDED: 2404 error = vm_handle_suspend(vm, vcpuid); 2405 break; 2406 case VM_EXITCODE_IOAPIC_EOI: 2407 vioapic_process_eoi(vm, vcpuid, 2408 vme->u.ioapic_eoi.vector); 2409 break; 2410 case VM_EXITCODE_HLT: 2411 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2412 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2413 break; 2414 case VM_EXITCODE_PAGING: 2415 error = vm_handle_paging(vm, vcpuid); 2416 break; 2417 case VM_EXITCODE_MMIO_EMUL: 2418 error = vm_handle_mmio_emul(vm, vcpuid); 2419 break; 2420 case VM_EXITCODE_INOUT: 2421 error = vm_handle_inout(vm, vcpuid, vme); 2422 break; 2423 case VM_EXITCODE_INST_EMUL: 2424 error = vm_handle_inst_emul(vm, vcpuid); 2425 break; 2426 case VM_EXITCODE_MONITOR: 2427 case VM_EXITCODE_MWAIT: 2428 case VM_EXITCODE_VMINSN: 2429 vm_inject_ud(vm, vcpuid); 2430 break; 2431 case VM_EXITCODE_RDMSR: 2432 error = vm_handle_rdmsr(vm, vcpuid, vme); 2433 break; 2434 case VM_EXITCODE_WRMSR: 2435 error = vm_handle_wrmsr(vm, vcpuid, vme); 2436 break; 2437 case VM_EXITCODE_HT: 2438 affinity_type = CPU_BEST; 2439 break; 2440 case VM_EXITCODE_MTRAP: 2441 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2442 error = -1; 2443 break; 2444 default: 2445 /* handled in userland */ 2446 error = -1; 2447 break; 2448 } 2449 2450 if (error == 0) { 2451 /* VM exit conditions handled in-kernel, continue running */ 2452 goto restart; 2453 } 2454 2455 exit: 2456 kpreempt_disable(); 2457 ctxop_detach(curthread, vcpu->ctxop); 2458 /* Make sure all of the needed vCPU context state is saved */ 2459 vmm_savectx(&vcpu->vtc); 2460 kpreempt_enable(); 2461 2462 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2463 return (error); 2464 } 2465 2466 int 2467 vm_restart_instruction(void *arg, int vcpuid) 2468 { 2469 struct vm *vm; 2470 struct vcpu *vcpu; 2471 enum vcpu_state state; 2472 uint64_t rip; 2473 int error; 2474 2475 vm = arg; 2476 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2477 return (EINVAL); 2478 2479 vcpu = &vm->vcpu[vcpuid]; 2480 state = vcpu_get_state(vm, vcpuid, NULL); 2481 if (state == VCPU_RUNNING) { 2482 /* 2483 * When a vcpu is "running" the next instruction is determined 2484 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
2485 * Thus setting 'inst_length' to zero will cause the current 2486 * instruction to be restarted. 2487 */ 2488 vcpu->exitinfo.inst_length = 0; 2489 } else if (state == VCPU_FROZEN) { 2490 /* 2491 * When a vcpu is "frozen" it is outside the critical section 2492 * around VMRUN() and 'nextrip' points to the next instruction. 2493 * Thus instruction restart is achieved by setting 'nextrip' 2494 * to the vcpu's %rip. 2495 */ 2496 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2497 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2498 vcpu->nextrip = rip; 2499 } else { 2500 panic("%s: invalid state %d", __func__, state); 2501 } 2502 return (0); 2503 } 2504 2505 int 2506 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2507 { 2508 struct vcpu *vcpu; 2509 2510 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2511 return (EINVAL); 2512 2513 vcpu = &vm->vcpu[vcpuid]; 2514 2515 if (VM_INTINFO_PENDING(info)) { 2516 const uint32_t type = VM_INTINFO_TYPE(info); 2517 const uint8_t vector = VM_INTINFO_VECTOR(info); 2518 2519 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2520 return (EINVAL); 2521 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2522 return (EINVAL); 2523 if (info & VM_INTINFO_MASK_RSVD) 2524 return (EINVAL); 2525 } else { 2526 info = 0; 2527 } 2528 vcpu->exit_intinfo = info; 2529 return (0); 2530 } 2531 2532 enum exc_class { 2533 EXC_BENIGN, 2534 EXC_CONTRIBUTORY, 2535 EXC_PAGEFAULT 2536 }; 2537 2538 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2539 2540 static enum exc_class 2541 exception_class(uint64_t info) 2542 { 2543 ASSERT(VM_INTINFO_PENDING(info)); 2544 2545 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2546 switch (VM_INTINFO_TYPE(info)) { 2547 case VM_INTINFO_HWINTR: 2548 case VM_INTINFO_SWINTR: 2549 case VM_INTINFO_NMI: 2550 return (EXC_BENIGN); 2551 default: 2552 /* 2553 * Hardware exception. 2554 * 2555 * SVM and VT-x use identical type values to represent NMI, 2556 * hardware interrupt and software interrupt. 2557 * 2558 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2559 * for exceptions except #BP and #OF. #BP and #OF use a type 2560 * value of '5' or '6'. Therefore we don't check for explicit 2561 * values of 'type' to classify 'intinfo' into a hardware 2562 * exception. 2563 */ 2564 break; 2565 } 2566 2567 switch (VM_INTINFO_VECTOR(info)) { 2568 case IDT_PF: 2569 case IDT_VE: 2570 return (EXC_PAGEFAULT); 2571 case IDT_DE: 2572 case IDT_TS: 2573 case IDT_NP: 2574 case IDT_SS: 2575 case IDT_GP: 2576 return (EXC_CONTRIBUTORY); 2577 default: 2578 return (EXC_BENIGN); 2579 } 2580 } 2581 2582 /* 2583 * Fetch event pending injection into the guest, if one exists. 2584 * 2585 * Returns true if an event is to be injected (which is placed in `retinfo`). 2586 */ 2587 bool 2588 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2589 { 2590 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2591 const uint64_t info1 = vcpu->exit_intinfo; 2592 vcpu->exit_intinfo = 0; 2593 const uint64_t info2 = vcpu->exc_pending; 2594 vcpu->exc_pending = 0; 2595 2596 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2597 /* 2598 * If an exception occurs while attempting to call the 2599 * double-fault handler the processor enters shutdown mode 2600 * (aka triple fault). 
*/ 2602 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2603 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2604 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2605 *retinfo = 0; 2606 return (false); 2607 } 2608 /* 2609 * "Conditions for Generating a Double Fault" 2610 * Intel SDM, Vol3, Table 6-5 2611 */ 2612 const enum exc_class exc1 = exception_class(info1); 2613 const enum exc_class exc2 = exception_class(info2); 2614 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2615 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2616 /* Convert nested fault into a double fault. */ 2617 *retinfo = 2618 VM_INTINFO_VALID | 2619 VM_INTINFO_DEL_ERRCODE | 2620 VM_INTINFO_HWEXCP | 2621 IDT_DF; 2622 } else { 2623 /* Handle exceptions serially */ 2624 vcpu->exit_intinfo = info1; 2625 *retinfo = info2; 2626 } 2627 return (true); 2628 } else if (VM_INTINFO_PENDING(info1)) { 2629 *retinfo = info1; 2630 return (true); 2631 } else if (VM_INTINFO_PENDING(info2)) { 2632 *retinfo = info2; 2633 return (true); 2634 } 2635 2636 return (false); 2637 } 2638 2639 int 2640 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2641 { 2642 struct vcpu *vcpu; 2643 2644 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2645 return (EINVAL); 2646 2647 vcpu = &vm->vcpu[vcpuid]; 2648 *info1 = vcpu->exit_intinfo; 2649 *info2 = vcpu->exc_pending; 2650 return (0); 2651 } 2652 2653 int 2654 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2655 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2656 { 2657 struct vcpu *vcpu; 2658 uint64_t regval; 2659 int error; 2660 2661 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2662 return (EINVAL); 2663 2664 if (vector >= 32) 2665 return (EINVAL); 2666 2667 /* 2668 * NMIs are to be injected via their own specialized path using 2669 * vm_inject_nmi(). 2670 */ 2671 if (vector == IDT_NMI) { 2672 return (EINVAL); 2673 } 2674 2675 /* 2676 * A double fault exception should never be injected directly into 2677 * the guest. It is a derived exception that results from specific 2678 * combinations of nested faults. 2679 */ 2680 if (vector == IDT_DF) { 2681 return (EINVAL); 2682 } 2683 2684 vcpu = &vm->vcpu[vcpuid]; 2685 2686 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2687 /* Unable to inject exception due to one already pending */ 2688 return (EBUSY); 2689 } 2690 2691 if (errcode_valid) { 2692 /* 2693 * Exceptions don't deliver an error code in real mode. 2694 */ 2695 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); 2696 VERIFY0(error); 2697 if ((regval & CR0_PE) == 0) { 2698 errcode_valid = false; 2699 } 2700 } 2701 2702 /* 2703 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2704 * 2705 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2706 * one instruction or incurs an exception.
2707 */ 2708 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2709 VERIFY0(error); 2710 2711 if (restart_instruction) { 2712 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2713 } 2714 2715 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2716 if (errcode_valid) { 2717 val |= VM_INTINFO_DEL_ERRCODE; 2718 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2719 } 2720 vcpu->exc_pending = val; 2721 return (0); 2722 } 2723 2724 void 2725 vm_inject_ud(struct vm *vm, int vcpuid) 2726 { 2727 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2728 } 2729 2730 void 2731 vm_inject_gp(struct vm *vm, int vcpuid) 2732 { 2733 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2734 } 2735 2736 void 2737 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2738 { 2739 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2740 } 2741 2742 void 2743 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2744 { 2745 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2746 } 2747 2748 void 2749 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2750 { 2751 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2752 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2753 } 2754 2755 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2756 2757 int 2758 vm_inject_nmi(struct vm *vm, int vcpuid) 2759 { 2760 struct vcpu *vcpu; 2761 2762 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2763 return (EINVAL); 2764 2765 vcpu = &vm->vcpu[vcpuid]; 2766 2767 vcpu->nmi_pending = true; 2768 vcpu_notify_event(vm, vcpuid); 2769 return (0); 2770 } 2771 2772 bool 2773 vm_nmi_pending(struct vm *vm, int vcpuid) 2774 { 2775 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2776 2777 return (vcpu->nmi_pending); 2778 } 2779 2780 void 2781 vm_nmi_clear(struct vm *vm, int vcpuid) 2782 { 2783 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2784 2785 ASSERT(vcpu->nmi_pending); 2786 2787 vcpu->nmi_pending = false; 2788 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2789 } 2790 2791 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2792 2793 int 2794 vm_inject_extint(struct vm *vm, int vcpuid) 2795 { 2796 struct vcpu *vcpu; 2797 2798 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2799 return (EINVAL); 2800 2801 vcpu = &vm->vcpu[vcpuid]; 2802 2803 vcpu->extint_pending = true; 2804 vcpu_notify_event(vm, vcpuid); 2805 return (0); 2806 } 2807 2808 bool 2809 vm_extint_pending(struct vm *vm, int vcpuid) 2810 { 2811 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2812 2813 return (vcpu->extint_pending); 2814 } 2815 2816 void 2817 vm_extint_clear(struct vm *vm, int vcpuid) 2818 { 2819 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2820 2821 ASSERT(vcpu->extint_pending); 2822 2823 vcpu->extint_pending = false; 2824 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2825 } 2826 2827 int 2828 vm_inject_init(struct vm *vm, int vcpuid) 2829 { 2830 struct vcpu *vcpu; 2831 2832 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2833 return (EINVAL); 2834 2835 vcpu = &vm->vcpu[vcpuid]; 2836 vcpu_lock(vcpu); 2837 vcpu->run_state |= VRS_PEND_INIT; 2838 /* 2839 * As part of queuing the INIT request, clear any pending SIPI. It 2840 * would not otherwise survive across the reset of the vCPU when it 2841 * undergoes the requested INIT. We would not want it to linger when it 2842 * could be mistaken as a subsequent (after the INIT) SIPI request. 
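 *
 * For context: guests typically start an AP with the INIT-SIPI-SIPI
 * sequence, so a stale SIPI (and its vector) surviving the reset could
 * otherwise start this vCPU at the wrong address once the new INIT is
 * processed.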
2843 */ 2844 vcpu->run_state &= ~VRS_PEND_SIPI; 2845 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2846 2847 vcpu_unlock(vcpu); 2848 return (0); 2849 } 2850 2851 int 2852 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2853 { 2854 struct vcpu *vcpu; 2855 2856 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2857 return (EINVAL); 2858 2859 vcpu = &vm->vcpu[vcpuid]; 2860 vcpu_lock(vcpu); 2861 vcpu->run_state |= VRS_PEND_SIPI; 2862 vcpu->sipi_vector = vector; 2863 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2864 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2865 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2866 } 2867 vcpu_unlock(vcpu); 2868 return (0); 2869 } 2870 2871 bool 2872 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2873 { 2874 struct vcpu *vcpu; 2875 2876 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2877 vcpu = &vm->vcpu[vcpuid]; 2878 2879 /* Of interest: vCPU not in running state or with pending INIT */ 2880 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2881 } 2882 2883 int 2884 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2885 { 2886 struct seg_desc desc; 2887 const enum vm_reg_name clear_regs[] = { 2888 VM_REG_GUEST_CR2, 2889 VM_REG_GUEST_CR3, 2890 VM_REG_GUEST_CR4, 2891 VM_REG_GUEST_RAX, 2892 VM_REG_GUEST_RBX, 2893 VM_REG_GUEST_RCX, 2894 VM_REG_GUEST_RSI, 2895 VM_REG_GUEST_RDI, 2896 VM_REG_GUEST_RBP, 2897 VM_REG_GUEST_RSP, 2898 VM_REG_GUEST_R8, 2899 VM_REG_GUEST_R9, 2900 VM_REG_GUEST_R10, 2901 VM_REG_GUEST_R11, 2902 VM_REG_GUEST_R12, 2903 VM_REG_GUEST_R13, 2904 VM_REG_GUEST_R14, 2905 VM_REG_GUEST_R15, 2906 VM_REG_GUEST_DR0, 2907 VM_REG_GUEST_DR1, 2908 VM_REG_GUEST_DR2, 2909 VM_REG_GUEST_DR3, 2910 VM_REG_GUEST_EFER, 2911 }; 2912 const enum vm_reg_name data_segs[] = { 2913 VM_REG_GUEST_SS, 2914 VM_REG_GUEST_DS, 2915 VM_REG_GUEST_ES, 2916 VM_REG_GUEST_FS, 2917 VM_REG_GUEST_GS, 2918 }; 2919 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2920 2921 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2922 return (EINVAL); 2923 2924 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2925 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2926 } 2927 2928 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2929 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2930 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2931 2932 /* 2933 * The prescribed contents of %rdx differ slightly between the Intel and 2934 * AMD architectural definitions. The former expects the Extended Model 2935 * in bits 16-19 where the latter expects all the Family, Model, and 2936 * Stepping be there. Common boot ROMs appear to disregard this 2937 * anyways, so we stick with a compromise value similar to what is 2938 * spelled out in the Intel SDM. 
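 *
 * For reference, the 0x600 written below decodes, in the usual
 * family/model/stepping layout, as family 6, model 0, stepping 0.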
*/ 2940 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2941 2942 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2943 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2944 2945 /* CS: Present, R/W, Accessed */ 2946 desc.access = 0x0093; 2947 desc.base = 0xffff0000; 2948 desc.limit = 0xffff; 2949 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2950 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2951 2952 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2953 desc.access = 0x0093; 2954 desc.base = 0; 2955 desc.limit = 0xffff; 2956 for (uint_t i = 0; i < nitems(data_segs); i++) { 2957 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2958 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2959 } 2960 2961 /* GDTR, IDTR */ 2962 desc.base = 0; 2963 desc.limit = 0xffff; 2964 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2965 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2966 2967 /* LDTR: Present, LDT */ 2968 desc.access = 0x0082; 2969 desc.base = 0; 2970 desc.limit = 0xffff; 2971 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2972 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2973 2974 /* TR: Present, 32-bit TSS */ 2975 desc.access = 0x008b; 2976 desc.base = 0; 2977 desc.limit = 0xffff; 2978 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2979 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2980 2981 vlapic_reset(vm_lapic(vm, vcpuid)); 2982 2983 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2984 2985 vcpu->exit_intinfo = 0; 2986 vcpu->exc_pending = 0; 2987 vcpu->nmi_pending = false; 2988 vcpu->extint_pending = false; 2989 2990 /* 2991 * A CPU reset caused by power-on or system reset clears more state than 2992 * one which is triggered from an INIT IPI.
2993 */ 2994 if (!init_only) { 2995 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2996 (void) hma_fpu_init(vcpu->guestfpu); 2997 2998 /* XXX: clear MSRs and other pieces */ 2999 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 3000 } 3001 3002 return (0); 3003 } 3004 3005 static int 3006 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3007 { 3008 struct seg_desc desc; 3009 3010 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3011 return (EINVAL); 3012 3013 /* CS: Present, R/W, Accessed */ 3014 desc.access = 0x0093; 3015 desc.base = (uint64_t)vector << 12; 3016 desc.limit = 0xffff; 3017 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3018 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3019 (uint64_t)vector << 8)); 3020 3021 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3022 3023 return (0); 3024 } 3025 3026 int 3027 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3028 { 3029 if (vcpu < 0 || vcpu >= vm->maxcpus) 3030 return (EINVAL); 3031 3032 if (type < 0 || type >= VM_CAP_MAX) 3033 return (EINVAL); 3034 3035 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3036 } 3037 3038 int 3039 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3040 { 3041 if (vcpu < 0 || vcpu >= vm->maxcpus) 3042 return (EINVAL); 3043 3044 if (type < 0 || type >= VM_CAP_MAX) 3045 return (EINVAL); 3046 3047 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3048 } 3049 3050 vcpu_cpuid_config_t * 3051 vm_cpuid_config(struct vm *vm, int vcpuid) 3052 { 3053 ASSERT3S(vcpuid, >=, 0); 3054 ASSERT3S(vcpuid, <, VM_MAXCPU); 3055 3056 return (&vm->vcpu[vcpuid].cpuid_cfg); 3057 } 3058 3059 struct vlapic * 3060 vm_lapic(struct vm *vm, int cpu) 3061 { 3062 ASSERT3S(cpu, >=, 0); 3063 ASSERT3S(cpu, <, VM_MAXCPU); 3064 3065 return (vm->vcpu[cpu].vlapic); 3066 } 3067 3068 struct vioapic * 3069 vm_ioapic(struct vm *vm) 3070 { 3071 3072 return (vm->vioapic); 3073 } 3074 3075 struct vhpet * 3076 vm_hpet(struct vm *vm) 3077 { 3078 3079 return (vm->vhpet); 3080 } 3081 3082 void * 3083 vm_iommu_domain(struct vm *vm) 3084 { 3085 3086 return (vm->iommu); 3087 } 3088 3089 int 3090 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3091 bool from_idle) 3092 { 3093 int error; 3094 struct vcpu *vcpu; 3095 3096 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3097 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3098 3099 vcpu = &vm->vcpu[vcpuid]; 3100 3101 vcpu_lock(vcpu); 3102 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3103 vcpu_unlock(vcpu); 3104 3105 return (error); 3106 } 3107 3108 enum vcpu_state 3109 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3110 { 3111 struct vcpu *vcpu; 3112 enum vcpu_state state; 3113 3114 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3115 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3116 3117 vcpu = &vm->vcpu[vcpuid]; 3118 3119 vcpu_lock(vcpu); 3120 state = vcpu->state; 3121 if (hostcpu != NULL) 3122 *hostcpu = vcpu->hostcpu; 3123 vcpu_unlock(vcpu); 3124 3125 return (state); 3126 } 3127 3128 uint64_t 3129 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3130 { 3131 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3132 3133 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3134 3135 if (phys_adj) { 3136 /* Include any offset for the current physical CPU too */ 3137 extern hrtime_t tsc_gethrtime_tick_delta(void); 3138 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3139 } 3140 3141 return (vcpu_off); 3142 } 3143 3144 /* Normalize hrtime against the boot time for a VM */ 3145 hrtime_t 3146 
vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3147 { 3148 /* To avoid underflow/overflow UB, perform math as unsigned */ 3149 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3150 } 3151 3152 /* Denormalize hrtime against the boot time for a VM */ 3153 hrtime_t 3154 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3155 { 3156 /* To avoid underflow/overflow UB, perform math as unsigned */ 3157 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3158 } 3159 3160 int 3161 vm_activate_cpu(struct vm *vm, int vcpuid) 3162 { 3163 3164 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3165 return (EINVAL); 3166 3167 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3168 return (EBUSY); 3169 3170 if (vm->suspend != 0) { 3171 return (EBUSY); 3172 } 3173 3174 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3175 3176 /* 3177 * It is possible that this vCPU was undergoing activation at the same 3178 * time that the VM was being suspended. If that happens to be the 3179 * case, it should reflect the suspended state immediately. 3180 */ 3181 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3182 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3183 } 3184 3185 return (0); 3186 } 3187 3188 int 3189 vm_suspend_cpu(struct vm *vm, int vcpuid) 3190 { 3191 int i; 3192 3193 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3194 return (EINVAL); 3195 3196 if (vcpuid == -1) { 3197 vm->debug_cpus = vm->active_cpus; 3198 for (i = 0; i < vm->maxcpus; i++) { 3199 if (CPU_ISSET(i, &vm->active_cpus)) 3200 vcpu_notify_event(vm, i); 3201 } 3202 } else { 3203 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3204 return (EINVAL); 3205 3206 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3207 vcpu_notify_event(vm, vcpuid); 3208 } 3209 return (0); 3210 } 3211 3212 int 3213 vm_resume_cpu(struct vm *vm, int vcpuid) 3214 { 3215 3216 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3217 return (EINVAL); 3218 3219 if (vcpuid == -1) { 3220 CPU_ZERO(&vm->debug_cpus); 3221 } else { 3222 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3223 return (EINVAL); 3224 3225 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3226 } 3227 return (0); 3228 } 3229 3230 static bool 3231 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3232 uint64_t entry_rip) 3233 { 3234 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3235 struct vm_exit *vme = &vcpu->exitinfo; 3236 bool bail = false; 3237 3238 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3239 3240 if (vm->suspend) { 3241 if (on_entry) { 3242 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3243 vm->suspend < VM_SUSPEND_LAST); 3244 3245 vme->exitcode = VM_EXITCODE_SUSPENDED; 3246 vme->u.suspended.how = vm->suspend; 3247 } else { 3248 /* 3249 * Handling VM suspend is complicated, so if that 3250 * condition is detected outside of VM-entry itself, 3251 * just emit a BOGUS exitcode so we take a lap to pick 3252 * up the event during an entry and are directed into 3253 * the vm_handle_suspend() logic. 3254 */ 3255 vme->exitcode = VM_EXITCODE_BOGUS; 3256 } 3257 bail = true; 3258 } 3259 if (vcpu->reqidle) { 3260 vme->exitcode = VM_EXITCODE_REQIDLE; 3261 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3262 3263 if (!on_entry) { 3264 /* 3265 * A reqidle request detected outside of VM-entry can be 3266 * handled directly by clearing the request (and taking 3267 * a lap to userspace). 
*/ 3269 vcpu_assert_locked(vcpu); 3270 vcpu->reqidle = 0; 3271 } 3272 bail = true; 3273 } 3274 if (vcpu_should_yield(vm, vcpuid)) { 3275 vme->exitcode = VM_EXITCODE_BOGUS; 3276 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3277 bail = true; 3278 } 3279 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3280 vme->exitcode = VM_EXITCODE_DEBUG; 3281 bail = true; 3282 } 3283 3284 if (bail) { 3285 if (on_entry) { 3286 /* 3287 * If bailing out during VM-entry, the current %rip must 3288 * be recorded in the exitinfo. 3289 */ 3290 vme->rip = entry_rip; 3291 } 3292 vme->inst_length = 0; 3293 } 3294 return (bail); 3295 } 3296 3297 static bool 3298 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3299 { 3300 /* 3301 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or 3302 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3303 * structure, and we would only modify the exitcode. 3304 */ 3305 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3306 } 3307 3308 bool 3309 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3310 { 3311 /* 3312 * Bail-out checks done as part of VM entry require an updated %rip to 3313 * populate the vm_exit struct if any of the conditions of interest are 3314 * matched in the check. 3315 */ 3316 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3317 } 3318 3319 cpuset_t 3320 vm_active_cpus(struct vm *vm) 3321 { 3322 3323 return (vm->active_cpus); 3324 } 3325 3326 cpuset_t 3327 vm_debug_cpus(struct vm *vm) 3328 { 3329 3330 return (vm->debug_cpus); 3331 } 3332 3333 cpuset_t 3334 vm_suspended_cpus(struct vm *vm) 3335 { 3336 3337 return (vm->suspended_cpus); 3338 } 3339 3340 void * 3341 vcpu_stats(struct vm *vm, int vcpuid) 3342 { 3343 3344 return (vm->vcpu[vcpuid].stats); 3345 } 3346 3347 int 3348 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3349 { 3350 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3351 return (EINVAL); 3352 3353 *state = vm->vcpu[vcpuid].x2apic_state; 3354 3355 return (0); 3356 } 3357 3358 int 3359 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3360 { 3361 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3362 return (EINVAL); 3363 3364 if (state >= X2APIC_STATE_LAST) 3365 return (EINVAL); 3366 3367 vm->vcpu[vcpuid].x2apic_state = state; 3368 3369 vlapic_set_x2apic_state(vm, vcpuid, state); 3370 3371 return (0); 3372 } 3373 3374 /* 3375 * This function is called to ensure that a vcpu "sees" a pending event 3376 * as soon as possible: 3377 * - If the vcpu thread is sleeping then it is woken up. 3378 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3379 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3380 */ 3381 static void 3382 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3383 { 3384 int hostcpu; 3385 3386 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT); 3387 3388 hostcpu = vcpu->hostcpu; 3389 if (vcpu->state == VCPU_RUNNING) { 3390 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3391 if (hostcpu != curcpu) { 3392 if (ntype == VCPU_NOTIFY_APIC) { 3393 vlapic_post_intr(vcpu->vlapic, hostcpu); 3394 } else { 3395 poke_cpu(hostcpu); 3396 } 3397 } else { 3398 /* 3399 * If the 'vcpu' is running on 'curcpu' then it must 3400 * be sending a notification to itself (e.g. SELF_IPI). 3401 * The pending event will be picked up when the vcpu 3402 * transitions back to guest context.
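 *
 * Summarizing the cases handled by this routine: a vcpu RUNNING on
 * another host CPU is either sent a posted interrupt (VCPU_NOTIFY_APIC)
 * or poked with an IPI to force an exit; a vcpu RUNNING on the current
 * CPU needs no prodding; a SLEEPING vcpu is woken via its condition
 * variable; and a vcpu in any other state is expected to observe the
 * event when it next attempts to enter the guest.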
3403 */ 3404 } 3405 } else { 3406 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3407 "with hostcpu %d", vcpu->state, hostcpu)); 3408 if (vcpu->state == VCPU_SLEEPING) { 3409 cv_signal(&vcpu->vcpu_cv); 3410 } 3411 } 3412 } 3413 3414 void 3415 vcpu_notify_event(struct vm *vm, int vcpuid) 3416 { 3417 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3418 3419 vcpu_lock(vcpu); 3420 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3421 vcpu_unlock(vcpu); 3422 } 3423 3424 void 3425 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3426 { 3427 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3428 3429 if (ntype == VCPU_NOTIFY_NONE) { 3430 return; 3431 } 3432 3433 vcpu_lock(vcpu); 3434 vcpu_notify_event_locked(vcpu, ntype); 3435 vcpu_unlock(vcpu); 3436 } 3437 3438 void 3439 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3440 { 3441 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3442 hrtime_t now = gethrtime(); 3443 3444 ASSERT3U(ustate, !=, vcpu->ustate); 3445 ASSERT3S(ustate, <, VU_MAX); 3446 ASSERT3S(ustate, >=, VU_INIT); 3447 3448 hrtime_t delta = now - vcpu->ustate_when; 3449 vcpu->ustate_total[vcpu->ustate] += delta; 3450 3451 membar_producer(); 3452 3453 vcpu->ustate_when = now; 3454 vcpu->ustate = ustate; 3455 } 3456 3457 struct vmspace * 3458 vm_get_vmspace(struct vm *vm) 3459 { 3460 3461 return (vm->vmspace); 3462 } 3463 3464 struct vm_client * 3465 vm_get_vmclient(struct vm *vm, int vcpuid) 3466 { 3467 return (vm->vcpu[vcpuid].vmclient); 3468 } 3469 3470 int 3471 vm_apicid2vcpuid(struct vm *vm, int apicid) 3472 { 3473 /* 3474 * XXX apic id is assumed to be numerically identical to vcpu id 3475 */ 3476 return (apicid); 3477 } 3478 3479 struct vatpic * 3480 vm_atpic(struct vm *vm) 3481 { 3482 return (vm->vatpic); 3483 } 3484 3485 struct vatpit * 3486 vm_atpit(struct vm *vm) 3487 { 3488 return (vm->vatpit); 3489 } 3490 3491 struct vpmtmr * 3492 vm_pmtmr(struct vm *vm) 3493 { 3494 3495 return (vm->vpmtmr); 3496 } 3497 3498 struct vrtc * 3499 vm_rtc(struct vm *vm) 3500 { 3501 3502 return (vm->vrtc); 3503 } 3504 3505 enum vm_reg_name 3506 vm_segment_name(int seg) 3507 { 3508 static enum vm_reg_name seg_names[] = { 3509 VM_REG_GUEST_ES, 3510 VM_REG_GUEST_CS, 3511 VM_REG_GUEST_SS, 3512 VM_REG_GUEST_DS, 3513 VM_REG_GUEST_FS, 3514 VM_REG_GUEST_GS 3515 }; 3516 3517 KASSERT(seg >= 0 && seg < nitems(seg_names), 3518 ("%s: invalid segment encoding %d", __func__, seg)); 3519 return (seg_names[seg]); 3520 } 3521 3522 void 3523 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3524 uint_t num_copyinfo) 3525 { 3526 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3527 if (copyinfo[idx].cookie != NULL) { 3528 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3529 } 3530 } 3531 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3532 } 3533 3534 int 3535 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3536 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3537 uint_t num_copyinfo, int *fault) 3538 { 3539 uint_t idx, nused; 3540 size_t n, off, remaining; 3541 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3542 3543 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3544 3545 nused = 0; 3546 remaining = len; 3547 while (remaining > 0) { 3548 uint64_t gpa; 3549 int error; 3550 3551 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3552 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3553 if (error || *fault) 3554 return (error); 3555 off = gpa & PAGEOFFSET; 3556 n 
= min(remaining, PAGESIZE - off); 3557 copyinfo[nused].gpa = gpa; 3558 copyinfo[nused].len = n; 3559 remaining -= n; 3560 gla += n; 3561 nused++; 3562 } 3563 3564 for (idx = 0; idx < nused; idx++) { 3565 vm_page_t *vmp; 3566 caddr_t hva; 3567 3568 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3569 if (vmp == NULL) { 3570 break; 3571 } 3572 if ((prot & PROT_WRITE) != 0) { 3573 hva = (caddr_t)vmp_get_writable(vmp); 3574 } else { 3575 hva = (caddr_t)vmp_get_readable(vmp); 3576 } 3577 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3578 copyinfo[idx].cookie = vmp; 3579 copyinfo[idx].prot = prot; 3580 } 3581 3582 if (idx != nused) { 3583 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3584 return (EFAULT); 3585 } else { 3586 *fault = 0; 3587 return (0); 3588 } 3589 } 3590 3591 void 3592 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3593 size_t len) 3594 { 3595 char *dst; 3596 int idx; 3597 3598 dst = kaddr; 3599 idx = 0; 3600 while (len > 0) { 3601 ASSERT(copyinfo[idx].prot & PROT_READ); 3602 3603 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3604 len -= copyinfo[idx].len; 3605 dst += copyinfo[idx].len; 3606 idx++; 3607 } 3608 } 3609 3610 void 3611 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3612 struct vm_copyinfo *copyinfo, size_t len) 3613 { 3614 const char *src; 3615 int idx; 3616 3617 src = kaddr; 3618 idx = 0; 3619 while (len > 0) { 3620 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3621 3622 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3623 len -= copyinfo[idx].len; 3624 src += copyinfo[idx].len; 3625 idx++; 3626 } 3627 } 3628 3629 /* 3630 * Return the amount of in-use and wired memory for the VM. Since 3631 * these are global stats, only return the values with for vCPU 0 3632 */ 3633 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3634 3635 static void 3636 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3637 { 3638 if (vcpu == 0) { 3639 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3640 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3641 } 3642 } 3643 3644 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3645 3646 int 3647 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3648 uint8_t bytes, uint32_t *val) 3649 { 3650 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3651 } 3652 3653 /* 3654 * bhyve-internal interfaces to attach or detach IO port handlers. 3655 * Must be called with VM write lock held for safety. 3656 */ 3657 int 3658 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3659 void **cookie) 3660 { 3661 int err; 3662 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3663 if (err == 0) { 3664 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3665 } 3666 return (err); 3667 } 3668 int 3669 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3670 void **old_arg) 3671 { 3672 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3673 int err; 3674 3675 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3676 if (err == 0) { 3677 *cookie = NULL; 3678 } 3679 return (err); 3680 } 3681 3682 /* 3683 * External driver interfaces to attach or detach IO port handlers. 3684 * Must be called with VM write lock held for safety. 
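 *
 * A hypothetical driver-side caller (the port number and handler names
 * below are illustrative only, not part of this file) might use the pair
 * roughly as follows:
 *
 *	void *cookie;
 *	if (vm_ioport_hook(vm, 0x510, my_port_handler, my_arg,
 *	    &cookie) == 0) {
 *		...
 *		vm_ioport_unhook(vm, &cookie);
 *	}
 *
 * Port 0 is rejected, and the cookie must be handed back unmodified to
 * vm_ioport_unhook().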
3685 */ 3686 int 3687 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3688 void *arg, void **cookie) 3689 { 3690 int err; 3691 3692 if (port == 0) { 3693 return (EINVAL); 3694 } 3695 3696 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3697 if (err == 0) { 3698 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3699 } 3700 return (err); 3701 } 3702 void 3703 vm_ioport_unhook(struct vm *vm, void **cookie) 3704 { 3705 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3706 ioport_handler_t old_func; 3707 void *old_arg; 3708 int err; 3709 3710 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3711 3712 /* ioport-hook-using drivers are expected to be well-behaved */ 3713 VERIFY0(err); 3714 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3715 3716 *cookie = NULL; 3717 } 3718 3719 int 3720 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3721 { 3722 struct vm *vm = ksp->ks_private; 3723 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3724 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3725 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3726 3727 ASSERT3U(vcpuid, <, VM_MAXCPU); 3728 3729 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3730 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3731 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3732 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3733 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3734 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3735 3736 return (0); 3737 } 3738 3739 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3740 3741 static inline bool 3742 vmm_data_is_cpu_specific(uint16_t data_class) 3743 { 3744 switch (data_class) { 3745 case VDC_REGISTER: 3746 case VDC_MSR: 3747 case VDC_FPU: 3748 case VDC_LAPIC: 3749 return (true); 3750 default: 3751 return (false); 3752 } 3753 } 3754 3755 static int 3756 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) 3757 { 3758 const vmm_data_version_entry_t **vdpp, *vdp; 3759 3760 ASSERT(resp != NULL); 3761 ASSERT(req->vdr_result_len != NULL); 3762 3763 SET_FOREACH(vdpp, vmm_data_version_entries) { 3764 vdp = *vdpp; 3765 if (vdp->vdve_class == req->vdr_class && 3766 vdp->vdve_version == req->vdr_version) { 3767 /* 3768 * Enforce any data length expectation expressed by the 3769 * provider for this data. 3770 */ 3771 if (vdp->vdve_len_expect != 0 && 3772 vdp->vdve_len_expect > req->vdr_len) { 3773 *req->vdr_result_len = vdp->vdve_len_expect; 3774 return (ENOSPC); 3775 } 3776 *resp = vdp; 3777 return (0); 3778 } 3779 } 3780 return (EINVAL); 3781 } 3782 3783 static void * 3784 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3785 { 3786 switch (req->vdr_class) { 3787 /* per-cpu data/devices */ 3788 case VDC_LAPIC: 3789 return (vm_lapic(vm, vcpuid)); 3790 case VDC_VMM_ARCH: 3791 return (vm); 3792 3793 case VDC_FPU: 3794 case VDC_REGISTER: 3795 case VDC_MSR: 3796 /* 3797 * These have per-CPU handling which is dispatched outside 3798 * vmm_data_version_entries listing. 
3799 */ 3800 return (NULL); 3801 3802 /* system-wide data/devices */ 3803 case VDC_IOAPIC: 3804 return (vm->vioapic); 3805 case VDC_ATPIT: 3806 return (vm->vatpit); 3807 case VDC_ATPIC: 3808 return (vm->vatpic); 3809 case VDC_HPET: 3810 return (vm->vhpet); 3811 case VDC_PM_TIMER: 3812 return (vm->vpmtmr); 3813 case VDC_RTC: 3814 return (vm->vrtc); 3815 3816 default: 3817 /* The data class will have been validated by now */ 3818 panic("Unexpected class %u", req->vdr_class); 3819 } 3820 } 3821 3822 const uint32_t arch_msr_iter[] = { 3823 MSR_EFER, 3824 3825 /* 3826 * While gsbase and fsbase are accessible via the MSR accessors, they 3827 * are not included in MSR iteration since they are covered by the 3828 * segment descriptor interface too. 3829 */ 3830 MSR_KGSBASE, 3831 3832 MSR_STAR, 3833 MSR_LSTAR, 3834 MSR_CSTAR, 3835 MSR_SF_MASK, 3836 3837 MSR_SYSENTER_CS_MSR, 3838 MSR_SYSENTER_ESP_MSR, 3839 MSR_SYSENTER_EIP_MSR, 3840 MSR_PAT, 3841 }; 3842 const uint32_t generic_msr_iter[] = { 3843 MSR_TSC, 3844 MSR_MTRRcap, 3845 MSR_MTRRdefType, 3846 3847 MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, 3848 MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, 3849 MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, 3850 3851 MSR_MTRR16kBase, MSR_MTRR16kBase + 1, 3852 3853 MSR_MTRR64kBase, 3854 }; 3855 3856 static int 3857 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3858 { 3859 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3860 VERIFY3U(req->vdr_version, ==, 1); 3861 3862 const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) 3863 + (VMM_MTRR_VAR_MAX * 2); 3864 const uint32_t output_len = 3865 num_msrs * sizeof (struct vdi_field_entry_v1); 3866 *req->vdr_result_len = output_len; 3867 3868 if (req->vdr_len < output_len) { 3869 return (ENOSPC); 3870 } 3871 3872 struct vdi_field_entry_v1 *entryp = req->vdr_data; 3873 for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { 3874 const uint32_t msr = arch_msr_iter[i]; 3875 uint64_t val = 0; 3876 3877 int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); 3878 /* All of these MSRs are expected to work */ 3879 VERIFY0(err); 3880 entryp->vfe_ident = msr; 3881 entryp->vfe_value = val; 3882 } 3883 3884 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3885 for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { 3886 const uint32_t msr = generic_msr_iter[i]; 3887 3888 entryp->vfe_ident = msr; 3889 switch (msr) { 3890 case MSR_TSC: 3891 /* 3892 * Communicate this as the difference from the VM-wide 3893 * offset of the boot time. 3894 */ 3895 entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; 3896 break; 3897 case MSR_MTRRcap: 3898 case MSR_MTRRdefType: 3899 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 3900 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 3901 case MSR_MTRR64kBase: { 3902 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3903 VERIFY0(err); 3904 break; 3905 } 3906 default: 3907 panic("unexpected msr export %x", msr); 3908 } 3909 } 3910 /* Copy the variable MTRRs */ 3911 for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { 3912 const uint32_t msr = MSR_MTRRVarBase + i; 3913 3914 entryp->vfe_ident = msr; 3915 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3916 VERIFY0(err); 3917 } 3918 return (0); 3919 } 3920 3921 static int 3922 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3923 { 3924 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3925 VERIFY3U(req->vdr_version, ==, 1); 3926 3927 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 3928 const uint_t entry_count = 3929 req->vdr_len / sizeof (struct vdi_field_entry_v1); 3930 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3931 3932 /* 3933 * First make sure that all of the MSRs can be manipulated. 3934 * For now, this check is done by going though the getmsr handler 3935 */ 3936 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3937 const uint32_t msr = entryp->vfe_ident; 3938 uint64_t val; 3939 int err = 0; 3940 3941 switch (msr) { 3942 case MSR_TSC: 3943 break; 3944 default: 3945 if (is_mtrr_msr(msr)) { 3946 err = vm_rdmtrr(mtrr, msr, &val); 3947 } else { 3948 err = ops->vmgetmsr(vm->cookie, vcpuid, msr, 3949 &val); 3950 } 3951 break; 3952 } 3953 if (err != 0) { 3954 return (err); 3955 } 3956 } 3957 3958 /* 3959 * Fairly confident that all of the 'set' operations are at least 3960 * targeting valid MSRs, continue on. 3961 */ 3962 entryp = req->vdr_data; 3963 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3964 const uint32_t msr = entryp->vfe_ident; 3965 const uint64_t val = entryp->vfe_value; 3966 int err = 0; 3967 3968 switch (msr) { 3969 case MSR_TSC: 3970 vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; 3971 break; 3972 default: 3973 if (is_mtrr_msr(msr)) { 3974 if (msr == MSR_MTRRcap) { 3975 /* 3976 * MTRRcap is read-only. If the current 3977 * value matches the incoming one, 3978 * consider it a success 3979 */ 3980 uint64_t comp; 3981 err = vm_rdmtrr(mtrr, msr, &comp); 3982 if (err != 0 || comp != val) { 3983 err = EINVAL; 3984 } 3985 } else { 3986 err = vm_wrmtrr(mtrr, msr, val); 3987 } 3988 } else { 3989 err = ops->vmsetmsr(vm->cookie, vcpuid, msr, 3990 val); 3991 } 3992 break; 3993 } 3994 if (err != 0) { 3995 return (err); 3996 } 3997 } 3998 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 3999 4000 return (0); 4001 } 4002 4003 static const vmm_data_version_entry_t msr_v1 = { 4004 .vdve_class = VDC_MSR, 4005 .vdve_version = 1, 4006 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4007 /* Requires backend-specific dispatch */ 4008 .vdve_readf = NULL, 4009 .vdve_writef = NULL, 4010 }; 4011 VMM_DATA_VERSION(msr_v1); 4012 4013 static const uint32_t vmm_arch_v1_fields[] = { 4014 VAI_TSC_BOOT_OFFSET, 4015 VAI_BOOT_HRTIME, 4016 VAI_TSC_FREQ, 4017 }; 4018 4019 static bool 4020 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) 4021 { 4022 ASSERT(valp != NULL); 4023 4024 switch (ident) { 4025 case VAI_TSC_BOOT_OFFSET: 4026 *valp = vm->boot_tsc_offset; 4027 return (true); 4028 case VAI_BOOT_HRTIME: 4029 *valp = vm->boot_hrtime; 4030 return (true); 4031 case VAI_TSC_FREQ: 4032 /* 4033 * Since the system TSC calibration is not public, just derive 4034 * it from the scaling functions available. 
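 *
 * Converting one second worth of hrtime (NANOSEC nanoseconds) back into
 * unscaled ticks yields a ticks-per-second figure, which is the TSC
 * frequency reported below.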
4035 */ 4036 *valp = unscalehrtime(NANOSEC); 4037 return (true); 4038 default: 4039 break; 4040 } 4041 return (false); 4042 } 4043 4044 static int 4045 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) 4046 { 4047 struct vm *vm = arg; 4048 4049 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4050 VERIFY3U(req->vdr_version, ==, 1); 4051 4052 struct vdi_field_entry_v1 *entryp = req->vdr_data; 4053 4054 /* Specific fields requested */ 4055 if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { 4056 const uint_t count = 4057 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4058 4059 for (uint_t i = 0; i < count; i++, entryp++) { 4060 if (!vmm_read_arch_field(vm, entryp->vfe_ident, 4061 &entryp->vfe_value)) { 4062 return (EINVAL); 4063 } 4064 } 4065 *req->vdr_result_len = 4066 count * sizeof (struct vdi_field_entry_v1); 4067 return (0); 4068 } 4069 4070 /* Emit all of the possible values */ 4071 const uint32_t total_size = nitems(vmm_arch_v1_fields) * 4072 sizeof (struct vdi_field_entry_v1); 4073 *req->vdr_result_len = total_size; 4074 if (req->vdr_len < total_size) { 4075 return (ENOSPC); 4076 } 4077 for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { 4078 entryp->vfe_ident = vmm_arch_v1_fields[i]; 4079 VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, 4080 &entryp->vfe_value)); 4081 } 4082 return (0); 4083 } 4084 4085 static int 4086 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) 4087 { 4088 struct vm *vm = arg; 4089 4090 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4091 VERIFY3U(req->vdr_version, ==, 1); 4092 4093 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 4094 const uint_t entry_count = 4095 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4096 4097 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4098 const uint64_t val = entryp->vfe_value; 4099 4100 switch (entryp->vfe_ident) { 4101 case VAI_TSC_BOOT_OFFSET: 4102 vm->boot_tsc_offset = val; 4103 break; 4104 case VAI_BOOT_HRTIME: 4105 vm->boot_hrtime = val; 4106 break; 4107 case VAI_TSC_FREQ: 4108 /* Guest TSC frequency not (currently) adjustable */ 4109 return (EPERM); 4110 default: 4111 return (EINVAL); 4112 } 4113 } 4114 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4115 return (0); 4116 } 4117 4118 static const vmm_data_version_entry_t vmm_arch_v1 = { 4119 .vdve_class = VDC_VMM_ARCH, 4120 .vdve_version = 1, 4121 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4122 .vdve_readf = vmm_data_read_vmm_arch, 4123 .vdve_writef = vmm_data_write_vmm_arch, 4124 }; 4125 VMM_DATA_VERSION(vmm_arch_v1); 4126 4127 static int 4128 vmm_data_read_versions(void *arg, const vmm_data_req_t *req) 4129 { 4130 VERIFY3U(req->vdr_class, ==, VDC_VERSION); 4131 VERIFY3U(req->vdr_version, ==, 1); 4132 4133 const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * 4134 sizeof (struct vdi_version_entry_v1); 4135 4136 /* Make sure there is room for all of the entries */ 4137 *req->vdr_result_len = total_size; 4138 if (req->vdr_len < *req->vdr_result_len) { 4139 return (ENOSPC); 4140 } 4141 4142 struct vdi_version_entry_v1 *entryp = req->vdr_data; 4143 const vmm_data_version_entry_t **vdpp; 4144 SET_FOREACH(vdpp, vmm_data_version_entries) { 4145 const vmm_data_version_entry_t *vdp = *vdpp; 4146 4147 entryp->vve_class = vdp->vdve_class; 4148 entryp->vve_version = vdp->vdve_version; 4149 entryp->vve_len_expect = vdp->vdve_len_expect; 4150 entryp->vve_len_per_item = vdp->vdve_len_per_item; 4151 entryp++; 4152 } 4153 return (0); 4154 } 4155 4156 static int 4157 
vmm_data_write_versions(void *arg, const vmm_data_req_t *req) 4158 { 4159 /* Writing to the version information makes no sense */ 4160 return (EPERM); 4161 } 4162 4163 static const vmm_data_version_entry_t versions_v1 = { 4164 .vdve_class = VDC_VERSION, 4165 .vdve_version = 1, 4166 .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), 4167 .vdve_readf = vmm_data_read_versions, 4168 .vdve_writef = vmm_data_write_versions, 4169 }; 4170 VMM_DATA_VERSION(versions_v1); 4171 4172 int 4173 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4174 { 4175 int err = 0; 4176 4177 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4178 if (vcpuid >= VM_MAXCPU) { 4179 return (EINVAL); 4180 } 4181 } 4182 4183 const vmm_data_version_entry_t *entry = NULL; 4184 err = vmm_data_find(req, &entry); 4185 if (err != 0) { 4186 return (err); 4187 } 4188 ASSERT(entry != NULL); 4189 4190 void *datap = vmm_data_from_class(req, vm, vcpuid); 4191 if (datap != NULL) { 4192 err = entry->vdve_readf(datap, req); 4193 4194 /* 4195 * Successful reads of fixed-length data should populate the 4196 * length of that result. 4197 */ 4198 if (err == 0 && entry->vdve_len_expect != 0) { 4199 *req->vdr_result_len = entry->vdve_len_expect; 4200 } 4201 } else { 4202 switch (req->vdr_class) { 4203 case VDC_MSR: 4204 err = vmm_data_read_msrs(vm, vcpuid, req); 4205 break; 4206 case VDC_FPU: 4207 /* TODO: wire up to xsave export via hma_fpu iface */ 4208 err = EINVAL; 4209 break; 4210 case VDC_REGISTER: 4211 default: 4212 err = EINVAL; 4213 break; 4214 } 4215 } 4216 4217 return (err); 4218 } 4219 4220 int 4221 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4222 { 4223 int err = 0; 4224 4225 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4226 if (vcpuid >= VM_MAXCPU) { 4227 return (EINVAL); 4228 } 4229 } 4230 4231 const vmm_data_version_entry_t *entry = NULL; 4232 err = vmm_data_find(req, &entry); 4233 if (err != 0) { 4234 return (err); 4235 } 4236 ASSERT(entry != NULL); 4237 4238 void *datap = vmm_data_from_class(req, vm, vcpuid); 4239 if (datap != NULL) { 4240 err = entry->vdve_writef(datap, req); 4241 /* 4242 * Successful writes of fixed-length data should populate the 4243 * length of that result. 4244 */ 4245 if (err == 0 && entry->vdve_len_expect != 0) { 4246 *req->vdr_result_len = entry->vdve_len_expect; 4247 } 4248 } else { 4249 switch (req->vdr_class) { 4250 case VDC_MSR: 4251 err = vmm_data_write_msrs(vm, vcpuid, req); 4252 break; 4253 case VDC_FPU: 4254 /* TODO: wire up to xsave import via hma_fpu iface */ 4255 err = EINVAL; 4256 break; 4257 case VDC_REGISTER: 4258 default: 4259 err = EINVAL; 4260 break; 4261 } 4262 } 4263 4264 return (err); 4265 } 4266
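/*
 * Illustrative sketch (not a definitive interface contract) of how a caller
 * might drive the vmm-data read path above to enumerate the supported data
 * versions; `buf' and `buflen' stand in for a caller-supplied buffer:
 *
 *	uint32_t result_len = 0;
 *	vmm_data_req_t req = {
 *		.vdr_class = VDC_VERSION,
 *		.vdr_version = 1,
 *		.vdr_len = buflen,
 *		.vdr_data = buf,
 *		.vdr_result_len = &result_len,
 *	};
 *
 *	if (vmm_data_read(vm, 0, &req) == ENOSPC) {
 *		(the caller reallocates: result_len bytes are required)
 *	}
 */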