/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_data.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
	uint64_t	exc_pending;	/* (i) exception pending */
	bool		nmi_pending;	/* (i) NMI pending */
	bool		extint_pending;	/* (i) INTR pending */

	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
	vcpu_cpuid_config_t cpuid_cfg;	/* (x) cpuid configuration */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */

	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */

	struct ioport_config ioports;		/* (o) ioport handling */

	bool		mem_transient;		/* (o) alloc transient memory */
	bool		is_paused;		/* (i) instance is paused */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init		= (vmm_init_func_t)nullop_panic,
	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
	.resume		= (vmm_resume_func_t)nullop_panic,
	.vminit		= (vmi_init_func_t)nullop_panic,
	.vmrun		= (vmi_run_func_t)nullop_panic,
	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg	= (vmi_get_register_t)nullop_panic,
	.vmsetreg	= (vmi_set_register_t)nullop_panic,
	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx	= (vmi_savectx)nullop_panic,
	.vmrestorectx	= (vmi_restorectx)nullop_panic,
	.vmgetmsr	= (vmi_get_msr_t)nullop_panic,
	.vmsetmsr	= (vmi_set_msr_t)nullop_panic,
};
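/*
 * Editorial note (illustrative, not part of the original source): `ops`
 * remains pointed at vmm_ops_null until vmm_init() selects a real backend
 * (vmm_ops_intel for VMX or vmm_ops_amd for SVM), so any dispatch through
 * the VMM_*()/VM*() macros below before module initialization panics loudly
 * rather than silently dereferencing missing state.
 */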
static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)		((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= vmm_savectx,
	.ct_restore	= vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();
		vcpu_cpuid_init(&vcpu->cpuid_cfg);

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has necessary hardware
 * and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0). Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets. A reading of the TSC is negated to form the boot offset.
	 */
	const uint64_t boot_tsc = rdtsc_offset();
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);

	/* Convert the boot TSC reading to hrtime */
	vm->boot_hrtime = (hrtime_t)boot_tsc;
	scalehrtime(&vm->boot_hrtime);
}
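/*
 * Illustrative example of the offset math above (not part of the original
 * source): if the host TSC reads 1000 at vm_init(), boot_tsc_offset becomes
 * (uint64_t)-1000.  A later host TSC reading of 2500 plus that offset wraps
 * around to 1500, i.e. the number of ticks elapsed since the guest "booted".
 */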
/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted. Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

bool
vm_is_paused(struct vm *vm)
{
	return (vm->is_paused);
}

int
vm_pause_instance(struct vm *vm)
{
	if (vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = true;

	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_pause(vcpu->vlapic);
	}
	vhpet_pause(vm->vhpet);
	vatpit_pause(vm->vatpit);
	vrtc_pause(vm->vrtc);

	return (0);
}

int
vm_resume_instance(struct vm *vm)
{
	if (!vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = false;

	vrtc_resume(vm->vrtc);
	vatpit_resume(vm->vatpit);
	vhpet_resume(vm->vhpet);
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_resume(vcpu->vlapic);
	}

	return (0);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);	/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);	/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}
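/*
 * Illustrative usage (not from the original source): guest RAM is typically
 * established by first allocating a segment and then mapping it at a
 * guest-physical address, e.g.
 *
 *	vm_alloc_memseg(vm, 0, 1024 * 1024 * 1024, true);
 *	vm_mmap_memseg(vm, 0, 0, 0, 1024 * 1024 * 1024, PROT_ALL, 0);
 *
 * which would back GPA range [0, 1 GiB) with sysmem segment 0.
 */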
int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			/*
			 * When originally ported from FreeBSD, the logic for
			 * adding memory to the guest domain would
			 * simultaneously remove it from the host domain. The
			 * justification for that is not clear, and FreeBSD has
			 * subsequently changed the behavior to not remove the
			 * memory from the host domain.
			 *
			 * Leaving the guest memory in the host domain for the
			 * life of the VM is necessary to make it available for
			 * DMA, such as through viona in the TX path.
			 */
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_XCR0:
		*retval = vcpu->guest_xcr0;
		return (0);
	default:
		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
	}
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	int error;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_RIP:
		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error == 0) {
			vcpu->nextrip = val;
		}
		return (error);
	case VM_REG_GUEST_XCR0:
		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
			return (EINVAL);
		}
		vcpu->guest_xcr0 = val;
		return (0);
	default:
		return (VMSETREG(vm->cookie, vcpuid, reg, val));
	}
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}
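/*
 * Editorial note (illustrative, not from the original source): the
 * fpu_start_emulating()/fpu_stop_emulating() pair used above works by
 * toggling CR0.TS.  With TS set, any FPU instruction executed on the host
 * raises #NM, so a stray host touch of the FPU while the guest's state is
 * loaded traps instead of silently corrupting that state; clts() clears TS
 * again once the swap back to host state is complete.
 */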
static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	const int ftype = vme->u.paging.fault_type;

	ASSERT0(vme->inst_length);
	ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC);

	if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) {
		/*
		 * If the fault cannot be serviced, kick it out to userspace
		 * for handling (or more likely, halting the instance).
		 */
		return (-1);
	}

	return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;
	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.
			 * No further work is required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which has not
		 * completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure. Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry. No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}
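/*
 * Illustrative example (not from the original source): the variable-range
 * MTRR MSRs come in (PHYSBASE, PHYSMASK) pairs, so an even offset from
 * MSR_MTRRVarBase selects var[offset / 2].base and an odd offset selects
 * var[offset / 2].mask.  For instance, MSR_MTRRVarBase + 3 addresses the
 * PHYSMASK register of the second variable range, i.e. var[1].mask.
 */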
static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static bool
is_mtrr_msr(uint32_t msr)
{
	switch (msr) {
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		return (true);
	default:
		return (false);
	}
}

static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit. (Both VMX and
		 * SVM pass through read-only access to MSR_TSC to the guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read). The guest TSC value
		 * is calculated from a global offset for the guest (which
		 * effectively makes its TSC read 0 at guest boot) and a
		 * per-vCPU offset to handle these writes to the MSR.
		 *
		 * To calculate that per-vCPU offset, we can work backwards
		 * from the guest value at the time of write:
		 *
		 *	value = host TSC + VM boot offset + vCPU offset
		 *
		 * so therefore:
		 *
		 *	value - host TSC - VM boot offset = vCPU offset
		 */
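		/*
		 * Worked example (illustrative, not from the original source):
		 * if the host TSC reads 5000, the VM boot offset is -1000
		 * (the guest booted when the host TSC read 1000), and the
		 * guest writes 100 to MSR_TSC, then the per-vCPU offset
		 * becomes 100 - 5000 - (-1000) = -3900, so later guest reads
		 * of the TSC resume counting upward from 100.
		 */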
The guest TSC value 2113 * is calculated from a global offset for the guest (which 2114 * effectively makes its TSC read 0 at guest boot) and a 2115 * per-vCPU offset to handle these writes to the MSR. 2116 * 2117 * To calculate that per-vCPU offset, we can work backwards from 2118 * the guest value at the time of write: 2119 * 2120 * value = host TSC + VM boot offset + vCPU offset 2121 * 2122 * so therefore: 2123 * 2124 * value - host TSC - VM boot offset = vCPU offset 2125 */ 2126 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 2127 break; 2128 2129 default: 2130 /* 2131 * Anything not handled at this point will be kicked out to 2132 * userspace for attempted processing there. 2133 */ 2134 return (-1); 2135 } 2136 2137 return (0); 2138 } 2139 2140 int 2141 vm_suspend(struct vm *vm, enum vm_suspend_how how) 2142 { 2143 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 2144 return (EINVAL); 2145 2146 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 2147 return (EALREADY); 2148 } 2149 2150 /* 2151 * Notify all active vcpus that they are now suspended. 2152 */ 2153 for (uint_t i = 0; i < vm->maxcpus; i++) { 2154 struct vcpu *vcpu = &vm->vcpu[i]; 2155 2156 vcpu_lock(vcpu); 2157 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 2158 /* 2159 * Any vCPUs not actively running or in HLT can be 2160 * marked as suspended immediately. 2161 */ 2162 if (CPU_ISSET(i, &vm->active_cpus)) { 2163 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 2164 } 2165 } else { 2166 /* 2167 * Those which are running or in HLT will pick up the 2168 * suspended state after notification. 2169 */ 2170 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2171 } 2172 vcpu_unlock(vcpu); 2173 } 2174 return (0); 2175 } 2176 2177 void 2178 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2179 { 2180 struct vm_exit *vmexit; 2181 2182 vmexit = vm_exitinfo(vm, vcpuid); 2183 vmexit->rip = rip; 2184 vmexit->inst_length = 0; 2185 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2186 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2187 } 2188 2189 /* 2190 * Some vmm resources, such as the lapic, may have CPU-specific resources 2191 * allocated to them which would benefit from migration onto the host CPU which 2192 * is processing the vcpu state. 2193 */ 2194 static void 2195 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2196 { 2197 /* 2198 * Localizing cyclic resources requires acquisition of cpu_lock, and 2199 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2200 */ 2201 VERIFY(curthread->t_preempt == 0); 2202 2203 /* 2204 * Do not bother with localization if this vCPU is about to return to 2205 * the host CPU it was last localized to. 2206 */ 2207 if (vcpu->lastloccpu == curcpu) 2208 return; 2209 2210 /* 2211 * Localize system-wide resources to the primary boot vCPU. While any 2212 * of the other vCPUs may access them, it keeps the potential interrupt 2213 * footprint constrained to CPUs involved with this instance. 
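 * (The per-vCPU vLAPIC resources, by contrast, are localized below for
 * whichever vCPU is entering, not just for vCPU 0.)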
2214 */ 2215 if (vcpu == &vm->vcpu[0]) { 2216 vhpet_localize_resources(vm->vhpet); 2217 vrtc_localize_resources(vm->vrtc); 2218 vatpit_localize_resources(vm->vatpit); 2219 } 2220 2221 vlapic_localize_resources(vcpu->vlapic); 2222 2223 vcpu->lastloccpu = curcpu; 2224 } 2225 2226 static void 2227 vmm_savectx(void *arg) 2228 { 2229 vm_thread_ctx_t *vtc = arg; 2230 struct vm *vm = vtc->vtc_vm; 2231 const int vcpuid = vtc->vtc_vcpuid; 2232 2233 if (ops->vmsavectx != NULL) { 2234 ops->vmsavectx(vm->cookie, vcpuid); 2235 } 2236 2237 /* 2238 * Account for going off-cpu, unless the vCPU is idled, where being 2239 * off-cpu is the explicit point. 2240 */ 2241 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2242 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2243 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2244 } 2245 2246 /* 2247 * If the CPU holds the restored guest FPU state, save it and restore 2248 * the host FPU state before this thread goes off-cpu. 2249 */ 2250 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2251 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2252 2253 save_guest_fpustate(vcpu); 2254 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2255 } 2256 } 2257 2258 static void 2259 vmm_restorectx(void *arg) 2260 { 2261 vm_thread_ctx_t *vtc = arg; 2262 struct vm *vm = vtc->vtc_vm; 2263 const int vcpuid = vtc->vtc_vcpuid; 2264 2265 /* Complete microstate accounting for vCPU being off-cpu */ 2266 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2267 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2268 } 2269 2270 /* 2271 * When coming back on-cpu, only restore the guest FPU status if the 2272 * thread is in a context marked as requiring it. This should be rare, 2273 * occurring only when a future logic error results in a voluntary 2274 * sleep during the VMRUN critical section. 2275 * 2276 * The common case will result in elision of the guest FPU state 2277 * restoration, deferring that action until it is clearly necessary 2278 * during vm_run. 
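 * (vm_run() performs that restore itself just before VMRUN, inside its
 * critical section, and only then marks the context as FPU-critical.)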
2279 */ 2280 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2281 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2282 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2283 2284 restore_guest_fpustate(vcpu); 2285 vtc->vtc_status |= VTCS_FPU_RESTORED; 2286 } 2287 2288 if (ops->vmrestorectx != NULL) { 2289 ops->vmrestorectx(vm->cookie, vcpuid); 2290 } 2291 2292 } 2293 2294 static int 2295 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2296 struct vm_exit *vme) 2297 { 2298 struct vcpu *vcpu; 2299 struct vie *vie; 2300 int err; 2301 2302 vcpu = &vm->vcpu[vcpuid]; 2303 vie = vcpu->vie_ctx; 2304 err = 0; 2305 2306 switch (entry->cmd) { 2307 case VEC_DEFAULT: 2308 return (0); 2309 case VEC_DISCARD_INSTR: 2310 vie_reset(vie); 2311 return (0); 2312 case VEC_FULFILL_MMIO: 2313 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2314 if (err == 0) { 2315 err = vie_emulate_mmio(vie, vm, vcpuid); 2316 if (err == 0) { 2317 vie_advance_pc(vie, &vcpu->nextrip); 2318 } else if (err < 0) { 2319 vie_exitinfo(vie, vme); 2320 } else if (err == EAGAIN) { 2321 /* 2322 * Clear the instruction emulation state in 2323 * order to re-enter VM context and continue 2324 * this 'rep <instruction>' 2325 */ 2326 vie_reset(vie); 2327 err = 0; 2328 } 2329 } 2330 break; 2331 case VEC_FULFILL_INOUT: 2332 err = vie_fulfill_inout(vie, &entry->u.inout); 2333 if (err == 0) { 2334 err = vie_emulate_inout(vie, vm, vcpuid); 2335 if (err == 0) { 2336 vie_advance_pc(vie, &vcpu->nextrip); 2337 } else if (err < 0) { 2338 vie_exitinfo(vie, vme); 2339 } else if (err == EAGAIN) { 2340 /* 2341 * Clear the instruction emulation state in 2342 * order to re-enter VM context and continue 2343 * this 'rep ins/outs' 2344 */ 2345 vie_reset(vie); 2346 err = 0; 2347 } 2348 } 2349 break; 2350 default: 2351 return (EINVAL); 2352 } 2353 return (err); 2354 } 2355 2356 static int 2357 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2358 { 2359 struct vie *vie; 2360 2361 vie = vm->vcpu[vcpuid].vie_ctx; 2362 2363 if (vie_pending(vie)) { 2364 /* 2365 * Userspace has not fulfilled the pending needs of the 2366 * instruction emulation, so bail back out. 2367 */ 2368 vie_exitinfo(vie, vme); 2369 return (-1); 2370 } 2371 2372 return (0); 2373 } 2374 2375 int 2376 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2377 { 2378 int error; 2379 struct vcpu *vcpu; 2380 struct vm_exit *vme; 2381 bool intr_disabled; 2382 int affinity_type = CPU_CURRENT; 2383 2384 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2385 return (EINVAL); 2386 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2387 return (EINVAL); 2388 2389 vcpu = &vm->vcpu[vcpuid]; 2390 vme = &vcpu->exitinfo; 2391 2392 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2393 2394 vcpu->vtc.vtc_status = 0; 2395 ctxop_attach(curthread, vcpu->ctxop); 2396 2397 error = vm_entry_actions(vm, vcpuid, entry, vme); 2398 if (error != 0) { 2399 goto exit; 2400 } 2401 2402 restart: 2403 error = vm_loop_checks(vm, vcpuid, vme); 2404 if (error != 0) { 2405 goto exit; 2406 } 2407 2408 thread_affinity_set(curthread, affinity_type); 2409 /* 2410 * Resource localization should happen after the CPU affinity for the 2411 * thread has been set to ensure that access from restricted contexts, 2412 * such as VMX-accelerated APIC operations, can occur without inducing 2413 * cyclic cross-calls. 2414 * 2415 * This must be done prior to disabling kpreempt via critical_enter(). 
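 * (vm_localize_resources() verifies that curthread->t_preempt == 0 for this
 * reason: relocating cyclics may require cpu_lock, which must not be
 * acquired with preemption disabled.)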
2416 */ 2417 vm_localize_resources(vm, vcpu); 2418 affinity_type = CPU_CURRENT; 2419 critical_enter(); 2420 2421 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2422 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2423 2424 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2425 restore_guest_fpustate(vcpu); 2426 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2427 } 2428 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2429 2430 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2431 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2432 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2433 2434 /* 2435 * Once clear of the delicate contexts comprising the VM_RUN handler, 2436 * thread CPU affinity can be loosened while other processing occurs. 2437 */ 2438 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2439 thread_affinity_clear(curthread); 2440 critical_exit(); 2441 2442 if (error != 0) { 2443 /* Communicate out any error from VMRUN() above */ 2444 goto exit; 2445 } 2446 2447 vcpu->nextrip = vme->rip + vme->inst_length; 2448 switch (vme->exitcode) { 2449 case VM_EXITCODE_REQIDLE: 2450 error = vm_handle_reqidle(vm, vcpuid); 2451 break; 2452 case VM_EXITCODE_RUN_STATE: 2453 error = vm_handle_run_state(vm, vcpuid); 2454 break; 2455 case VM_EXITCODE_SUSPENDED: 2456 error = vm_handle_suspend(vm, vcpuid); 2457 break; 2458 case VM_EXITCODE_IOAPIC_EOI: 2459 vioapic_process_eoi(vm, vcpuid, 2460 vme->u.ioapic_eoi.vector); 2461 break; 2462 case VM_EXITCODE_HLT: 2463 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2464 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2465 break; 2466 case VM_EXITCODE_PAGING: 2467 error = vm_handle_paging(vm, vcpuid); 2468 break; 2469 case VM_EXITCODE_MMIO_EMUL: 2470 error = vm_handle_mmio_emul(vm, vcpuid); 2471 break; 2472 case VM_EXITCODE_INOUT: 2473 error = vm_handle_inout(vm, vcpuid, vme); 2474 break; 2475 case VM_EXITCODE_INST_EMUL: 2476 error = vm_handle_inst_emul(vm, vcpuid); 2477 break; 2478 case VM_EXITCODE_MONITOR: 2479 case VM_EXITCODE_MWAIT: 2480 case VM_EXITCODE_VMINSN: 2481 vm_inject_ud(vm, vcpuid); 2482 break; 2483 case VM_EXITCODE_RDMSR: 2484 error = vm_handle_rdmsr(vm, vcpuid, vme); 2485 break; 2486 case VM_EXITCODE_WRMSR: 2487 error = vm_handle_wrmsr(vm, vcpuid, vme); 2488 break; 2489 case VM_EXITCODE_HT: 2490 affinity_type = CPU_BEST; 2491 break; 2492 case VM_EXITCODE_MTRAP: 2493 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2494 error = -1; 2495 break; 2496 default: 2497 /* handled in userland */ 2498 error = -1; 2499 break; 2500 } 2501 2502 if (error == 0) { 2503 /* VM exit conditions handled in-kernel, continue running */ 2504 goto restart; 2505 } 2506 2507 exit: 2508 kpreempt_disable(); 2509 ctxop_detach(curthread, vcpu->ctxop); 2510 /* Make sure all of the needed vCPU context state is saved */ 2511 vmm_savectx(&vcpu->vtc); 2512 kpreempt_enable(); 2513 2514 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2515 return (error); 2516 } 2517 2518 int 2519 vm_restart_instruction(void *arg, int vcpuid) 2520 { 2521 struct vm *vm; 2522 struct vcpu *vcpu; 2523 enum vcpu_state state; 2524 uint64_t rip; 2525 int error; 2526 2527 vm = arg; 2528 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2529 return (EINVAL); 2530 2531 vcpu = &vm->vcpu[vcpuid]; 2532 state = vcpu_get_state(vm, vcpuid, NULL); 2533 if (state == VCPU_RUNNING) { 2534 /* 2535 * When a vcpu is "running" the next instruction is determined 2536 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
2537 * Thus setting 'inst_length' to zero will cause the current 2538 * instruction to be restarted. 2539 */ 2540 vcpu->exitinfo.inst_length = 0; 2541 } else if (state == VCPU_FROZEN) { 2542 /* 2543 * When a vcpu is "frozen" it is outside the critical section 2544 * around VMRUN() and 'nextrip' points to the next instruction. 2545 * Thus instruction restart is achieved by setting 'nextrip' 2546 * to the vcpu's %rip. 2547 */ 2548 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2549 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2550 vcpu->nextrip = rip; 2551 } else { 2552 panic("%s: invalid state %d", __func__, state); 2553 } 2554 return (0); 2555 } 2556 2557 int 2558 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2559 { 2560 struct vcpu *vcpu; 2561 2562 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2563 return (EINVAL); 2564 2565 vcpu = &vm->vcpu[vcpuid]; 2566 2567 if (VM_INTINFO_PENDING(info)) { 2568 const uint32_t type = VM_INTINFO_TYPE(info); 2569 const uint8_t vector = VM_INTINFO_VECTOR(info); 2570 2571 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2572 return (EINVAL); 2573 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2574 return (EINVAL); 2575 if (info & VM_INTINFO_MASK_RSVD) 2576 return (EINVAL); 2577 } else { 2578 info = 0; 2579 } 2580 vcpu->exit_intinfo = info; 2581 return (0); 2582 } 2583 2584 enum exc_class { 2585 EXC_BENIGN, 2586 EXC_CONTRIBUTORY, 2587 EXC_PAGEFAULT 2588 }; 2589 2590 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2591 2592 static enum exc_class 2593 exception_class(uint64_t info) 2594 { 2595 ASSERT(VM_INTINFO_PENDING(info)); 2596 2597 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2598 switch (VM_INTINFO_TYPE(info)) { 2599 case VM_INTINFO_HWINTR: 2600 case VM_INTINFO_SWINTR: 2601 case VM_INTINFO_NMI: 2602 return (EXC_BENIGN); 2603 default: 2604 /* 2605 * Hardware exception. 2606 * 2607 * SVM and VT-x use identical type values to represent NMI, 2608 * hardware interrupt and software interrupt. 2609 * 2610 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2611 * for exceptions except #BP and #OF. #BP and #OF use a type 2612 * value of '5' or '6'. Therefore we don't check for explicit 2613 * values of 'type' to classify 'intinfo' into a hardware 2614 * exception. 2615 */ 2616 break; 2617 } 2618 2619 switch (VM_INTINFO_VECTOR(info)) { 2620 case IDT_PF: 2621 case IDT_VE: 2622 return (EXC_PAGEFAULT); 2623 case IDT_DE: 2624 case IDT_TS: 2625 case IDT_NP: 2626 case IDT_SS: 2627 case IDT_GP: 2628 return (EXC_CONTRIBUTORY); 2629 default: 2630 return (EXC_BENIGN); 2631 } 2632 } 2633 2634 /* 2635 * Fetch event pending injection into the guest, if one exists. 2636 * 2637 * Returns true if an event is to be injected (which is placed in `retinfo`). 2638 */ 2639 bool 2640 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2641 { 2642 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2643 const uint64_t info1 = vcpu->exit_intinfo; 2644 vcpu->exit_intinfo = 0; 2645 const uint64_t info2 = vcpu->exc_pending; 2646 vcpu->exc_pending = 0; 2647 2648 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2649 /* 2650 * If an exception occurs while attempting to call the 2651 * double-fault handler the processor enters shutdown mode 2652 * (aka triple fault). 
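 * Below, the classification from Table 6-5 decides how a second event is
 * handled: e.g. a #GP raised while delivering a #NP (both contributory) is
 * promoted to #DF, while a #PF raised while delivering a benign #DB leaves
 * the two to be delivered serially instead.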
2653 */
2654 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2655 VM_INTINFO_VECTOR(info1) == IDT_DF) {
2656 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2657 *retinfo = 0;
2658 return (false);
2659 }
2660 /*
2661 * "Conditions for Generating a Double Fault"
2662 * Intel SDM, Vol3, Table 6-5
2663 */
2664 const enum exc_class exc1 = exception_class(info1);
2665 const enum exc_class exc2 = exception_class(info2);
2666 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2667 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2668 /* Convert nested fault into a double fault. */
2669 *retinfo =
2670 VM_INTINFO_VALID |
2671 VM_INTINFO_DEL_ERRCODE |
2672 VM_INTINFO_HWEXCP |
2673 IDT_DF;
2674 } else {
2675 /* Handle exceptions serially */
2676 vcpu->exit_intinfo = info1;
2677 *retinfo = info2;
2678 }
2679 return (true);
2680 } else if (VM_INTINFO_PENDING(info1)) {
2681 *retinfo = info1;
2682 return (true);
2683 } else if (VM_INTINFO_PENDING(info2)) {
2684 *retinfo = info2;
2685 return (true);
2686 }
2687
2688 return (false);
2689 }
2690
2691 int
2692 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2693 {
2694 struct vcpu *vcpu;
2695
2696 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2697 return (EINVAL);
2698
2699 vcpu = &vm->vcpu[vcpuid];
2700 *info1 = vcpu->exit_intinfo;
2701 *info2 = vcpu->exc_pending;
2702 return (0);
2703 }
2704
2705 int
2706 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2707 bool errcode_valid, uint32_t errcode, bool restart_instruction)
2708 {
2709 struct vcpu *vcpu;
2710 uint64_t regval;
2711 int error;
2712
2713 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2714 return (EINVAL);
2715
2716 if (vector >= 32)
2717 return (EINVAL);
2718
2719 /*
2720 * NMIs are to be injected via their own specialized path using
2721 * vm_inject_nmi().
2722 */
2723 if (vector == IDT_NMI) {
2724 return (EINVAL);
2725 }
2726
2727 /*
2728 * A double fault exception should never be injected directly into
2729 * the guest. It is a derived exception that results from specific
2730 * combinations of nested faults.
2731 */
2732 if (vector == IDT_DF) {
2733 return (EINVAL);
2734 }
2735
2736 vcpu = &vm->vcpu[vcpuid];
2737
2738 if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2739 /* Unable to inject exception due to one already pending */
2740 return (EBUSY);
2741 }
2742
2743 if (errcode_valid) {
2744 /*
2745 * Exceptions don't deliver an error code in real mode.
2746 */
2747 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2748 VERIFY0(error);
2749 if ((regval & CR0_PE) == 0) {
2750 errcode_valid = false;
2751 }
2752 }
2753
2754 /*
2755 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2756 *
2757 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2758 * one instruction or incurs an exception.
2759 */ 2760 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2761 VERIFY0(error); 2762 2763 if (restart_instruction) { 2764 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2765 } 2766 2767 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2768 if (errcode_valid) { 2769 val |= VM_INTINFO_DEL_ERRCODE; 2770 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2771 } 2772 vcpu->exc_pending = val; 2773 return (0); 2774 } 2775 2776 void 2777 vm_inject_ud(struct vm *vm, int vcpuid) 2778 { 2779 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2780 } 2781 2782 void 2783 vm_inject_gp(struct vm *vm, int vcpuid) 2784 { 2785 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2786 } 2787 2788 void 2789 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2790 { 2791 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2792 } 2793 2794 void 2795 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2796 { 2797 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2798 } 2799 2800 void 2801 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2802 { 2803 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2804 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2805 } 2806 2807 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2808 2809 int 2810 vm_inject_nmi(struct vm *vm, int vcpuid) 2811 { 2812 struct vcpu *vcpu; 2813 2814 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2815 return (EINVAL); 2816 2817 vcpu = &vm->vcpu[vcpuid]; 2818 2819 vcpu->nmi_pending = true; 2820 vcpu_notify_event(vm, vcpuid); 2821 return (0); 2822 } 2823 2824 bool 2825 vm_nmi_pending(struct vm *vm, int vcpuid) 2826 { 2827 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2828 2829 return (vcpu->nmi_pending); 2830 } 2831 2832 void 2833 vm_nmi_clear(struct vm *vm, int vcpuid) 2834 { 2835 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2836 2837 ASSERT(vcpu->nmi_pending); 2838 2839 vcpu->nmi_pending = false; 2840 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2841 } 2842 2843 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2844 2845 int 2846 vm_inject_extint(struct vm *vm, int vcpuid) 2847 { 2848 struct vcpu *vcpu; 2849 2850 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2851 return (EINVAL); 2852 2853 vcpu = &vm->vcpu[vcpuid]; 2854 2855 vcpu->extint_pending = true; 2856 vcpu_notify_event(vm, vcpuid); 2857 return (0); 2858 } 2859 2860 bool 2861 vm_extint_pending(struct vm *vm, int vcpuid) 2862 { 2863 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2864 2865 return (vcpu->extint_pending); 2866 } 2867 2868 void 2869 vm_extint_clear(struct vm *vm, int vcpuid) 2870 { 2871 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2872 2873 ASSERT(vcpu->extint_pending); 2874 2875 vcpu->extint_pending = false; 2876 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2877 } 2878 2879 int 2880 vm_inject_init(struct vm *vm, int vcpuid) 2881 { 2882 struct vcpu *vcpu; 2883 2884 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2885 return (EINVAL); 2886 2887 vcpu = &vm->vcpu[vcpuid]; 2888 vcpu_lock(vcpu); 2889 vcpu->run_state |= VRS_PEND_INIT; 2890 /* 2891 * As part of queuing the INIT request, clear any pending SIPI. It 2892 * would not otherwise survive across the reset of the vCPU when it 2893 * undergoes the requested INIT. We would not want it to linger when it 2894 * could be mistaken as a subsequent (after the INIT) SIPI request. 
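 * (The usual MP start-up sequence is an INIT followed by one or two SIPIs;
 * only a SIPI sent after this INIT has been handled should take effect.)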
2895 */ 2896 vcpu->run_state &= ~VRS_PEND_SIPI; 2897 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2898 2899 vcpu_unlock(vcpu); 2900 return (0); 2901 } 2902 2903 int 2904 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2905 { 2906 struct vcpu *vcpu; 2907 2908 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2909 return (EINVAL); 2910 2911 vcpu = &vm->vcpu[vcpuid]; 2912 vcpu_lock(vcpu); 2913 vcpu->run_state |= VRS_PEND_SIPI; 2914 vcpu->sipi_vector = vector; 2915 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2916 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2917 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2918 } 2919 vcpu_unlock(vcpu); 2920 return (0); 2921 } 2922 2923 bool 2924 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2925 { 2926 struct vcpu *vcpu; 2927 2928 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2929 vcpu = &vm->vcpu[vcpuid]; 2930 2931 /* Of interest: vCPU not in running state or with pending INIT */ 2932 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2933 } 2934 2935 int 2936 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2937 { 2938 struct seg_desc desc; 2939 const enum vm_reg_name clear_regs[] = { 2940 VM_REG_GUEST_CR2, 2941 VM_REG_GUEST_CR3, 2942 VM_REG_GUEST_CR4, 2943 VM_REG_GUEST_RAX, 2944 VM_REG_GUEST_RBX, 2945 VM_REG_GUEST_RCX, 2946 VM_REG_GUEST_RSI, 2947 VM_REG_GUEST_RDI, 2948 VM_REG_GUEST_RBP, 2949 VM_REG_GUEST_RSP, 2950 VM_REG_GUEST_R8, 2951 VM_REG_GUEST_R9, 2952 VM_REG_GUEST_R10, 2953 VM_REG_GUEST_R11, 2954 VM_REG_GUEST_R12, 2955 VM_REG_GUEST_R13, 2956 VM_REG_GUEST_R14, 2957 VM_REG_GUEST_R15, 2958 VM_REG_GUEST_DR0, 2959 VM_REG_GUEST_DR1, 2960 VM_REG_GUEST_DR2, 2961 VM_REG_GUEST_DR3, 2962 VM_REG_GUEST_EFER, 2963 }; 2964 const enum vm_reg_name data_segs[] = { 2965 VM_REG_GUEST_SS, 2966 VM_REG_GUEST_DS, 2967 VM_REG_GUEST_ES, 2968 VM_REG_GUEST_FS, 2969 VM_REG_GUEST_GS, 2970 }; 2971 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2972 2973 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2974 return (EINVAL); 2975 2976 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2977 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2978 } 2979 2980 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2981 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2982 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2983 2984 /* 2985 * The prescribed contents of %rdx differ slightly between the Intel and 2986 * AMD architectural definitions. The former expects the Extended Model 2987 * in bits 16-19 where the latter expects all the Family, Model, and 2988 * Stepping be there. Common boot ROMs appear to disregard this 2989 * anyways, so we stick with a compromise value similar to what is 2990 * spelled out in the Intel SDM. 
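 * (The 0x600 used below decodes as family 6, model 0, stepping 0 in the
 * CPUID-signature layout that %edx holds at reset.)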
2991 */
2992 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2993
2994 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2995 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2996
2997 /* CS: Present, R/W, Accessed */
2998 desc.access = 0x0093;
2999 desc.base = 0xffff0000;
3000 desc.limit = 0xffff;
3001 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3002 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
3003
3004 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */
3005 desc.access = 0x0093;
3006 desc.base = 0;
3007 desc.limit = 0xffff;
3008 for (uint_t i = 0; i < nitems(data_segs); i++) {
3009 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
3010 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
3011 }
3012
3013 /* GDTR, IDTR */
3014 desc.base = 0;
3015 desc.limit = 0xffff;
3016 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
3017 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
3018
3019 /* LDTR: Present, LDT */
3020 desc.access = 0x0082;
3021 desc.base = 0;
3022 desc.limit = 0xffff;
3023 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
3024 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
3025
3026 /* TR: Present, 32-bit TSS */
3027 desc.access = 0x008b;
3028 desc.base = 0;
3029 desc.limit = 0xffff;
3030 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
3031 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
3032
3033 vlapic_reset(vm_lapic(vm, vcpuid));
3034
3035 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
3036
3037 vcpu->exit_intinfo = 0;
3038 vcpu->exc_pending = 0;
3039 vcpu->nmi_pending = false;
3040 vcpu->extint_pending = false;
3041
3042 /*
3043 * A CPU reset caused by power-on or system reset clears more state than
3044 * one which is triggered from an INIT IPI.
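 * For example, the guest FPU contents, %xcr0, and the MTRRs handled below
 * are reinitialized only on a full reset and survive an INIT unchanged.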
3045 */ 3046 if (!init_only) { 3047 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 3048 (void) hma_fpu_init(vcpu->guestfpu); 3049 3050 /* XXX: clear MSRs and other pieces */ 3051 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 3052 } 3053 3054 return (0); 3055 } 3056 3057 static int 3058 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3059 { 3060 struct seg_desc desc; 3061 3062 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3063 return (EINVAL); 3064 3065 /* CS: Present, R/W, Accessed */ 3066 desc.access = 0x0093; 3067 desc.base = (uint64_t)vector << 12; 3068 desc.limit = 0xffff; 3069 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3070 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3071 (uint64_t)vector << 8)); 3072 3073 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3074 3075 return (0); 3076 } 3077 3078 int 3079 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3080 { 3081 if (vcpu < 0 || vcpu >= vm->maxcpus) 3082 return (EINVAL); 3083 3084 if (type < 0 || type >= VM_CAP_MAX) 3085 return (EINVAL); 3086 3087 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3088 } 3089 3090 int 3091 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3092 { 3093 if (vcpu < 0 || vcpu >= vm->maxcpus) 3094 return (EINVAL); 3095 3096 if (type < 0 || type >= VM_CAP_MAX) 3097 return (EINVAL); 3098 3099 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3100 } 3101 3102 vcpu_cpuid_config_t * 3103 vm_cpuid_config(struct vm *vm, int vcpuid) 3104 { 3105 ASSERT3S(vcpuid, >=, 0); 3106 ASSERT3S(vcpuid, <, VM_MAXCPU); 3107 3108 return (&vm->vcpu[vcpuid].cpuid_cfg); 3109 } 3110 3111 struct vlapic * 3112 vm_lapic(struct vm *vm, int cpu) 3113 { 3114 ASSERT3S(cpu, >=, 0); 3115 ASSERT3S(cpu, <, VM_MAXCPU); 3116 3117 return (vm->vcpu[cpu].vlapic); 3118 } 3119 3120 struct vioapic * 3121 vm_ioapic(struct vm *vm) 3122 { 3123 3124 return (vm->vioapic); 3125 } 3126 3127 struct vhpet * 3128 vm_hpet(struct vm *vm) 3129 { 3130 3131 return (vm->vhpet); 3132 } 3133 3134 void * 3135 vm_iommu_domain(struct vm *vm) 3136 { 3137 3138 return (vm->iommu); 3139 } 3140 3141 int 3142 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3143 bool from_idle) 3144 { 3145 int error; 3146 struct vcpu *vcpu; 3147 3148 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3149 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3150 3151 vcpu = &vm->vcpu[vcpuid]; 3152 3153 vcpu_lock(vcpu); 3154 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3155 vcpu_unlock(vcpu); 3156 3157 return (error); 3158 } 3159 3160 enum vcpu_state 3161 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3162 { 3163 struct vcpu *vcpu; 3164 enum vcpu_state state; 3165 3166 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3167 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3168 3169 vcpu = &vm->vcpu[vcpuid]; 3170 3171 vcpu_lock(vcpu); 3172 state = vcpu->state; 3173 if (hostcpu != NULL) 3174 *hostcpu = vcpu->hostcpu; 3175 vcpu_unlock(vcpu); 3176 3177 return (state); 3178 } 3179 3180 uint64_t 3181 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3182 { 3183 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3184 3185 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3186 3187 if (phys_adj) { 3188 /* Include any offset for the current physical CPU too */ 3189 extern hrtime_t tsc_gethrtime_tick_delta(void); 3190 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3191 } 3192 3193 return (vcpu_off); 3194 } 3195 3196 /* Normalize hrtime against the boot time for a VM */ 3197 hrtime_t 3198 
vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3199 { 3200 /* To avoid underflow/overflow UB, perform math as unsigned */ 3201 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3202 } 3203 3204 /* Denormalize hrtime against the boot time for a VM */ 3205 hrtime_t 3206 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3207 { 3208 /* To avoid underflow/overflow UB, perform math as unsigned */ 3209 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3210 } 3211 3212 int 3213 vm_activate_cpu(struct vm *vm, int vcpuid) 3214 { 3215 3216 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3217 return (EINVAL); 3218 3219 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3220 return (EBUSY); 3221 3222 if (vm->suspend != 0) { 3223 return (EBUSY); 3224 } 3225 3226 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3227 3228 /* 3229 * It is possible that this vCPU was undergoing activation at the same 3230 * time that the VM was being suspended. If that happens to be the 3231 * case, it should reflect the suspended state immediately. 3232 */ 3233 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3234 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3235 } 3236 3237 return (0); 3238 } 3239 3240 int 3241 vm_suspend_cpu(struct vm *vm, int vcpuid) 3242 { 3243 int i; 3244 3245 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3246 return (EINVAL); 3247 3248 if (vcpuid == -1) { 3249 vm->debug_cpus = vm->active_cpus; 3250 for (i = 0; i < vm->maxcpus; i++) { 3251 if (CPU_ISSET(i, &vm->active_cpus)) 3252 vcpu_notify_event(vm, i); 3253 } 3254 } else { 3255 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3256 return (EINVAL); 3257 3258 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3259 vcpu_notify_event(vm, vcpuid); 3260 } 3261 return (0); 3262 } 3263 3264 int 3265 vm_resume_cpu(struct vm *vm, int vcpuid) 3266 { 3267 3268 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3269 return (EINVAL); 3270 3271 if (vcpuid == -1) { 3272 CPU_ZERO(&vm->debug_cpus); 3273 } else { 3274 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3275 return (EINVAL); 3276 3277 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3278 } 3279 return (0); 3280 } 3281 3282 static bool 3283 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3284 uint64_t entry_rip) 3285 { 3286 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3287 struct vm_exit *vme = &vcpu->exitinfo; 3288 bool bail = false; 3289 3290 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3291 3292 if (vm->suspend) { 3293 if (on_entry) { 3294 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3295 vm->suspend < VM_SUSPEND_LAST); 3296 3297 vme->exitcode = VM_EXITCODE_SUSPENDED; 3298 vme->u.suspended.how = vm->suspend; 3299 } else { 3300 /* 3301 * Handling VM suspend is complicated, so if that 3302 * condition is detected outside of VM-entry itself, 3303 * just emit a BOGUS exitcode so we take a lap to pick 3304 * up the event during an entry and are directed into 3305 * the vm_handle_suspend() logic. 3306 */ 3307 vme->exitcode = VM_EXITCODE_BOGUS; 3308 } 3309 bail = true; 3310 } 3311 if (vcpu->reqidle) { 3312 vme->exitcode = VM_EXITCODE_REQIDLE; 3313 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3314 3315 if (!on_entry) { 3316 /* 3317 * A reqidle request detected outside of VM-entry can be 3318 * handled directly by clearing the request (and taking 3319 * a lap to userspace). 
3320 */
3321 vcpu_assert_locked(vcpu);
3322 vcpu->reqidle = 0;
3323 }
3324 bail = true;
3325 }
3326 if (vcpu_should_yield(vm, vcpuid)) {
3327 vme->exitcode = VM_EXITCODE_BOGUS;
3328 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3329 bail = true;
3330 }
3331 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3332 vme->exitcode = VM_EXITCODE_DEBUG;
3333 bail = true;
3334 }
3335
3336 if (bail) {
3337 if (on_entry) {
3338 /*
3339 * If bailing out during VM-entry, the current %rip must
3340 * be recorded in the exitinfo.
3341 */
3342 vme->rip = entry_rip;
3343 }
3344 vme->inst_length = 0;
3345 }
3346 return (bail);
3347 }
3348
3349 static bool
3350 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3351 {
3352 /*
3353 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3354 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3355 * structure, and we would only modify the exitcode.
3356 */
3357 return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3358 }
3359
3360 bool
3361 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3362 {
3363 /*
3364 * Bail-out checks done as part of VM entry require an updated %rip to
3365 * populate the vm_exit struct if any of the conditions of interest are
3366 * matched in the check.
3367 */
3368 return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3369 }
3370
3371 cpuset_t
3372 vm_active_cpus(struct vm *vm)
3373 {
3374
3375 return (vm->active_cpus);
3376 }
3377
3378 cpuset_t
3379 vm_debug_cpus(struct vm *vm)
3380 {
3381
3382 return (vm->debug_cpus);
3383 }
3384
3385 cpuset_t
3386 vm_suspended_cpus(struct vm *vm)
3387 {
3388
3389 return (vm->suspended_cpus);
3390 }
3391
3392 void *
3393 vcpu_stats(struct vm *vm, int vcpuid)
3394 {
3395
3396 return (vm->vcpu[vcpuid].stats);
3397 }
3398
3399 int
3400 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3401 {
3402 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3403 return (EINVAL);
3404
3405 *state = vm->vcpu[vcpuid].x2apic_state;
3406
3407 return (0);
3408 }
3409
3410 int
3411 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3412 {
3413 if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3414 return (EINVAL);
3415
3416 if (state >= X2APIC_STATE_LAST)
3417 return (EINVAL);
3418
3419 vm->vcpu[vcpuid].x2apic_state = state;
3420
3421 vlapic_set_x2apic_state(vm, vcpuid, state);
3422
3423 return (0);
3424 }
3425
3426 /*
3427 * This function is called to ensure that a vcpu "sees" a pending event
3428 * as soon as possible:
3429 * - If the vcpu thread is sleeping then it is woken up.
3430 * - If the vcpu is running on a different host_cpu then an IPI will be directed
3431 * to the host_cpu to cause the vcpu to trap into the hypervisor.
3432 */
3433 static void
3434 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3435 {
3436 int hostcpu;
3437
3438 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3439
3440 hostcpu = vcpu->hostcpu;
3441 if (vcpu->state == VCPU_RUNNING) {
3442 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3443 if (hostcpu != curcpu) {
3444 if (ntype == VCPU_NOTIFY_APIC) {
3445 vlapic_post_intr(vcpu->vlapic, hostcpu);
3446 } else {
3447 poke_cpu(hostcpu);
3448 }
3449 } else {
3450 /*
3451 * If the 'vcpu' is running on 'curcpu' then it must
3452 * be sending a notification to itself (e.g. SELF_IPI).
3453 * The pending event will be picked up when the vcpu
3454 * transitions back to guest context.
3455 */ 3456 } 3457 } else { 3458 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3459 "with hostcpu %d", vcpu->state, hostcpu)); 3460 if (vcpu->state == VCPU_SLEEPING) { 3461 cv_signal(&vcpu->vcpu_cv); 3462 } 3463 } 3464 } 3465 3466 void 3467 vcpu_notify_event(struct vm *vm, int vcpuid) 3468 { 3469 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3470 3471 vcpu_lock(vcpu); 3472 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3473 vcpu_unlock(vcpu); 3474 } 3475 3476 void 3477 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3478 { 3479 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3480 3481 if (ntype == VCPU_NOTIFY_NONE) { 3482 return; 3483 } 3484 3485 vcpu_lock(vcpu); 3486 vcpu_notify_event_locked(vcpu, ntype); 3487 vcpu_unlock(vcpu); 3488 } 3489 3490 void 3491 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3492 { 3493 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3494 hrtime_t now = gethrtime(); 3495 3496 ASSERT3U(ustate, !=, vcpu->ustate); 3497 ASSERT3S(ustate, <, VU_MAX); 3498 ASSERT3S(ustate, >=, VU_INIT); 3499 3500 hrtime_t delta = now - vcpu->ustate_when; 3501 vcpu->ustate_total[vcpu->ustate] += delta; 3502 3503 membar_producer(); 3504 3505 vcpu->ustate_when = now; 3506 vcpu->ustate = ustate; 3507 } 3508 3509 struct vmspace * 3510 vm_get_vmspace(struct vm *vm) 3511 { 3512 3513 return (vm->vmspace); 3514 } 3515 3516 struct vm_client * 3517 vm_get_vmclient(struct vm *vm, int vcpuid) 3518 { 3519 return (vm->vcpu[vcpuid].vmclient); 3520 } 3521 3522 int 3523 vm_apicid2vcpuid(struct vm *vm, int apicid) 3524 { 3525 /* 3526 * XXX apic id is assumed to be numerically identical to vcpu id 3527 */ 3528 return (apicid); 3529 } 3530 3531 struct vatpic * 3532 vm_atpic(struct vm *vm) 3533 { 3534 return (vm->vatpic); 3535 } 3536 3537 struct vatpit * 3538 vm_atpit(struct vm *vm) 3539 { 3540 return (vm->vatpit); 3541 } 3542 3543 struct vpmtmr * 3544 vm_pmtmr(struct vm *vm) 3545 { 3546 3547 return (vm->vpmtmr); 3548 } 3549 3550 struct vrtc * 3551 vm_rtc(struct vm *vm) 3552 { 3553 3554 return (vm->vrtc); 3555 } 3556 3557 enum vm_reg_name 3558 vm_segment_name(int seg) 3559 { 3560 static enum vm_reg_name seg_names[] = { 3561 VM_REG_GUEST_ES, 3562 VM_REG_GUEST_CS, 3563 VM_REG_GUEST_SS, 3564 VM_REG_GUEST_DS, 3565 VM_REG_GUEST_FS, 3566 VM_REG_GUEST_GS 3567 }; 3568 3569 KASSERT(seg >= 0 && seg < nitems(seg_names), 3570 ("%s: invalid segment encoding %d", __func__, seg)); 3571 return (seg_names[seg]); 3572 } 3573 3574 void 3575 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3576 uint_t num_copyinfo) 3577 { 3578 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3579 if (copyinfo[idx].cookie != NULL) { 3580 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3581 } 3582 } 3583 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3584 } 3585 3586 int 3587 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3588 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3589 uint_t num_copyinfo, int *fault) 3590 { 3591 uint_t idx, nused; 3592 size_t n, off, remaining; 3593 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3594 3595 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3596 3597 nused = 0; 3598 remaining = len; 3599 while (remaining > 0) { 3600 uint64_t gpa; 3601 int error; 3602 3603 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3604 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3605 if (error || *fault) 3606 return (error); 3607 off = gpa & PAGEOFFSET; 3608 n 
= min(remaining, PAGESIZE - off); 3609 copyinfo[nused].gpa = gpa; 3610 copyinfo[nused].len = n; 3611 remaining -= n; 3612 gla += n; 3613 nused++; 3614 } 3615 3616 for (idx = 0; idx < nused; idx++) { 3617 vm_page_t *vmp; 3618 caddr_t hva; 3619 3620 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3621 if (vmp == NULL) { 3622 break; 3623 } 3624 if ((prot & PROT_WRITE) != 0) { 3625 hva = (caddr_t)vmp_get_writable(vmp); 3626 } else { 3627 hva = (caddr_t)vmp_get_readable(vmp); 3628 } 3629 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3630 copyinfo[idx].cookie = vmp; 3631 copyinfo[idx].prot = prot; 3632 } 3633 3634 if (idx != nused) { 3635 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3636 return (EFAULT); 3637 } else { 3638 *fault = 0; 3639 return (0); 3640 } 3641 } 3642 3643 void 3644 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3645 size_t len) 3646 { 3647 char *dst; 3648 int idx; 3649 3650 dst = kaddr; 3651 idx = 0; 3652 while (len > 0) { 3653 ASSERT(copyinfo[idx].prot & PROT_READ); 3654 3655 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3656 len -= copyinfo[idx].len; 3657 dst += copyinfo[idx].len; 3658 idx++; 3659 } 3660 } 3661 3662 void 3663 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3664 struct vm_copyinfo *copyinfo, size_t len) 3665 { 3666 const char *src; 3667 int idx; 3668 3669 src = kaddr; 3670 idx = 0; 3671 while (len > 0) { 3672 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3673 3674 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3675 len -= copyinfo[idx].len; 3676 src += copyinfo[idx].len; 3677 idx++; 3678 } 3679 } 3680 3681 /* 3682 * Return the amount of in-use and wired memory for the VM. Since 3683 * these are global stats, only return the values with for vCPU 0 3684 */ 3685 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3686 3687 static void 3688 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3689 { 3690 if (vcpu == 0) { 3691 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3692 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3693 } 3694 } 3695 3696 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3697 3698 int 3699 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3700 uint8_t bytes, uint32_t *val) 3701 { 3702 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3703 } 3704 3705 /* 3706 * bhyve-internal interfaces to attach or detach IO port handlers. 3707 * Must be called with VM write lock held for safety. 3708 */ 3709 int 3710 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3711 void **cookie) 3712 { 3713 int err; 3714 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3715 if (err == 0) { 3716 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3717 } 3718 return (err); 3719 } 3720 int 3721 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3722 void **old_arg) 3723 { 3724 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3725 int err; 3726 3727 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3728 if (err == 0) { 3729 *cookie = NULL; 3730 } 3731 return (err); 3732 } 3733 3734 /* 3735 * External driver interfaces to attach or detach IO port handlers. 3736 * Must be called with VM write lock held for safety. 
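 * A hypothetical driver (handler and softc names below are illustrative
 * only) would pair these roughly as:
 *
 *	void *cookie;
 *	if (vm_ioport_hook(vm, port, mydrv_ioport_handler, sc, &cookie) != 0)
 *		(fail the attach)
 *	...
 *	vm_ioport_unhook(vm, &cookie);	(at detach; the cookie is cleared)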
3737 */ 3738 int 3739 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3740 void *arg, void **cookie) 3741 { 3742 int err; 3743 3744 if (port == 0) { 3745 return (EINVAL); 3746 } 3747 3748 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3749 if (err == 0) { 3750 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3751 } 3752 return (err); 3753 } 3754 void 3755 vm_ioport_unhook(struct vm *vm, void **cookie) 3756 { 3757 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3758 ioport_handler_t old_func; 3759 void *old_arg; 3760 int err; 3761 3762 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3763 3764 /* ioport-hook-using drivers are expected to be well-behaved */ 3765 VERIFY0(err); 3766 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3767 3768 *cookie = NULL; 3769 } 3770 3771 int 3772 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3773 { 3774 struct vm *vm = ksp->ks_private; 3775 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3776 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3777 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3778 3779 ASSERT3U(vcpuid, <, VM_MAXCPU); 3780 3781 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3782 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3783 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3784 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3785 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3786 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3787 3788 return (0); 3789 } 3790 3791 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3792 3793 static inline bool 3794 vmm_data_is_cpu_specific(uint16_t data_class) 3795 { 3796 switch (data_class) { 3797 case VDC_REGISTER: 3798 case VDC_MSR: 3799 case VDC_FPU: 3800 case VDC_LAPIC: 3801 return (true); 3802 default: 3803 return (false); 3804 } 3805 } 3806 3807 static int 3808 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) 3809 { 3810 const vmm_data_version_entry_t **vdpp, *vdp; 3811 3812 ASSERT(resp != NULL); 3813 ASSERT(req->vdr_result_len != NULL); 3814 3815 SET_FOREACH(vdpp, vmm_data_version_entries) { 3816 vdp = *vdpp; 3817 if (vdp->vdve_class == req->vdr_class && 3818 vdp->vdve_version == req->vdr_version) { 3819 /* 3820 * Enforce any data length expectation expressed by the 3821 * provider for this data. 3822 */ 3823 if (vdp->vdve_len_expect != 0 && 3824 vdp->vdve_len_expect > req->vdr_len) { 3825 *req->vdr_result_len = vdp->vdve_len_expect; 3826 return (ENOSPC); 3827 } 3828 *resp = vdp; 3829 return (0); 3830 } 3831 } 3832 return (EINVAL); 3833 } 3834 3835 static void * 3836 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3837 { 3838 switch (req->vdr_class) { 3839 /* per-cpu data/devices */ 3840 case VDC_LAPIC: 3841 return (vm_lapic(vm, vcpuid)); 3842 case VDC_VMM_ARCH: 3843 return (vm); 3844 3845 case VDC_FPU: 3846 case VDC_REGISTER: 3847 case VDC_MSR: 3848 /* 3849 * These have per-CPU handling which is dispatched outside 3850 * vmm_data_version_entries listing. 
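 * (When this returns NULL, vmm_data_read() and vmm_data_write() dispatch
 * VDC_MSR to the MSR helpers below; VDC_FPU and VDC_REGISTER are not yet
 * wired up there and fail with EINVAL.)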
3851 */ 3852 return (NULL); 3853 3854 /* system-wide data/devices */ 3855 case VDC_IOAPIC: 3856 return (vm->vioapic); 3857 case VDC_ATPIT: 3858 return (vm->vatpit); 3859 case VDC_ATPIC: 3860 return (vm->vatpic); 3861 case VDC_HPET: 3862 return (vm->vhpet); 3863 case VDC_PM_TIMER: 3864 return (vm->vpmtmr); 3865 case VDC_RTC: 3866 return (vm->vrtc); 3867 3868 default: 3869 /* The data class will have been validated by now */ 3870 panic("Unexpected class %u", req->vdr_class); 3871 } 3872 } 3873 3874 const uint32_t arch_msr_iter[] = { 3875 MSR_EFER, 3876 3877 /* 3878 * While gsbase and fsbase are accessible via the MSR accessors, they 3879 * are not included in MSR iteration since they are covered by the 3880 * segment descriptor interface too. 3881 */ 3882 MSR_KGSBASE, 3883 3884 MSR_STAR, 3885 MSR_LSTAR, 3886 MSR_CSTAR, 3887 MSR_SF_MASK, 3888 3889 MSR_SYSENTER_CS_MSR, 3890 MSR_SYSENTER_ESP_MSR, 3891 MSR_SYSENTER_EIP_MSR, 3892 MSR_PAT, 3893 }; 3894 const uint32_t generic_msr_iter[] = { 3895 MSR_TSC, 3896 MSR_MTRRcap, 3897 MSR_MTRRdefType, 3898 3899 MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, 3900 MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, 3901 MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, 3902 3903 MSR_MTRR16kBase, MSR_MTRR16kBase + 1, 3904 3905 MSR_MTRR64kBase, 3906 }; 3907 3908 static int 3909 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3910 { 3911 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3912 VERIFY3U(req->vdr_version, ==, 1); 3913 3914 const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) 3915 + (VMM_MTRR_VAR_MAX * 2); 3916 const uint32_t output_len = 3917 num_msrs * sizeof (struct vdi_field_entry_v1); 3918 *req->vdr_result_len = output_len; 3919 3920 if (req->vdr_len < output_len) { 3921 return (ENOSPC); 3922 } 3923 3924 struct vdi_field_entry_v1 *entryp = req->vdr_data; 3925 for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { 3926 const uint32_t msr = arch_msr_iter[i]; 3927 uint64_t val = 0; 3928 3929 int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); 3930 /* All of these MSRs are expected to work */ 3931 VERIFY0(err); 3932 entryp->vfe_ident = msr; 3933 entryp->vfe_value = val; 3934 } 3935 3936 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3937 for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { 3938 const uint32_t msr = generic_msr_iter[i]; 3939 3940 entryp->vfe_ident = msr; 3941 switch (msr) { 3942 case MSR_TSC: 3943 /* 3944 * Communicate this as the difference from the VM-wide 3945 * offset of the boot time. 3946 */ 3947 entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; 3948 break; 3949 case MSR_MTRRcap: 3950 case MSR_MTRRdefType: 3951 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 3952 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 3953 case MSR_MTRR64kBase: { 3954 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3955 VERIFY0(err); 3956 break; 3957 } 3958 default: 3959 panic("unexpected msr export %x", msr); 3960 } 3961 } 3962 /* Copy the variable MTRRs */ 3963 for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { 3964 const uint32_t msr = MSR_MTRRVarBase + i; 3965 3966 entryp->vfe_ident = msr; 3967 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3968 VERIFY0(err); 3969 } 3970 return (0); 3971 } 3972 3973 static int 3974 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3975 { 3976 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3977 VERIFY3U(req->vdr_version, ==, 1); 3978 3979 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 3980 const uint_t entry_count = 3981 req->vdr_len / sizeof (struct vdi_field_entry_v1); 3982 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3983 3984 /* 3985 * First make sure that all of the MSRs can be manipulated. 3986 * For now, this check is done by going though the getmsr handler 3987 */ 3988 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3989 const uint32_t msr = entryp->vfe_ident; 3990 uint64_t val; 3991 int err = 0; 3992 3993 switch (msr) { 3994 case MSR_TSC: 3995 break; 3996 default: 3997 if (is_mtrr_msr(msr)) { 3998 err = vm_rdmtrr(mtrr, msr, &val); 3999 } else { 4000 err = ops->vmgetmsr(vm->cookie, vcpuid, msr, 4001 &val); 4002 } 4003 break; 4004 } 4005 if (err != 0) { 4006 return (err); 4007 } 4008 } 4009 4010 /* 4011 * Fairly confident that all of the 'set' operations are at least 4012 * targeting valid MSRs, continue on. 4013 */ 4014 entryp = req->vdr_data; 4015 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4016 const uint32_t msr = entryp->vfe_ident; 4017 const uint64_t val = entryp->vfe_value; 4018 int err = 0; 4019 4020 switch (msr) { 4021 case MSR_TSC: 4022 vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; 4023 break; 4024 default: 4025 if (is_mtrr_msr(msr)) { 4026 if (msr == MSR_MTRRcap) { 4027 /* 4028 * MTRRcap is read-only. If the current 4029 * value matches the incoming one, 4030 * consider it a success 4031 */ 4032 uint64_t comp; 4033 err = vm_rdmtrr(mtrr, msr, &comp); 4034 if (err != 0 || comp != val) { 4035 err = EINVAL; 4036 } 4037 } else { 4038 err = vm_wrmtrr(mtrr, msr, val); 4039 } 4040 } else { 4041 err = ops->vmsetmsr(vm->cookie, vcpuid, msr, 4042 val); 4043 } 4044 break; 4045 } 4046 if (err != 0) { 4047 return (err); 4048 } 4049 } 4050 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4051 4052 return (0); 4053 } 4054 4055 static const vmm_data_version_entry_t msr_v1 = { 4056 .vdve_class = VDC_MSR, 4057 .vdve_version = 1, 4058 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4059 /* Requires backend-specific dispatch */ 4060 .vdve_readf = NULL, 4061 .vdve_writef = NULL, 4062 }; 4063 VMM_DATA_VERSION(msr_v1); 4064 4065 static const uint32_t vmm_arch_v1_fields[] = { 4066 VAI_TSC_BOOT_OFFSET, 4067 VAI_BOOT_HRTIME, 4068 VAI_TSC_FREQ, 4069 }; 4070 4071 static bool 4072 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) 4073 { 4074 ASSERT(valp != NULL); 4075 4076 switch (ident) { 4077 case VAI_TSC_BOOT_OFFSET: 4078 *valp = vm->boot_tsc_offset; 4079 return (true); 4080 case VAI_BOOT_HRTIME: 4081 *valp = vm->boot_hrtime; 4082 return (true); 4083 case VAI_TSC_FREQ: 4084 /* 4085 * Since the system TSC calibration is not public, just derive 4086 * it from the scaling functions available. 
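 * unscalehrtime(NANOSEC) converts one second worth of hrtime into unscaled
 * TSC ticks, which amounts to the host TSC frequency in Hz.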
4087 */ 4088 *valp = unscalehrtime(NANOSEC); 4089 return (true); 4090 default: 4091 break; 4092 } 4093 return (false); 4094 } 4095 4096 static int 4097 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) 4098 { 4099 struct vm *vm = arg; 4100 4101 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4102 VERIFY3U(req->vdr_version, ==, 1); 4103 4104 struct vdi_field_entry_v1 *entryp = req->vdr_data; 4105 4106 /* Specific fields requested */ 4107 if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { 4108 const uint_t count = 4109 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4110 4111 for (uint_t i = 0; i < count; i++, entryp++) { 4112 if (!vmm_read_arch_field(vm, entryp->vfe_ident, 4113 &entryp->vfe_value)) { 4114 return (EINVAL); 4115 } 4116 } 4117 *req->vdr_result_len = 4118 count * sizeof (struct vdi_field_entry_v1); 4119 return (0); 4120 } 4121 4122 /* Emit all of the possible values */ 4123 const uint32_t total_size = nitems(vmm_arch_v1_fields) * 4124 sizeof (struct vdi_field_entry_v1); 4125 *req->vdr_result_len = total_size; 4126 if (req->vdr_len < total_size) { 4127 return (ENOSPC); 4128 } 4129 for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { 4130 entryp->vfe_ident = vmm_arch_v1_fields[i]; 4131 VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, 4132 &entryp->vfe_value)); 4133 } 4134 return (0); 4135 } 4136 4137 static int 4138 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) 4139 { 4140 struct vm *vm = arg; 4141 4142 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4143 VERIFY3U(req->vdr_version, ==, 1); 4144 4145 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 4146 const uint_t entry_count = 4147 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4148 4149 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4150 const uint64_t val = entryp->vfe_value; 4151 4152 switch (entryp->vfe_ident) { 4153 case VAI_TSC_BOOT_OFFSET: 4154 vm->boot_tsc_offset = val; 4155 break; 4156 case VAI_BOOT_HRTIME: 4157 vm->boot_hrtime = val; 4158 break; 4159 case VAI_TSC_FREQ: 4160 /* Guest TSC frequency not (currently) adjustable */ 4161 return (EPERM); 4162 default: 4163 return (EINVAL); 4164 } 4165 } 4166 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4167 return (0); 4168 } 4169 4170 static const vmm_data_version_entry_t vmm_arch_v1 = { 4171 .vdve_class = VDC_VMM_ARCH, 4172 .vdve_version = 1, 4173 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4174 .vdve_readf = vmm_data_read_vmm_arch, 4175 .vdve_writef = vmm_data_write_vmm_arch, 4176 }; 4177 VMM_DATA_VERSION(vmm_arch_v1); 4178 4179 static int 4180 vmm_data_read_versions(void *arg, const vmm_data_req_t *req) 4181 { 4182 VERIFY3U(req->vdr_class, ==, VDC_VERSION); 4183 VERIFY3U(req->vdr_version, ==, 1); 4184 4185 const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * 4186 sizeof (struct vdi_version_entry_v1); 4187 4188 /* Make sure there is room for all of the entries */ 4189 *req->vdr_result_len = total_size; 4190 if (req->vdr_len < *req->vdr_result_len) { 4191 return (ENOSPC); 4192 } 4193 4194 struct vdi_version_entry_v1 *entryp = req->vdr_data; 4195 const vmm_data_version_entry_t **vdpp; 4196 SET_FOREACH(vdpp, vmm_data_version_entries) { 4197 const vmm_data_version_entry_t *vdp = *vdpp; 4198 4199 entryp->vve_class = vdp->vdve_class; 4200 entryp->vve_version = vdp->vdve_version; 4201 entryp->vve_len_expect = vdp->vdve_len_expect; 4202 entryp->vve_len_per_item = vdp->vdve_len_per_item; 4203 entryp++; 4204 } 4205 return (0); 4206 } 4207 4208 static int 4209 
vmm_data_write_versions(void *arg, const vmm_data_req_t *req) 4210 { 4211 /* Writing to the version information makes no sense */ 4212 return (EPERM); 4213 } 4214 4215 static const vmm_data_version_entry_t versions_v1 = { 4216 .vdve_class = VDC_VERSION, 4217 .vdve_version = 1, 4218 .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), 4219 .vdve_readf = vmm_data_read_versions, 4220 .vdve_writef = vmm_data_write_versions, 4221 }; 4222 VMM_DATA_VERSION(versions_v1); 4223 4224 int 4225 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4226 { 4227 int err = 0; 4228 4229 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4230 if (vcpuid >= VM_MAXCPU) { 4231 return (EINVAL); 4232 } 4233 } 4234 4235 const vmm_data_version_entry_t *entry = NULL; 4236 err = vmm_data_find(req, &entry); 4237 if (err != 0) { 4238 return (err); 4239 } 4240 ASSERT(entry != NULL); 4241 4242 void *datap = vmm_data_from_class(req, vm, vcpuid); 4243 if (datap != NULL) { 4244 err = entry->vdve_readf(datap, req); 4245 4246 /* 4247 * Successful reads of fixed-length data should populate the 4248 * length of that result. 4249 */ 4250 if (err == 0 && entry->vdve_len_expect != 0) { 4251 *req->vdr_result_len = entry->vdve_len_expect; 4252 } 4253 } else { 4254 switch (req->vdr_class) { 4255 case VDC_MSR: 4256 err = vmm_data_read_msrs(vm, vcpuid, req); 4257 break; 4258 case VDC_FPU: 4259 /* TODO: wire up to xsave export via hma_fpu iface */ 4260 err = EINVAL; 4261 break; 4262 case VDC_REGISTER: 4263 default: 4264 err = EINVAL; 4265 break; 4266 } 4267 } 4268 4269 return (err); 4270 } 4271 4272 int 4273 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4274 { 4275 int err = 0; 4276 4277 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4278 if (vcpuid >= VM_MAXCPU) { 4279 return (EINVAL); 4280 } 4281 } 4282 4283 const vmm_data_version_entry_t *entry = NULL; 4284 err = vmm_data_find(req, &entry); 4285 if (err != 0) { 4286 return (err); 4287 } 4288 ASSERT(entry != NULL); 4289 4290 void *datap = vmm_data_from_class(req, vm, vcpuid); 4291 if (datap != NULL) { 4292 err = entry->vdve_writef(datap, req); 4293 /* 4294 * Successful writes of fixed-length data should populate the 4295 * length of that result. 4296 */ 4297 if (err == 0 && entry->vdve_len_expect != 0) { 4298 *req->vdr_result_len = entry->vdve_len_expect; 4299 } 4300 } else { 4301 switch (req->vdr_class) { 4302 case VDC_MSR: 4303 err = vmm_data_write_msrs(vm, vcpuid, req); 4304 break; 4305 case VDC_FPU: 4306 /* TODO: wire up to xsave import via hma_fpu iface */ 4307 err = EINVAL; 4308 break; 4309 case VDC_REGISTER: 4310 default: 4311 err = EINVAL; 4312 break; 4313 } 4314 } 4315 4316 return (err); 4317 } 4318
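/*
 * Illustrative sketch (not compiled): callers of vmm_data_read() are expected
 * to handle the ENOSPC sizing protocol by checking the value written through
 * vdr_result_len and retrying with a larger buffer. The variable names below
 * are hypothetical; the field names are those of vmm_data_req_t.
 *
 *	vmm_data_req_t req = {
 *		.vdr_class = VDC_VMM_ARCH,
 *		.vdr_version = 1,
 *		.vdr_len = bufsz,
 *		.vdr_data = buf,
 *		.vdr_result_len = &need,
 *	};
 *	int err = vmm_data_read(vm, vcpuid, &req);
 *	if (err == ENOSPC) {
 *		(grow buf to 'need' bytes and retry)
 *	}
 */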