/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_data.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};
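
/*
 * Each entry in var[] models one variable-range MTRR pair (the architectural
 * IA32_MTRR_PHYSBASEn/IA32_MTRR_PHYSMASKn MSRs): vm_rdmtrr() and vm_wrmtrr()
 * below map even offsets from MSR_MTRRVarBase to var[n].base and odd offsets
 * to var[n].mask.
 */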

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
	uint64_t	exc_pending;	/* (i) exception pending */
	bool		nmi_pending;	/* (i) NMI pending */
	bool		extint_pending;	/* (i) INTR pending */

	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
	vcpu_cpuid_config_t cpuid_cfg;	/* (x) cpuid configuration */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))
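
/*
 * A mem_seg describes a backing memory object of 'len' bytes, while a mem_map
 * binds a page-aligned range of such a segment (identified by 'segid' and
 * 'segoff') into the guest physical address space at 'gpa'.  Segments are
 * created via vm_alloc_memseg() and mapped via vm_mmap_memseg() below.
 */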
struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;	/* (i) cpu-specific data */
	void		*iommu;		/* (x) iommu-specific data */
	struct vhpet	*vhpet;		/* (i) virtual HPET */
	struct vioapic	*vioapic;	/* (i) virtual ioapic */
	struct vatpic	*vatpic;	/* (i) virtual atpic */
	struct vatpit	*vatpit;	/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;	/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;		/* (o) virtual RTC */
	volatile cpuset_t active_cpus;	/* (i) active vcpus */
	volatile cpuset_t debug_cpus;	/* (i) vcpus stopped for dbg */
	int		suspend;	/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;	/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;	/* (o) guest's address space */
	struct vcpu	vcpu[VM_MAXCPU]; /* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;	/* (o) num of sockets */
	uint16_t	cores;		/* (o) num of cores/socket */
	uint16_t	threads;	/* (o) num of threads/core */
	uint16_t	maxcpus;	/* (o) max pluggable cpus */

	uint64_t	boot_tsc_offset; /* (i) TSC offset at VM boot */
	hrtime_t	boot_hrtime;	/* (i) hrtime at VM boot */

	struct ioport_config ioports;	/* (o) ioport handling */

	bool		mem_transient;	/* (o) alloc transient memory */
	bool		is_paused;	/* (i) instance is paused */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init = (vmm_init_func_t)nullop_panic,
	.cleanup = (vmm_cleanup_func_t)nullop_panic,
	.resume = (vmm_resume_func_t)nullop_panic,
	.vminit = (vmi_init_func_t)nullop_panic,
	.vmrun = (vmi_run_func_t)nullop_panic,
	.vmcleanup = (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg = (vmi_get_register_t)nullop_panic,
	.vmsetreg = (vmi_set_register_t)nullop_panic,
	.vmgetdesc = (vmi_get_desc_t)nullop_panic,
	.vmsetdesc = (vmi_set_desc_t)nullop_panic,
	.vmgetcap = (vmi_get_cap_t)nullop_panic,
	.vmsetcap = (vmi_set_cap_t)nullop_panic,
	.vlapic_init = (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx = (vmi_savectx)nullop_panic,
	.vmrestorectx = (vmi_restorectx)nullop_panic,
	.vmgetmsr = (vmi_get_msr_t)nullop_panic,
	.vmsetmsr = (vmi_set_msr_t)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)			((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)		((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()
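
/*
 * Setting CR0.TS makes the next FPU access on this CPU trap (#NM), which is
 * how host use of an FPU that is "dirty" with guest state is caught; see
 * restore_guest_fpustate() and save_guest_fpustate() below.
 */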

SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
int trace_guest_exceptions;

/* Trap WBINVD and ignore it */
int trap_wbinvd = 1;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = vmm_savectx,
	.ct_restore = vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();
		vcpu_cpuid_init(&vcpu->cpuid_cfg);

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{
	return (trace_guest_exceptions);
}

int
vcpu_trap_wbinvd(struct vm *vm, int vcpuid)
{
	return (trap_wbinvd);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has necessary hardware
 * and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0). Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for
	 * negative offsets. A reading of the TSC is negated to form the boot
	 * offset.
	 */
	const uint64_t boot_tsc = rdtsc_offset();
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);
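
	/*
	 * With this offset applied, a guest TSC read works out to roughly
	 * (host TSC + boot_tsc_offset + per-vCPU offset), i.e. ~0 at the time
	 * of this call; see vm_handle_rdmsr() and vm_handle_wrmsr().
	 */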

	/* Convert the boot TSC reading to hrtime */
	vm->boot_hrtime = (hrtime_t)boot_tsc;
	scalehrtime(&vm->boot_hrtime);
}

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	bool track_dirty = (flags & VCF_TRACK_DIRTY) != 0;
	if (track_dirty && !pte_ops->vpeo_hw_ad_supported())
		return (ENOTSUP);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted. Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

bool
vm_is_paused(struct vm *vm)
{
	return (vm->is_paused);
}
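
/*
 * Pause the instance: mark it paused and quiesce the vlapics of all active
 * vCPUs along with the time-sensitive devices (HPET, ATPIT, RTC).
 * vm_resume_instance() undoes this, resuming devices in the reverse order.
 */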
int
vm_pause_instance(struct vm *vm)
{
	if (vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = true;

	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_pause(vcpu->vlapic);
	}
	vhpet_pause(vm->vhpet);
	vatpit_pause(vm->vatpit);
	vrtc_pause(vm->vrtc);

	return (0);
}

int
vm_resume_instance(struct vm *vm)
{
	if (!vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = false;

	vrtc_resume(vm->vrtc);
	vatpit_resume(vm->vatpit);
	vhpet_resume(vm->vhpet);
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_resume(vcpu->vlapic);
	}

	return (0);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}
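
/*
 * Map 'len' bytes of memory segment 'segid', starting at segment offset
 * 'first', into the guest physical address space at 'gpa'.  A free slot in
 * 'vm->mem_maps[]' is consumed for the mapping; wired mappings are populated
 * immediately.
 */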
int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}

int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}
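
/*
 * Add (map == true) or remove (map == false) the wired sysmem mappings of
 * this VM to/from its IOMMU domain, one page at a time.
 */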
static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			/*
			 * When originally ported from FreeBSD, the logic for
			 * adding memory to the guest domain would
			 * simultaneously remove it from the host domain. The
			 * justification for that is not clear, and FreeBSD has
			 * subsequently changed the behavior to not remove the
			 * memory from the host domain.
			 *
			 * Leaving the guest memory in the host domain for the
			 * life of the VM is necessary to make it available for
			 * DMA, such as through viona in the TX path.
			 */
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_XCR0:
		*retval = vcpu->guest_xcr0;
		return (0);
	default:
		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
	}
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	int error;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_RIP:
		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error == 0) {
			vcpu->nextrip = val;
		}
		return (error);
	case VM_REG_GUEST_XCR0:
		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
			return (EINVAL);
		}
		vcpu->guest_xcr0 = val;
		return (0);
	default:
		return (VMSETREG(vm->cookie, vcpuid, reg, val));
	}
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}
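
/*
 * Delegate dirty-page tracking for the given range of guest physical memory
 * to the vmspace layer, filling 'bitmap' with the result.
 */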
int
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	return (vmspace_track_dirty(vms, gpa, len, bitmap));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	const int ftype = vme->u.paging.fault_type;

	ASSERT0(vme->inst_length);
	ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC);

	if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) {
		/*
		 * If the fault cannot be serviced, kick it out to userspace for
		 * handling (or more likely, halting the instance).
		 */
		return (-1);
	}

	return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;

	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.
			 * No further work is required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices, so
		 * make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which have not
		 * completed their iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure. Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry. No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully, so
		 * kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static bool
is_mtrr_msr(uint32_t msr)
{
	switch (msr) {
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		return (true);
	default:
		return (false);
	}
}

static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit. (Both VMX and
		 * SVM pass through read-only access to MSR_TSC to the guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read). The guest TSC value
The guest TSC value 2119 * is calculated from a global offset for the guest (which 2120 * effectively makes its TSC read 0 at guest boot) and a 2121 * per-vCPU offset to handle these writes to the MSR. 2122 * 2123 * To calculate that per-vCPU offset, we can work backwards from 2124 * the guest value at the time of write: 2125 * 2126 * value = host TSC + VM boot offset + vCPU offset 2127 * 2128 * so therefore: 2129 * 2130 * value - host TSC - VM boot offset = vCPU offset 2131 */ 2132 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 2133 break; 2134 2135 default: 2136 /* 2137 * Anything not handled at this point will be kicked out to 2138 * userspace for attempted processing there. 2139 */ 2140 return (-1); 2141 } 2142 2143 return (0); 2144 } 2145 2146 int 2147 vm_suspend(struct vm *vm, enum vm_suspend_how how) 2148 { 2149 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 2150 return (EINVAL); 2151 2152 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 2153 return (EALREADY); 2154 } 2155 2156 /* 2157 * Notify all active vcpus that they are now suspended. 2158 */ 2159 for (uint_t i = 0; i < vm->maxcpus; i++) { 2160 struct vcpu *vcpu = &vm->vcpu[i]; 2161 2162 vcpu_lock(vcpu); 2163 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 2164 /* 2165 * Any vCPUs not actively running or in HLT can be 2166 * marked as suspended immediately. 2167 */ 2168 if (CPU_ISSET(i, &vm->active_cpus)) { 2169 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 2170 } 2171 } else { 2172 /* 2173 * Those which are running or in HLT will pick up the 2174 * suspended state after notification. 2175 */ 2176 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2177 } 2178 vcpu_unlock(vcpu); 2179 } 2180 return (0); 2181 } 2182 2183 void 2184 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2185 { 2186 struct vm_exit *vmexit; 2187 2188 vmexit = vm_exitinfo(vm, vcpuid); 2189 vmexit->rip = rip; 2190 vmexit->inst_length = 0; 2191 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2192 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2193 } 2194 2195 /* 2196 * Some vmm resources, such as the lapic, may have CPU-specific resources 2197 * allocated to them which would benefit from migration onto the host CPU which 2198 * is processing the vcpu state. 2199 */ 2200 static void 2201 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2202 { 2203 /* 2204 * Localizing cyclic resources requires acquisition of cpu_lock, and 2205 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2206 */ 2207 VERIFY(curthread->t_preempt == 0); 2208 2209 /* 2210 * Do not bother with localization if this vCPU is about to return to 2211 * the host CPU it was last localized to. 2212 */ 2213 if (vcpu->lastloccpu == curcpu) 2214 return; 2215 2216 /* 2217 * Localize system-wide resources to the primary boot vCPU. While any 2218 * of the other vCPUs may access them, it keeps the potential interrupt 2219 * footprint constrained to CPUs involved with this instance. 
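 *
 * Concretely, only the instance-wide devices (HPET, RTC, ATPIT) are tied
 * to vCPU 0 here; the per-vCPU vlapic is localized unconditionally below,
 * so cyclic firings and interrupt delivery for this instance end up on
 * host CPUs which are already running its vCPUs.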
2220 */ 2221 if (vcpu == &vm->vcpu[0]) { 2222 vhpet_localize_resources(vm->vhpet); 2223 vrtc_localize_resources(vm->vrtc); 2224 vatpit_localize_resources(vm->vatpit); 2225 } 2226 2227 vlapic_localize_resources(vcpu->vlapic); 2228 2229 vcpu->lastloccpu = curcpu; 2230 } 2231 2232 static void 2233 vmm_savectx(void *arg) 2234 { 2235 vm_thread_ctx_t *vtc = arg; 2236 struct vm *vm = vtc->vtc_vm; 2237 const int vcpuid = vtc->vtc_vcpuid; 2238 2239 if (ops->vmsavectx != NULL) { 2240 ops->vmsavectx(vm->cookie, vcpuid); 2241 } 2242 2243 /* 2244 * Account for going off-cpu, unless the vCPU is idled, where being 2245 * off-cpu is the explicit point. 2246 */ 2247 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2248 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2249 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2250 } 2251 2252 /* 2253 * If the CPU holds the restored guest FPU state, save it and restore 2254 * the host FPU state before this thread goes off-cpu. 2255 */ 2256 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2257 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2258 2259 save_guest_fpustate(vcpu); 2260 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2261 } 2262 } 2263 2264 static void 2265 vmm_restorectx(void *arg) 2266 { 2267 vm_thread_ctx_t *vtc = arg; 2268 struct vm *vm = vtc->vtc_vm; 2269 const int vcpuid = vtc->vtc_vcpuid; 2270 2271 /* Complete microstate accounting for vCPU being off-cpu */ 2272 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2273 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2274 } 2275 2276 /* 2277 * When coming back on-cpu, only restore the guest FPU status if the 2278 * thread is in a context marked as requiring it. This should be rare, 2279 * occurring only when a future logic error results in a voluntary 2280 * sleep during the VMRUN critical section. 2281 * 2282 * The common case will result in elision of the guest FPU state 2283 * restoration, deferring that action until it is clearly necessary 2284 * during vm_run. 
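 *
 * As a rough sketch of the flag protocol: vmm_savectx() saves the guest
 * FPU (and clears VTCS_FPU_RESTORED) if it was loaded when the thread
 * goes off-cpu, while VTCS_FPU_CTX_CRITICAL is set only by vm_run()
 * around the VMRUN critical section. Coming back on-cpu outside that
 * window leaves the host FPU in place until vm_run() performs its own
 * restore_guest_fpustate() call.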
2285 */ 2286 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2287 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2288 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2289 2290 restore_guest_fpustate(vcpu); 2291 vtc->vtc_status |= VTCS_FPU_RESTORED; 2292 } 2293 2294 if (ops->vmrestorectx != NULL) { 2295 ops->vmrestorectx(vm->cookie, vcpuid); 2296 } 2297 2298 } 2299 2300 static int 2301 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2302 struct vm_exit *vme) 2303 { 2304 struct vcpu *vcpu; 2305 struct vie *vie; 2306 int err; 2307 2308 vcpu = &vm->vcpu[vcpuid]; 2309 vie = vcpu->vie_ctx; 2310 err = 0; 2311 2312 switch (entry->cmd) { 2313 case VEC_DEFAULT: 2314 return (0); 2315 case VEC_DISCARD_INSTR: 2316 vie_reset(vie); 2317 return (0); 2318 case VEC_FULFILL_MMIO: 2319 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2320 if (err == 0) { 2321 err = vie_emulate_mmio(vie, vm, vcpuid); 2322 if (err == 0) { 2323 vie_advance_pc(vie, &vcpu->nextrip); 2324 } else if (err < 0) { 2325 vie_exitinfo(vie, vme); 2326 } else if (err == EAGAIN) { 2327 /* 2328 * Clear the instruction emulation state in 2329 * order to re-enter VM context and continue 2330 * this 'rep <instruction>' 2331 */ 2332 vie_reset(vie); 2333 err = 0; 2334 } 2335 } 2336 break; 2337 case VEC_FULFILL_INOUT: 2338 err = vie_fulfill_inout(vie, &entry->u.inout); 2339 if (err == 0) { 2340 err = vie_emulate_inout(vie, vm, vcpuid); 2341 if (err == 0) { 2342 vie_advance_pc(vie, &vcpu->nextrip); 2343 } else if (err < 0) { 2344 vie_exitinfo(vie, vme); 2345 } else if (err == EAGAIN) { 2346 /* 2347 * Clear the instruction emulation state in 2348 * order to re-enter VM context and continue 2349 * this 'rep ins/outs' 2350 */ 2351 vie_reset(vie); 2352 err = 0; 2353 } 2354 } 2355 break; 2356 default: 2357 return (EINVAL); 2358 } 2359 return (err); 2360 } 2361 2362 static int 2363 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2364 { 2365 struct vie *vie; 2366 2367 vie = vm->vcpu[vcpuid].vie_ctx; 2368 2369 if (vie_pending(vie)) { 2370 /* 2371 * Userspace has not fulfilled the pending needs of the 2372 * instruction emulation, so bail back out. 2373 */ 2374 vie_exitinfo(vie, vme); 2375 return (-1); 2376 } 2377 2378 return (0); 2379 } 2380 2381 int 2382 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2383 { 2384 int error; 2385 struct vcpu *vcpu; 2386 struct vm_exit *vme; 2387 bool intr_disabled; 2388 int affinity_type = CPU_CURRENT; 2389 2390 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2391 return (EINVAL); 2392 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2393 return (EINVAL); 2394 2395 vcpu = &vm->vcpu[vcpuid]; 2396 vme = &vcpu->exitinfo; 2397 2398 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2399 2400 vcpu->vtc.vtc_status = 0; 2401 ctxop_attach(curthread, vcpu->ctxop); 2402 2403 error = vm_entry_actions(vm, vcpuid, entry, vme); 2404 if (error != 0) { 2405 goto exit; 2406 } 2407 2408 restart: 2409 error = vm_loop_checks(vm, vcpuid, vme); 2410 if (error != 0) { 2411 goto exit; 2412 } 2413 2414 thread_affinity_set(curthread, affinity_type); 2415 /* 2416 * Resource localization should happen after the CPU affinity for the 2417 * thread has been set to ensure that access from restricted contexts, 2418 * such as VMX-accelerated APIC operations, can occur without inducing 2419 * cyclic cross-calls. 2420 * 2421 * This must be done prior to disabling kpreempt via critical_enter(). 
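 *
 * For orientation, one pass of the surrounding loop is roughly:
 * vm_loop_checks() -> thread_affinity_set() -> vm_localize_resources() ->
 * critical_enter() -> restore guest FPU (if needed) -> VMRUN() ->
 * critical_exit() -> dispatch on vme->exitcode, looping for as long as
 * the exit is handled entirely in-kernel (error == 0).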
2422 */ 2423 vm_localize_resources(vm, vcpu); 2424 affinity_type = CPU_CURRENT; 2425 critical_enter(); 2426 2427 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2428 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2429 2430 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2431 restore_guest_fpustate(vcpu); 2432 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2433 } 2434 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2435 2436 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2437 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2438 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2439 2440 /* 2441 * Once clear of the delicate contexts comprising the VM_RUN handler, 2442 * thread CPU affinity can be loosened while other processing occurs. 2443 */ 2444 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2445 thread_affinity_clear(curthread); 2446 critical_exit(); 2447 2448 if (error != 0) { 2449 /* Communicate out any error from VMRUN() above */ 2450 goto exit; 2451 } 2452 2453 vcpu->nextrip = vme->rip + vme->inst_length; 2454 switch (vme->exitcode) { 2455 case VM_EXITCODE_REQIDLE: 2456 error = vm_handle_reqidle(vm, vcpuid); 2457 break; 2458 case VM_EXITCODE_RUN_STATE: 2459 error = vm_handle_run_state(vm, vcpuid); 2460 break; 2461 case VM_EXITCODE_SUSPENDED: 2462 error = vm_handle_suspend(vm, vcpuid); 2463 break; 2464 case VM_EXITCODE_IOAPIC_EOI: 2465 vioapic_process_eoi(vm, vcpuid, 2466 vme->u.ioapic_eoi.vector); 2467 break; 2468 case VM_EXITCODE_HLT: 2469 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2470 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2471 break; 2472 case VM_EXITCODE_PAGING: 2473 error = vm_handle_paging(vm, vcpuid); 2474 break; 2475 case VM_EXITCODE_MMIO_EMUL: 2476 error = vm_handle_mmio_emul(vm, vcpuid); 2477 break; 2478 case VM_EXITCODE_INOUT: 2479 error = vm_handle_inout(vm, vcpuid, vme); 2480 break; 2481 case VM_EXITCODE_INST_EMUL: 2482 error = vm_handle_inst_emul(vm, vcpuid); 2483 break; 2484 case VM_EXITCODE_MONITOR: 2485 case VM_EXITCODE_MWAIT: 2486 case VM_EXITCODE_VMINSN: 2487 vm_inject_ud(vm, vcpuid); 2488 break; 2489 case VM_EXITCODE_RDMSR: 2490 error = vm_handle_rdmsr(vm, vcpuid, vme); 2491 break; 2492 case VM_EXITCODE_WRMSR: 2493 error = vm_handle_wrmsr(vm, vcpuid, vme); 2494 break; 2495 case VM_EXITCODE_HT: 2496 affinity_type = CPU_BEST; 2497 break; 2498 case VM_EXITCODE_MTRAP: 2499 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2500 error = -1; 2501 break; 2502 default: 2503 /* handled in userland */ 2504 error = -1; 2505 break; 2506 } 2507 2508 if (error == 0) { 2509 /* VM exit conditions handled in-kernel, continue running */ 2510 goto restart; 2511 } 2512 2513 exit: 2514 kpreempt_disable(); 2515 ctxop_detach(curthread, vcpu->ctxop); 2516 /* Make sure all of the needed vCPU context state is saved */ 2517 vmm_savectx(&vcpu->vtc); 2518 kpreempt_enable(); 2519 2520 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2521 return (error); 2522 } 2523 2524 int 2525 vm_restart_instruction(void *arg, int vcpuid) 2526 { 2527 struct vm *vm; 2528 struct vcpu *vcpu; 2529 enum vcpu_state state; 2530 uint64_t rip; 2531 int error; 2532 2533 vm = arg; 2534 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2535 return (EINVAL); 2536 2537 vcpu = &vm->vcpu[vcpuid]; 2538 state = vcpu_get_state(vm, vcpuid, NULL); 2539 if (state == VCPU_RUNNING) { 2540 /* 2541 * When a vcpu is "running" the next instruction is determined 2542 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
2543 * Thus setting 'inst_length' to zero will cause the current 2544 * instruction to be restarted. 2545 */ 2546 vcpu->exitinfo.inst_length = 0; 2547 } else if (state == VCPU_FROZEN) { 2548 /* 2549 * When a vcpu is "frozen" it is outside the critical section 2550 * around VMRUN() and 'nextrip' points to the next instruction. 2551 * Thus instruction restart is achieved by setting 'nextrip' 2552 * to the vcpu's %rip. 2553 */ 2554 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2555 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2556 vcpu->nextrip = rip; 2557 } else { 2558 panic("%s: invalid state %d", __func__, state); 2559 } 2560 return (0); 2561 } 2562 2563 int 2564 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2565 { 2566 struct vcpu *vcpu; 2567 2568 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2569 return (EINVAL); 2570 2571 vcpu = &vm->vcpu[vcpuid]; 2572 2573 if (VM_INTINFO_PENDING(info)) { 2574 const uint32_t type = VM_INTINFO_TYPE(info); 2575 const uint8_t vector = VM_INTINFO_VECTOR(info); 2576 2577 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2578 return (EINVAL); 2579 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2580 return (EINVAL); 2581 if (info & VM_INTINFO_MASK_RSVD) 2582 return (EINVAL); 2583 } else { 2584 info = 0; 2585 } 2586 vcpu->exit_intinfo = info; 2587 return (0); 2588 } 2589 2590 enum exc_class { 2591 EXC_BENIGN, 2592 EXC_CONTRIBUTORY, 2593 EXC_PAGEFAULT 2594 }; 2595 2596 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2597 2598 static enum exc_class 2599 exception_class(uint64_t info) 2600 { 2601 ASSERT(VM_INTINFO_PENDING(info)); 2602 2603 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2604 switch (VM_INTINFO_TYPE(info)) { 2605 case VM_INTINFO_HWINTR: 2606 case VM_INTINFO_SWINTR: 2607 case VM_INTINFO_NMI: 2608 return (EXC_BENIGN); 2609 default: 2610 /* 2611 * Hardware exception. 2612 * 2613 * SVM and VT-x use identical type values to represent NMI, 2614 * hardware interrupt and software interrupt. 2615 * 2616 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2617 * for exceptions except #BP and #OF. #BP and #OF use a type 2618 * value of '5' or '6'. Therefore we don't check for explicit 2619 * values of 'type' to classify 'intinfo' into a hardware 2620 * exception. 2621 */ 2622 break; 2623 } 2624 2625 switch (VM_INTINFO_VECTOR(info)) { 2626 case IDT_PF: 2627 case IDT_VE: 2628 return (EXC_PAGEFAULT); 2629 case IDT_DE: 2630 case IDT_TS: 2631 case IDT_NP: 2632 case IDT_SS: 2633 case IDT_GP: 2634 return (EXC_CONTRIBUTORY); 2635 default: 2636 return (EXC_BENIGN); 2637 } 2638 } 2639 2640 /* 2641 * Fetch event pending injection into the guest, if one exists. 2642 * 2643 * Returns true if an event is to be injected (which is placed in `retinfo`). 2644 */ 2645 bool 2646 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2647 { 2648 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2649 const uint64_t info1 = vcpu->exit_intinfo; 2650 vcpu->exit_intinfo = 0; 2651 const uint64_t info2 = vcpu->exc_pending; 2652 vcpu->exc_pending = 0; 2653 2654 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2655 /* 2656 * If an exception occurs while attempting to call the 2657 * double-fault handler the processor enters shutdown mode 2658 * (aka triple fault). 
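 *
 * To illustrate the classification below against Table 6-5: a #GP raised
 * while delivering a prior #GP (contributory + contributory) becomes a
 * #DF, as does a #GP or #PF raised while delivering a #PF. A benign
 * event (e.g. an NMI or external interrupt) paired with anything is
 * delivered serially instead, with the older event re-queued.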
2659 */ 2660 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2661 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2662 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2663 *retinfo = 0; 2664 return (false); 2665 } 2666 /* 2667 * "Conditions for Generating a Double Fault" 2668 * Intel SDM, Vol3, Table 6-5 2669 */ 2670 const enum exc_class exc1 = exception_class(info1); 2671 const enum exc_class exc2 = exception_class(info2); 2672 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2673 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2674 /* Convert nested fault into a double fault. */ 2675 *retinfo = 2676 VM_INTINFO_VALID | 2677 VM_INTINFO_DEL_ERRCODE | 2678 VM_INTINFO_HWEXCP | 2679 IDT_DF; 2680 } else { 2681 /* Handle exceptions serially */ 2682 vcpu->exit_intinfo = info1; 2683 *retinfo = info2; 2684 } 2685 return (true); 2686 } else if (VM_INTINFO_PENDING(info1)) { 2687 *retinfo = info1; 2688 return (true); 2689 } else if (VM_INTINFO_PENDING(info2)) { 2690 *retinfo = info2; 2691 return (true); 2692 } 2693 2694 return (false); 2695 } 2696 2697 int 2698 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2699 { 2700 struct vcpu *vcpu; 2701 2702 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2703 return (EINVAL); 2704 2705 vcpu = &vm->vcpu[vcpuid]; 2706 *info1 = vcpu->exit_intinfo; 2707 *info2 = vcpu->exc_pending; 2708 return (0); 2709 } 2710 2711 int 2712 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2713 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2714 { 2715 struct vcpu *vcpu; 2716 uint64_t regval; 2717 int error; 2718 2719 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2720 return (EINVAL); 2721 2722 if (vector >= 32) 2723 return (EINVAL); 2724 2725 /* 2726 * NMIs are to be injected via their own specialized path using 2727 * vm_inject_nmi(). 2728 */ 2729 if (vector == IDT_NMI) { 2730 return (EINVAL); 2731 } 2732 2733 /* 2734 * A double fault exception should never be injected directly into 2735 * the guest. It is a derived exception that results from specific 2736 * combinations of nested faults. 2737 */ 2738 if (vector == IDT_DF) { 2739 return (EINVAL); 2740 } 2741 2742 vcpu = &vm->vcpu[vcpuid]; 2743 2744 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2745 /* Unable to inject exception due to one already pending */ 2746 return (EBUSY); 2747 } 2748 2749 if (errcode_valid) { 2750 /* 2751 * Exceptions don't deliver an error code in real mode. 2752 */ 2753 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); 2754 VERIFY0(error); 2755 if ((regval & CR0_PE) == 0) { 2756 errcode_valid = false; 2757 } 2758 } 2759 2760 /* 2761 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2762 * 2763 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2764 * one instruction or incurs an exception. 
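 *
 * That is why the interrupt-shadow register is cleared unconditionally
 * below before the exception is queued in 'exc_pending' (encoded as
 * VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector, plus an optional error
 * code shifted by VM_INTINFO_SHIFT_ERRCODE).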
2765 */ 2766 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2767 VERIFY0(error); 2768 2769 if (restart_instruction) { 2770 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2771 } 2772 2773 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2774 if (errcode_valid) { 2775 val |= VM_INTINFO_DEL_ERRCODE; 2776 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2777 } 2778 vcpu->exc_pending = val; 2779 return (0); 2780 } 2781 2782 void 2783 vm_inject_ud(struct vm *vm, int vcpuid) 2784 { 2785 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2786 } 2787 2788 void 2789 vm_inject_gp(struct vm *vm, int vcpuid) 2790 { 2791 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2792 } 2793 2794 void 2795 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2796 { 2797 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2798 } 2799 2800 void 2801 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2802 { 2803 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2804 } 2805 2806 void 2807 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2808 { 2809 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2810 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2811 } 2812 2813 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2814 2815 int 2816 vm_inject_nmi(struct vm *vm, int vcpuid) 2817 { 2818 struct vcpu *vcpu; 2819 2820 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2821 return (EINVAL); 2822 2823 vcpu = &vm->vcpu[vcpuid]; 2824 2825 vcpu->nmi_pending = true; 2826 vcpu_notify_event(vm, vcpuid); 2827 return (0); 2828 } 2829 2830 bool 2831 vm_nmi_pending(struct vm *vm, int vcpuid) 2832 { 2833 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2834 2835 return (vcpu->nmi_pending); 2836 } 2837 2838 void 2839 vm_nmi_clear(struct vm *vm, int vcpuid) 2840 { 2841 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2842 2843 ASSERT(vcpu->nmi_pending); 2844 2845 vcpu->nmi_pending = false; 2846 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2847 } 2848 2849 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2850 2851 int 2852 vm_inject_extint(struct vm *vm, int vcpuid) 2853 { 2854 struct vcpu *vcpu; 2855 2856 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2857 return (EINVAL); 2858 2859 vcpu = &vm->vcpu[vcpuid]; 2860 2861 vcpu->extint_pending = true; 2862 vcpu_notify_event(vm, vcpuid); 2863 return (0); 2864 } 2865 2866 bool 2867 vm_extint_pending(struct vm *vm, int vcpuid) 2868 { 2869 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2870 2871 return (vcpu->extint_pending); 2872 } 2873 2874 void 2875 vm_extint_clear(struct vm *vm, int vcpuid) 2876 { 2877 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2878 2879 ASSERT(vcpu->extint_pending); 2880 2881 vcpu->extint_pending = false; 2882 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2883 } 2884 2885 int 2886 vm_inject_init(struct vm *vm, int vcpuid) 2887 { 2888 struct vcpu *vcpu; 2889 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2891 return (EINVAL); 2892 2893 vcpu = &vm->vcpu[vcpuid]; 2894 vcpu_lock(vcpu); 2895 vcpu->run_state |= VRS_PEND_INIT; 2896 /* 2897 * As part of queuing the INIT request, clear any pending SIPI. It 2898 * would not otherwise survive across the reset of the vCPU when it 2899 * undergoes the requested INIT. We would not want it to linger when it 2900 * could be mistaken as a subsequent (after the INIT) SIPI request. 
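 *
 * This mirrors the INIT-SIPI-SIPI start-up sequence: the INIT queued here
 * parks the vCPU in wait-for-SIPI, and a later vm_inject_sipi() is only
 * actionable once the reset has completed (VRS_INIT set, VRS_RUN clear),
 * at which point vm_handle_run_state() vectors the vCPU via its SIPI.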
2901 */ 2902 vcpu->run_state &= ~VRS_PEND_SIPI; 2903 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2904 2905 vcpu_unlock(vcpu); 2906 return (0); 2907 } 2908 2909 int 2910 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2911 { 2912 struct vcpu *vcpu; 2913 2914 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2915 return (EINVAL); 2916 2917 vcpu = &vm->vcpu[vcpuid]; 2918 vcpu_lock(vcpu); 2919 vcpu->run_state |= VRS_PEND_SIPI; 2920 vcpu->sipi_vector = vector; 2921 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2922 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2923 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2924 } 2925 vcpu_unlock(vcpu); 2926 return (0); 2927 } 2928 2929 bool 2930 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2931 { 2932 struct vcpu *vcpu; 2933 2934 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2935 vcpu = &vm->vcpu[vcpuid]; 2936 2937 /* Of interest: vCPU not in running state or with pending INIT */ 2938 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2939 } 2940 2941 int 2942 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2943 { 2944 struct seg_desc desc; 2945 const enum vm_reg_name clear_regs[] = { 2946 VM_REG_GUEST_CR2, 2947 VM_REG_GUEST_CR3, 2948 VM_REG_GUEST_CR4, 2949 VM_REG_GUEST_RAX, 2950 VM_REG_GUEST_RBX, 2951 VM_REG_GUEST_RCX, 2952 VM_REG_GUEST_RSI, 2953 VM_REG_GUEST_RDI, 2954 VM_REG_GUEST_RBP, 2955 VM_REG_GUEST_RSP, 2956 VM_REG_GUEST_R8, 2957 VM_REG_GUEST_R9, 2958 VM_REG_GUEST_R10, 2959 VM_REG_GUEST_R11, 2960 VM_REG_GUEST_R12, 2961 VM_REG_GUEST_R13, 2962 VM_REG_GUEST_R14, 2963 VM_REG_GUEST_R15, 2964 VM_REG_GUEST_DR0, 2965 VM_REG_GUEST_DR1, 2966 VM_REG_GUEST_DR2, 2967 VM_REG_GUEST_DR3, 2968 VM_REG_GUEST_EFER, 2969 }; 2970 const enum vm_reg_name data_segs[] = { 2971 VM_REG_GUEST_SS, 2972 VM_REG_GUEST_DS, 2973 VM_REG_GUEST_ES, 2974 VM_REG_GUEST_FS, 2975 VM_REG_GUEST_GS, 2976 }; 2977 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2978 2979 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2980 return (EINVAL); 2981 2982 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2983 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2984 } 2985 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2987 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2988 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2989 2990 /* 2991 * The prescribed contents of %rdx differ slightly between the Intel and 2992 * AMD architectural definitions. The former expects the Extended Model 2993 * in bits 16-19 where the latter expects all the Family, Model, and 2994 * Stepping be there. Common boot ROMs appear to disregard this 2995 * anyways, so we stick with a compromise value similar to what is 2996 * spelled out in the Intel SDM. 
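 *
 * Read with the CPUID leaf-1 layout in mind, the 0x600 written below
 * decodes as Family 6, Model 0, Stepping 0 in the low 12 bits, which is
 * roughly the signature the Intel wording describes; the Extended Model
 * bits (16-19) are simply left zero.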
2997 */ 2998 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2999 3000 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 3001 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 3002 3003 /* CS: Present, R/W, Accessed */ 3004 desc.access = 0x0093; 3005 desc.base = 0xffff0000; 3006 desc.limit = 0xffff; 3007 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3008 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 3009 3010 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 3011 desc.access = 0x0093; 3012 desc.base = 0; 3013 desc.limit = 0xffff; 3014 for (uint_t i = 0; i < nitems(data_segs); i++) { 3015 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 3016 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 3017 } 3018 3019 /* GDTR, IDTR */ 3020 desc.base = 0; 3021 desc.limit = 0xffff; 3022 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 3023 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 3024 3025 /* LDTR: Present, LDT */ 3026 desc.access = 0x0082; 3027 desc.base = 0; 3028 desc.limit = 0xffff; 3029 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 3030 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 3031 3032 /* TR: Present, 32-bit TSS */ 3033 desc.access = 0x008b; 3034 desc.base = 0; 3035 desc.limit = 0xffff; 3036 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 3037 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 3038 3039 vlapic_reset(vm_lapic(vm, vcpuid)); 3040 3041 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 3042 3043 vcpu->exit_intinfo = 0; 3044 vcpu->exc_pending = 0; 3045 vcpu->nmi_pending = false; 3046 vcpu->extint_pending = false; 3047 3048 /* 3049 * A CPU reset caused by power-on or system reset clears more state than 3050 * one which is triggered from an INIT IPI.
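 *
 * In practice that means the INIT-only path preserves the guest %xcr0,
 * FPU contents, and MTRR configuration, all of which are re-initialized
 * below only for a full power-on/system reset.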
3051 */ 3052 if (!init_only) { 3053 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 3054 (void) hma_fpu_init(vcpu->guestfpu); 3055 3056 /* XXX: clear MSRs and other pieces */ 3057 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 3058 } 3059 3060 return (0); 3061 } 3062 3063 static int 3064 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3065 { 3066 struct seg_desc desc; 3067 3068 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3069 return (EINVAL); 3070 3071 /* CS: Present, R/W, Accessed */ 3072 desc.access = 0x0093; 3073 desc.base = (uint64_t)vector << 12; 3074 desc.limit = 0xffff; 3075 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3076 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3077 (uint64_t)vector << 8)); 3078 3079 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3080 3081 return (0); 3082 } 3083 3084 int 3085 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3086 { 3087 if (vcpu < 0 || vcpu >= vm->maxcpus) 3088 return (EINVAL); 3089 3090 if (type < 0 || type >= VM_CAP_MAX) 3091 return (EINVAL); 3092 3093 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3094 } 3095 3096 int 3097 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3098 { 3099 if (vcpu < 0 || vcpu >= vm->maxcpus) 3100 return (EINVAL); 3101 3102 if (type < 0 || type >= VM_CAP_MAX) 3103 return (EINVAL); 3104 3105 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3106 } 3107 3108 vcpu_cpuid_config_t * 3109 vm_cpuid_config(struct vm *vm, int vcpuid) 3110 { 3111 ASSERT3S(vcpuid, >=, 0); 3112 ASSERT3S(vcpuid, <, VM_MAXCPU); 3113 3114 return (&vm->vcpu[vcpuid].cpuid_cfg); 3115 } 3116 3117 struct vlapic * 3118 vm_lapic(struct vm *vm, int cpu) 3119 { 3120 ASSERT3S(cpu, >=, 0); 3121 ASSERT3S(cpu, <, VM_MAXCPU); 3122 3123 return (vm->vcpu[cpu].vlapic); 3124 } 3125 3126 struct vioapic * 3127 vm_ioapic(struct vm *vm) 3128 { 3129 3130 return (vm->vioapic); 3131 } 3132 3133 struct vhpet * 3134 vm_hpet(struct vm *vm) 3135 { 3136 3137 return (vm->vhpet); 3138 } 3139 3140 void * 3141 vm_iommu_domain(struct vm *vm) 3142 { 3143 3144 return (vm->iommu); 3145 } 3146 3147 int 3148 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3149 bool from_idle) 3150 { 3151 int error; 3152 struct vcpu *vcpu; 3153 3154 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3155 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3156 3157 vcpu = &vm->vcpu[vcpuid]; 3158 3159 vcpu_lock(vcpu); 3160 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3161 vcpu_unlock(vcpu); 3162 3163 return (error); 3164 } 3165 3166 enum vcpu_state 3167 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3168 { 3169 struct vcpu *vcpu; 3170 enum vcpu_state state; 3171 3172 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3173 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3174 3175 vcpu = &vm->vcpu[vcpuid]; 3176 3177 vcpu_lock(vcpu); 3178 state = vcpu->state; 3179 if (hostcpu != NULL) 3180 *hostcpu = vcpu->hostcpu; 3181 vcpu_unlock(vcpu); 3182 3183 return (state); 3184 } 3185 3186 uint64_t 3187 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3188 { 3189 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3190 3191 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3192 3193 if (phys_adj) { 3194 /* Include any offset for the current physical CPU too */ 3195 extern hrtime_t tsc_gethrtime_tick_delta(void); 3196 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3197 } 3198 3199 return (vcpu_off); 3200 } 3201 3202 /* Normalize hrtime against the boot time for a VM */ 3203 hrtime_t 3204 
vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3205 { 3206 /* To avoid underflow/overflow UB, perform math as unsigned */ 3207 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3208 } 3209 3210 /* Denormalize hrtime against the boot time for a VM */ 3211 hrtime_t 3212 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3213 { 3214 /* To avoid underflow/overflow UB, perform math as unsigned */ 3215 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3216 } 3217 3218 int 3219 vm_activate_cpu(struct vm *vm, int vcpuid) 3220 { 3221 3222 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3223 return (EINVAL); 3224 3225 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3226 return (EBUSY); 3227 3228 if (vm->suspend != 0) { 3229 return (EBUSY); 3230 } 3231 3232 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3233 3234 /* 3235 * It is possible that this vCPU was undergoing activation at the same 3236 * time that the VM was being suspended. If that happens to be the 3237 * case, it should reflect the suspended state immediately. 3238 */ 3239 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3240 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3241 } 3242 3243 return (0); 3244 } 3245 3246 int 3247 vm_suspend_cpu(struct vm *vm, int vcpuid) 3248 { 3249 int i; 3250 3251 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3252 return (EINVAL); 3253 3254 if (vcpuid == -1) { 3255 vm->debug_cpus = vm->active_cpus; 3256 for (i = 0; i < vm->maxcpus; i++) { 3257 if (CPU_ISSET(i, &vm->active_cpus)) 3258 vcpu_notify_event(vm, i); 3259 } 3260 } else { 3261 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3262 return (EINVAL); 3263 3264 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3265 vcpu_notify_event(vm, vcpuid); 3266 } 3267 return (0); 3268 } 3269 3270 int 3271 vm_resume_cpu(struct vm *vm, int vcpuid) 3272 { 3273 3274 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3275 return (EINVAL); 3276 3277 if (vcpuid == -1) { 3278 CPU_ZERO(&vm->debug_cpus); 3279 } else { 3280 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3281 return (EINVAL); 3282 3283 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3284 } 3285 return (0); 3286 } 3287 3288 static bool 3289 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3290 uint64_t entry_rip) 3291 { 3292 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3293 struct vm_exit *vme = &vcpu->exitinfo; 3294 bool bail = false; 3295 3296 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3297 3298 if (vm->suspend) { 3299 if (on_entry) { 3300 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3301 vm->suspend < VM_SUSPEND_LAST); 3302 3303 vme->exitcode = VM_EXITCODE_SUSPENDED; 3304 vme->u.suspended.how = vm->suspend; 3305 } else { 3306 /* 3307 * Handling VM suspend is complicated, so if that 3308 * condition is detected outside of VM-entry itself, 3309 * just emit a BOGUS exitcode so we take a lap to pick 3310 * up the event during an entry and are directed into 3311 * the vm_handle_suspend() logic. 3312 */ 3313 vme->exitcode = VM_EXITCODE_BOGUS; 3314 } 3315 bail = true; 3316 } 3317 if (vcpu->reqidle) { 3318 vme->exitcode = VM_EXITCODE_REQIDLE; 3319 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3320 3321 if (!on_entry) { 3322 /* 3323 * A reqidle request detected outside of VM-entry can be 3324 * handled directly by clearing the request (and taking 3325 * a lap to userspace). 
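 *
 * The vcpu lock must already be held in that case (hence the assertion
 * below); sleeping callers such as vm_handle_run_state() invoke
 * vcpu_sleep_bailout_checks() with the lock held around cv_wait_sig().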
3326 */ 3327 vcpu_assert_locked(vcpu); 3328 vcpu->reqidle = 0; 3329 } 3330 bail = true; 3331 } 3332 if (vcpu_should_yield(vm, vcpuid)) { 3333 vme->exitcode = VM_EXITCODE_BOGUS; 3334 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3335 bail = true; 3336 } 3337 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3338 vme->exitcode = VM_EXITCODE_DEBUG; 3339 bail = true; 3340 } 3341 3342 if (bail) { 3343 if (on_entry) { 3344 /* 3345 * If bailing out during VM-entry, the current %rip must 3346 * be recorded in the exitinfo. 3347 */ 3348 vme->rip = entry_rip; 3349 } 3350 vme->inst_length = 0; 3351 } 3352 return (bail); 3353 } 3354 3355 static bool 3356 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3357 { 3358 /* 3359 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or 3360 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3361 * structure, and we would only modify the exitcode. 3362 */ 3363 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3364 } 3365 3366 bool 3367 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3368 { 3369 /* 3370 * Bail-out checks done as part of VM entry require an updated %rip to 3371 * populate the vm_exit struct if any of the conditions of interest are 3372 * matched in the check. 3373 */ 3374 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3375 } 3376 3377 cpuset_t 3378 vm_active_cpus(struct vm *vm) 3379 { 3380 3381 return (vm->active_cpus); 3382 } 3383 3384 cpuset_t 3385 vm_debug_cpus(struct vm *vm) 3386 { 3387 3388 return (vm->debug_cpus); 3389 } 3390 3391 cpuset_t 3392 vm_suspended_cpus(struct vm *vm) 3393 { 3394 3395 return (vm->suspended_cpus); 3396 } 3397 3398 void * 3399 vcpu_stats(struct vm *vm, int vcpuid) 3400 { 3401 3402 return (vm->vcpu[vcpuid].stats); 3403 } 3404 3405 int 3406 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3407 { 3408 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3409 return (EINVAL); 3410 3411 *state = vm->vcpu[vcpuid].x2apic_state; 3412 3413 return (0); 3414 } 3415 3416 int 3417 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3418 { 3419 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3420 return (EINVAL); 3421 3422 if (state >= X2APIC_STATE_LAST) 3423 return (EINVAL); 3424 3425 vm->vcpu[vcpuid].x2apic_state = state; 3426 3427 vlapic_set_x2apic_state(vm, vcpuid, state); 3428 3429 return (0); 3430 } 3431 3432 /* 3433 * This function is called to ensure that a vcpu "sees" a pending event 3434 * as soon as possible: 3435 * - If the vcpu thread is sleeping then it is woken up. 3436 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3437 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3438 */ 3439 static void 3440 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3441 { 3442 int hostcpu; 3443 3444 ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); 3445 3446 hostcpu = vcpu->hostcpu; 3447 if (vcpu->state == VCPU_RUNNING) { 3448 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3449 if (hostcpu != curcpu) { 3450 if (ntype == VCPU_NOTIFY_APIC) { 3451 vlapic_post_intr(vcpu->vlapic, hostcpu); 3452 } else { 3453 poke_cpu(hostcpu); 3454 } 3455 } else { 3456 /* 3457 * If the 'vcpu' is running on 'curcpu' then it must 3458 * be sending a notification to itself (e.g. SELF_IPI). 3459 * The pending event will be picked up when the vcpu 3460 * transitions back to guest context. 
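 *
 * To summarize the surrounding dispatch: a running vCPU on another host
 * CPU is reached either by posting the interrupt directly through the
 * vlapic (VCPU_NOTIFY_APIC) or by poking that CPU to force an exit; a
 * sleeping vCPU is woken via its condvar; and an idle or frozen vCPU
 * needs no prodding, since it will observe the pending event before it
 * is allowed back into guest context.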
3461 */ 3462 } 3463 } else { 3464 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3465 "with hostcpu %d", vcpu->state, hostcpu)); 3466 if (vcpu->state == VCPU_SLEEPING) { 3467 cv_signal(&vcpu->vcpu_cv); 3468 } 3469 } 3470 } 3471 3472 void 3473 vcpu_notify_event(struct vm *vm, int vcpuid) 3474 { 3475 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3476 3477 vcpu_lock(vcpu); 3478 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3479 vcpu_unlock(vcpu); 3480 } 3481 3482 void 3483 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3484 { 3485 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3486 3487 if (ntype == VCPU_NOTIFY_NONE) { 3488 return; 3489 } 3490 3491 vcpu_lock(vcpu); 3492 vcpu_notify_event_locked(vcpu, ntype); 3493 vcpu_unlock(vcpu); 3494 } 3495 3496 void 3497 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3498 { 3499 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3500 hrtime_t now = gethrtime(); 3501 3502 ASSERT3U(ustate, !=, vcpu->ustate); 3503 ASSERT3S(ustate, <, VU_MAX); 3504 ASSERT3S(ustate, >=, VU_INIT); 3505 3506 hrtime_t delta = now - vcpu->ustate_when; 3507 vcpu->ustate_total[vcpu->ustate] += delta; 3508 3509 membar_producer(); 3510 3511 vcpu->ustate_when = now; 3512 vcpu->ustate = ustate; 3513 } 3514 3515 struct vmspace * 3516 vm_get_vmspace(struct vm *vm) 3517 { 3518 3519 return (vm->vmspace); 3520 } 3521 3522 struct vm_client * 3523 vm_get_vmclient(struct vm *vm, int vcpuid) 3524 { 3525 return (vm->vcpu[vcpuid].vmclient); 3526 } 3527 3528 int 3529 vm_apicid2vcpuid(struct vm *vm, int apicid) 3530 { 3531 /* 3532 * XXX apic id is assumed to be numerically identical to vcpu id 3533 */ 3534 return (apicid); 3535 } 3536 3537 struct vatpic * 3538 vm_atpic(struct vm *vm) 3539 { 3540 return (vm->vatpic); 3541 } 3542 3543 struct vatpit * 3544 vm_atpit(struct vm *vm) 3545 { 3546 return (vm->vatpit); 3547 } 3548 3549 struct vpmtmr * 3550 vm_pmtmr(struct vm *vm) 3551 { 3552 3553 return (vm->vpmtmr); 3554 } 3555 3556 struct vrtc * 3557 vm_rtc(struct vm *vm) 3558 { 3559 3560 return (vm->vrtc); 3561 } 3562 3563 enum vm_reg_name 3564 vm_segment_name(int seg) 3565 { 3566 static enum vm_reg_name seg_names[] = { 3567 VM_REG_GUEST_ES, 3568 VM_REG_GUEST_CS, 3569 VM_REG_GUEST_SS, 3570 VM_REG_GUEST_DS, 3571 VM_REG_GUEST_FS, 3572 VM_REG_GUEST_GS 3573 }; 3574 3575 KASSERT(seg >= 0 && seg < nitems(seg_names), 3576 ("%s: invalid segment encoding %d", __func__, seg)); 3577 return (seg_names[seg]); 3578 } 3579 3580 void 3581 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3582 uint_t num_copyinfo) 3583 { 3584 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3585 if (copyinfo[idx].cookie != NULL) { 3586 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3587 } 3588 } 3589 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3590 } 3591 3592 int 3593 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3594 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3595 uint_t num_copyinfo, int *fault) 3596 { 3597 uint_t idx, nused; 3598 size_t n, off, remaining; 3599 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3600 3601 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3602 3603 nused = 0; 3604 remaining = len; 3605 while (remaining > 0) { 3606 uint64_t gpa; 3607 int error; 3608 3609 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3610 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3611 if (error || *fault) 3612 return (error); 3613 off = gpa & PAGEOFFSET; 3614 n 
= min(remaining, PAGESIZE - off); 3615 copyinfo[nused].gpa = gpa; 3616 copyinfo[nused].len = n; 3617 remaining -= n; 3618 gla += n; 3619 nused++; 3620 } 3621 3622 for (idx = 0; idx < nused; idx++) { 3623 vm_page_t *vmp; 3624 caddr_t hva; 3625 3626 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3627 if (vmp == NULL) { 3628 break; 3629 } 3630 if ((prot & PROT_WRITE) != 0) { 3631 hva = (caddr_t)vmp_get_writable(vmp); 3632 } else { 3633 hva = (caddr_t)vmp_get_readable(vmp); 3634 } 3635 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3636 copyinfo[idx].cookie = vmp; 3637 copyinfo[idx].prot = prot; 3638 } 3639 3640 if (idx != nused) { 3641 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3642 return (EFAULT); 3643 } else { 3644 *fault = 0; 3645 return (0); 3646 } 3647 } 3648 3649 void 3650 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3651 size_t len) 3652 { 3653 char *dst; 3654 int idx; 3655 3656 dst = kaddr; 3657 idx = 0; 3658 while (len > 0) { 3659 ASSERT(copyinfo[idx].prot & PROT_READ); 3660 3661 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3662 len -= copyinfo[idx].len; 3663 dst += copyinfo[idx].len; 3664 idx++; 3665 } 3666 } 3667 3668 void 3669 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3670 struct vm_copyinfo *copyinfo, size_t len) 3671 { 3672 const char *src; 3673 int idx; 3674 3675 src = kaddr; 3676 idx = 0; 3677 while (len > 0) { 3678 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3679 3680 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3681 len -= copyinfo[idx].len; 3682 src += copyinfo[idx].len; 3683 idx++; 3684 } 3685 } 3686 3687 /* 3688 * Return the amount of in-use and wired memory for the VM. Since 3689 * these are global stats, only return the values with for vCPU 0 3690 */ 3691 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3692 3693 static void 3694 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3695 { 3696 if (vcpu == 0) { 3697 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3698 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3699 } 3700 } 3701 3702 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3703 3704 int 3705 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3706 uint8_t bytes, uint32_t *val) 3707 { 3708 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3709 } 3710 3711 /* 3712 * bhyve-internal interfaces to attach or detach IO port handlers. 3713 * Must be called with VM write lock held for safety. 3714 */ 3715 int 3716 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3717 void **cookie) 3718 { 3719 int err; 3720 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3721 if (err == 0) { 3722 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3723 } 3724 return (err); 3725 } 3726 int 3727 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3728 void **old_arg) 3729 { 3730 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3731 int err; 3732 3733 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3734 if (err == 0) { 3735 *cookie = NULL; 3736 } 3737 return (err); 3738 } 3739 3740 /* 3741 * External driver interfaces to attach or detach IO port handlers. 3742 * Must be called with VM write lock held for safety. 
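 *
 * The returned cookie is the IOP_GEN_COOKIE() encoding of the handler,
 * its argument, and the port, which lets vm_ioport_unhook() recover the
 * port and verify that the caller is tearing down the same hook it
 * originally attached.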
3743 */ 3744 int 3745 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3746 void *arg, void **cookie) 3747 { 3748 int err; 3749 3750 if (port == 0) { 3751 return (EINVAL); 3752 } 3753 3754 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3755 if (err == 0) { 3756 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3757 } 3758 return (err); 3759 } 3760 void 3761 vm_ioport_unhook(struct vm *vm, void **cookie) 3762 { 3763 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3764 ioport_handler_t old_func; 3765 void *old_arg; 3766 int err; 3767 3768 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3769 3770 /* ioport-hook-using drivers are expected to be well-behaved */ 3771 VERIFY0(err); 3772 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3773 3774 *cookie = NULL; 3775 } 3776 3777 int 3778 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3779 { 3780 struct vm *vm = ksp->ks_private; 3781 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3782 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3783 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3784 3785 ASSERT3U(vcpuid, <, VM_MAXCPU); 3786 3787 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3788 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3789 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3790 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3791 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3792 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3793 3794 return (0); 3795 } 3796 3797 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3798 3799 static inline bool 3800 vmm_data_is_cpu_specific(uint16_t data_class) 3801 { 3802 switch (data_class) { 3803 case VDC_REGISTER: 3804 case VDC_MSR: 3805 case VDC_FPU: 3806 case VDC_LAPIC: 3807 return (true); 3808 default: 3809 return (false); 3810 } 3811 } 3812 3813 static int 3814 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) 3815 { 3816 const vmm_data_version_entry_t **vdpp, *vdp; 3817 3818 ASSERT(resp != NULL); 3819 ASSERT(req->vdr_result_len != NULL); 3820 3821 SET_FOREACH(vdpp, vmm_data_version_entries) { 3822 vdp = *vdpp; 3823 if (vdp->vdve_class == req->vdr_class && 3824 vdp->vdve_version == req->vdr_version) { 3825 /* 3826 * Enforce any data length expectation expressed by the 3827 * provider for this data. 3828 */ 3829 if (vdp->vdve_len_expect != 0 && 3830 vdp->vdve_len_expect > req->vdr_len) { 3831 *req->vdr_result_len = vdp->vdve_len_expect; 3832 return (ENOSPC); 3833 } 3834 *resp = vdp; 3835 return (0); 3836 } 3837 } 3838 return (EINVAL); 3839 } 3840 3841 static void * 3842 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3843 { 3844 switch (req->vdr_class) { 3845 /* per-cpu data/devices */ 3846 case VDC_LAPIC: 3847 return (vm_lapic(vm, vcpuid)); 3848 case VDC_VMM_ARCH: 3849 return (vm); 3850 3851 case VDC_FPU: 3852 case VDC_REGISTER: 3853 case VDC_MSR: 3854 /* 3855 * These have per-CPU handling which is dispatched outside 3856 * vmm_data_version_entries listing. 
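 *
 * Concretely, vmm_data_read() and vmm_data_write() special-case VDC_MSR
 * (via vmm_data_read_msrs()/vmm_data_write_msrs()) and, eventually,
 * VDC_FPU and VDC_REGISTER when this lookup returns NULL, rather than
 * going through a vdve_readf/vdve_writef handler.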
3857 */ 3858 return (NULL); 3859 3860 /* system-wide data/devices */ 3861 case VDC_IOAPIC: 3862 return (vm->vioapic); 3863 case VDC_ATPIT: 3864 return (vm->vatpit); 3865 case VDC_ATPIC: 3866 return (vm->vatpic); 3867 case VDC_HPET: 3868 return (vm->vhpet); 3869 case VDC_PM_TIMER: 3870 return (vm->vpmtmr); 3871 case VDC_RTC: 3872 return (vm->vrtc); 3873 3874 default: 3875 /* The data class will have been validated by now */ 3876 panic("Unexpected class %u", req->vdr_class); 3877 } 3878 } 3879 3880 const uint32_t arch_msr_iter[] = { 3881 MSR_EFER, 3882 3883 /* 3884 * While gsbase and fsbase are accessible via the MSR accessors, they 3885 * are not included in MSR iteration since they are covered by the 3886 * segment descriptor interface too. 3887 */ 3888 MSR_KGSBASE, 3889 3890 MSR_STAR, 3891 MSR_LSTAR, 3892 MSR_CSTAR, 3893 MSR_SF_MASK, 3894 3895 MSR_SYSENTER_CS_MSR, 3896 MSR_SYSENTER_ESP_MSR, 3897 MSR_SYSENTER_EIP_MSR, 3898 MSR_PAT, 3899 }; 3900 const uint32_t generic_msr_iter[] = { 3901 MSR_TSC, 3902 MSR_MTRRcap, 3903 MSR_MTRRdefType, 3904 3905 MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, 3906 MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, 3907 MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, 3908 3909 MSR_MTRR16kBase, MSR_MTRR16kBase + 1, 3910 3911 MSR_MTRR64kBase, 3912 }; 3913 3914 static int 3915 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3916 { 3917 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3918 VERIFY3U(req->vdr_version, ==, 1); 3919 3920 const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) 3921 + (VMM_MTRR_VAR_MAX * 2); 3922 const uint32_t output_len = 3923 num_msrs * sizeof (struct vdi_field_entry_v1); 3924 *req->vdr_result_len = output_len; 3925 3926 if (req->vdr_len < output_len) { 3927 return (ENOSPC); 3928 } 3929 3930 struct vdi_field_entry_v1 *entryp = req->vdr_data; 3931 for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { 3932 const uint32_t msr = arch_msr_iter[i]; 3933 uint64_t val = 0; 3934 3935 int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); 3936 /* All of these MSRs are expected to work */ 3937 VERIFY0(err); 3938 entryp->vfe_ident = msr; 3939 entryp->vfe_value = val; 3940 } 3941 3942 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3943 for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { 3944 const uint32_t msr = generic_msr_iter[i]; 3945 3946 entryp->vfe_ident = msr; 3947 switch (msr) { 3948 case MSR_TSC: 3949 /* 3950 * Communicate this as the difference from the VM-wide 3951 * offset of the boot time. 3952 */ 3953 entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; 3954 break; 3955 case MSR_MTRRcap: 3956 case MSR_MTRRdefType: 3957 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 3958 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 3959 case MSR_MTRR64kBase: { 3960 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3961 VERIFY0(err); 3962 break; 3963 } 3964 default: 3965 panic("unexpected msr export %x", msr); 3966 } 3967 } 3968 /* Copy the variable MTRRs */ 3969 for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { 3970 const uint32_t msr = MSR_MTRRVarBase + i; 3971 3972 entryp->vfe_ident = msr; 3973 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3974 VERIFY0(err); 3975 } 3976 return (0); 3977 } 3978 3979 static int 3980 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3981 { 3982 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3983 VERIFY3U(req->vdr_version, ==, 1); 3984 3985 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 3986 const uint_t entry_count = 3987 req->vdr_len / sizeof (struct vdi_field_entry_v1); 3988 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3989 3990 /* 3991 * First make sure that all of the MSRs can be manipulated. 3992 * For now, this check is done by going though the getmsr handler 3993 */ 3994 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3995 const uint32_t msr = entryp->vfe_ident; 3996 uint64_t val; 3997 int err = 0; 3998 3999 switch (msr) { 4000 case MSR_TSC: 4001 break; 4002 default: 4003 if (is_mtrr_msr(msr)) { 4004 err = vm_rdmtrr(mtrr, msr, &val); 4005 } else { 4006 err = ops->vmgetmsr(vm->cookie, vcpuid, msr, 4007 &val); 4008 } 4009 break; 4010 } 4011 if (err != 0) { 4012 return (err); 4013 } 4014 } 4015 4016 /* 4017 * Fairly confident that all of the 'set' operations are at least 4018 * targeting valid MSRs, continue on. 4019 */ 4020 entryp = req->vdr_data; 4021 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4022 const uint32_t msr = entryp->vfe_ident; 4023 const uint64_t val = entryp->vfe_value; 4024 int err = 0; 4025 4026 switch (msr) { 4027 case MSR_TSC: 4028 vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; 4029 break; 4030 default: 4031 if (is_mtrr_msr(msr)) { 4032 if (msr == MSR_MTRRcap) { 4033 /* 4034 * MTRRcap is read-only. If the current 4035 * value matches the incoming one, 4036 * consider it a success 4037 */ 4038 uint64_t comp; 4039 err = vm_rdmtrr(mtrr, msr, &comp); 4040 if (err != 0 || comp != val) { 4041 err = EINVAL; 4042 } 4043 } else { 4044 err = vm_wrmtrr(mtrr, msr, val); 4045 } 4046 } else { 4047 err = ops->vmsetmsr(vm->cookie, vcpuid, msr, 4048 val); 4049 } 4050 break; 4051 } 4052 if (err != 0) { 4053 return (err); 4054 } 4055 } 4056 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4057 4058 return (0); 4059 } 4060 4061 static const vmm_data_version_entry_t msr_v1 = { 4062 .vdve_class = VDC_MSR, 4063 .vdve_version = 1, 4064 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4065 /* Requires backend-specific dispatch */ 4066 .vdve_readf = NULL, 4067 .vdve_writef = NULL, 4068 }; 4069 VMM_DATA_VERSION(msr_v1); 4070 4071 static const uint32_t vmm_arch_v1_fields[] = { 4072 VAI_TSC_BOOT_OFFSET, 4073 VAI_BOOT_HRTIME, 4074 VAI_TSC_FREQ, 4075 }; 4076 4077 static bool 4078 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) 4079 { 4080 ASSERT(valp != NULL); 4081 4082 switch (ident) { 4083 case VAI_TSC_BOOT_OFFSET: 4084 *valp = vm->boot_tsc_offset; 4085 return (true); 4086 case VAI_BOOT_HRTIME: 4087 *valp = vm->boot_hrtime; 4088 return (true); 4089 case VAI_TSC_FREQ: 4090 /* 4091 * Since the system TSC calibration is not public, just derive 4092 * it from the scaling functions available. 
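 *
 * unscalehrtime() converts an hrtime duration back into TSC ticks, so
 * feeding it one second (NANOSEC) yields the tick rate in Hz, i.e. an
 * approximation of the calibrated TSC frequency.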
4093 */ 4094 *valp = unscalehrtime(NANOSEC); 4095 return (true); 4096 default: 4097 break; 4098 } 4099 return (false); 4100 } 4101 4102 static int 4103 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) 4104 { 4105 struct vm *vm = arg; 4106 4107 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4108 VERIFY3U(req->vdr_version, ==, 1); 4109 4110 struct vdi_field_entry_v1 *entryp = req->vdr_data; 4111 4112 /* Specific fields requested */ 4113 if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { 4114 const uint_t count = 4115 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4116 4117 for (uint_t i = 0; i < count; i++, entryp++) { 4118 if (!vmm_read_arch_field(vm, entryp->vfe_ident, 4119 &entryp->vfe_value)) { 4120 return (EINVAL); 4121 } 4122 } 4123 *req->vdr_result_len = 4124 count * sizeof (struct vdi_field_entry_v1); 4125 return (0); 4126 } 4127 4128 /* Emit all of the possible values */ 4129 const uint32_t total_size = nitems(vmm_arch_v1_fields) * 4130 sizeof (struct vdi_field_entry_v1); 4131 *req->vdr_result_len = total_size; 4132 if (req->vdr_len < total_size) { 4133 return (ENOSPC); 4134 } 4135 for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { 4136 entryp->vfe_ident = vmm_arch_v1_fields[i]; 4137 VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, 4138 &entryp->vfe_value)); 4139 } 4140 return (0); 4141 } 4142 4143 static int 4144 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) 4145 { 4146 struct vm *vm = arg; 4147 4148 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4149 VERIFY3U(req->vdr_version, ==, 1); 4150 4151 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 4152 const uint_t entry_count = 4153 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4154 4155 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4156 const uint64_t val = entryp->vfe_value; 4157 4158 switch (entryp->vfe_ident) { 4159 case VAI_TSC_BOOT_OFFSET: 4160 vm->boot_tsc_offset = val; 4161 break; 4162 case VAI_BOOT_HRTIME: 4163 vm->boot_hrtime = val; 4164 break; 4165 case VAI_TSC_FREQ: 4166 /* Guest TSC frequency not (currently) adjustable */ 4167 return (EPERM); 4168 default: 4169 return (EINVAL); 4170 } 4171 } 4172 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4173 return (0); 4174 } 4175 4176 static const vmm_data_version_entry_t vmm_arch_v1 = { 4177 .vdve_class = VDC_VMM_ARCH, 4178 .vdve_version = 1, 4179 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4180 .vdve_readf = vmm_data_read_vmm_arch, 4181 .vdve_writef = vmm_data_write_vmm_arch, 4182 }; 4183 VMM_DATA_VERSION(vmm_arch_v1); 4184 4185 static int 4186 vmm_data_read_versions(void *arg, const vmm_data_req_t *req) 4187 { 4188 VERIFY3U(req->vdr_class, ==, VDC_VERSION); 4189 VERIFY3U(req->vdr_version, ==, 1); 4190 4191 const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * 4192 sizeof (struct vdi_version_entry_v1); 4193 4194 /* Make sure there is room for all of the entries */ 4195 *req->vdr_result_len = total_size; 4196 if (req->vdr_len < *req->vdr_result_len) { 4197 return (ENOSPC); 4198 } 4199 4200 struct vdi_version_entry_v1 *entryp = req->vdr_data; 4201 const vmm_data_version_entry_t **vdpp; 4202 SET_FOREACH(vdpp, vmm_data_version_entries) { 4203 const vmm_data_version_entry_t *vdp = *vdpp; 4204 4205 entryp->vve_class = vdp->vdve_class; 4206 entryp->vve_version = vdp->vdve_version; 4207 entryp->vve_len_expect = vdp->vdve_len_expect; 4208 entryp->vve_len_per_item = vdp->vdve_len_per_item; 4209 entryp++; 4210 } 4211 return (0); 4212 } 4213 4214 static int 4215 
vmm_data_write_versions(void *arg, const vmm_data_req_t *req) 4216 { 4217 /* Writing to the version information makes no sense */ 4218 return (EPERM); 4219 } 4220 4221 static const vmm_data_version_entry_t versions_v1 = { 4222 .vdve_class = VDC_VERSION, 4223 .vdve_version = 1, 4224 .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), 4225 .vdve_readf = vmm_data_read_versions, 4226 .vdve_writef = vmm_data_write_versions, 4227 }; 4228 VMM_DATA_VERSION(versions_v1); 4229 4230 int 4231 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4232 { 4233 int err = 0; 4234 4235 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4236 if (vcpuid >= VM_MAXCPU) { 4237 return (EINVAL); 4238 } 4239 } 4240 4241 const vmm_data_version_entry_t *entry = NULL; 4242 err = vmm_data_find(req, &entry); 4243 if (err != 0) { 4244 return (err); 4245 } 4246 ASSERT(entry != NULL); 4247 4248 void *datap = vmm_data_from_class(req, vm, vcpuid); 4249 if (datap != NULL) { 4250 err = entry->vdve_readf(datap, req); 4251 4252 /* 4253 * Successful reads of fixed-length data should populate the 4254 * length of that result. 4255 */ 4256 if (err == 0 && entry->vdve_len_expect != 0) { 4257 *req->vdr_result_len = entry->vdve_len_expect; 4258 } 4259 } else { 4260 switch (req->vdr_class) { 4261 case VDC_MSR: 4262 err = vmm_data_read_msrs(vm, vcpuid, req); 4263 break; 4264 case VDC_FPU: 4265 /* TODO: wire up to xsave export via hma_fpu iface */ 4266 err = EINVAL; 4267 break; 4268 case VDC_REGISTER: 4269 default: 4270 err = EINVAL; 4271 break; 4272 } 4273 } 4274 4275 return (err); 4276 } 4277 4278 int 4279 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4280 { 4281 int err = 0; 4282 4283 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4284 if (vcpuid >= VM_MAXCPU) { 4285 return (EINVAL); 4286 } 4287 } 4288 4289 const vmm_data_version_entry_t *entry = NULL; 4290 err = vmm_data_find(req, &entry); 4291 if (err != 0) { 4292 return (err); 4293 } 4294 ASSERT(entry != NULL); 4295 4296 void *datap = vmm_data_from_class(req, vm, vcpuid); 4297 if (datap != NULL) { 4298 err = entry->vdve_writef(datap, req); 4299 /* 4300 * Successful writes of fixed-length data should populate the 4301 * length of that result. 4302 */ 4303 if (err == 0 && entry->vdve_len_expect != 0) { 4304 *req->vdr_result_len = entry->vdve_len_expect; 4305 } 4306 } else { 4307 switch (req->vdr_class) { 4308 case VDC_MSR: 4309 err = vmm_data_write_msrs(vm, vcpuid, req); 4310 break; 4311 case VDC_FPU: 4312 /* TODO: wire up to xsave import via hma_fpu iface */ 4313 err = EINVAL; 4314 break; 4315 case VDC_REGISTER: 4316 default: 4317 err = EINVAL; 4318 break; 4319 } 4320 } 4321 4322 return (err); 4323 } 4324
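/*
 * A rough sketch of how a consumer, e.g. the ioctl plumbing sitting above
 * this file, might drive the data interface, including the ENOSPC resizing
 * handshake. The buffer management and surrounding plumbing are assumed
 * here and not shown; only the request fields and entry point below come
 * from this file:
 *
 *	uint32_t need = 0;
 *	vmm_data_req_t req = {
 *		.vdr_class = VDC_VMM_ARCH,
 *		.vdr_version = 1,
 *		.vdr_len = bufsz,
 *		.vdr_data = buf,
 *		.vdr_result_len = &need,
 *	};
 *	int err = vmm_data_read(vm, vcpuid, &req);
 *	if (err == ENOSPC) {
 *		// reallocate 'buf' to 'need' bytes, update vdr_len, retry
 *	}
 */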