/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/kernel.h> 52 #include <sys/module.h> 53 #include <sys/sysctl.h> 54 #include <sys/kmem.h> 55 #include <sys/pcpu.h> 56 #include <sys/mutex.h> 57 #include <sys/proc.h> 58 #include <sys/rwlock.h> 59 #include <sys/sched.h> 60 #include <sys/systm.h> 61 #include <sys/sunddi.h> 62 #include <sys/hma.h> 63 64 #include <machine/md_var.h> 65 #include <x86/psl.h> 66 #include <x86/apicreg.h> 67 68 #include <machine/specialreg.h> 69 #include <machine/vmm.h> 70 #include <machine/vmm_dev.h> 71 #include <machine/vmparam.h> 72 #include <sys/vmm_instruction_emul.h> 73 #include <sys/vmm_vm.h> 74 #include <sys/vmm_gpt.h> 75 #include <sys/vmm_data.h> 76 77 #include "vmm_ioport.h" 78 #include "vmm_host.h" 79 #include "vmm_util.h" 80 #include "vatpic.h" 81 #include "vatpit.h" 82 #include "vhpet.h" 83 #include "vioapic.h" 84 #include "vlapic.h" 85 #include "vpmtmr.h" 86 #include "vrtc.h" 87 #include "vmm_stat.h" 88 #include "vmm_lapic.h" 89 90 #include "io/ppt.h" 91 #include "io/iommu.h" 92 93 struct vlapic; 94 95 /* Flags for vtc_status */ 96 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ 97 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ 98 99 typedef struct vm_thread_ctx { 100 struct vm *vtc_vm; 101 int vtc_vcpuid; 102 uint_t vtc_status; 103 enum vcpu_ustate vtc_ustate; 104 } vm_thread_ctx_t; 105 106 #define VMM_MTRR_VAR_MAX 10 107 #define VMM_MTRR_DEF_MASK \ 108 (MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE) 109 #define VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE) 110 #define VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID) 111 struct vm_mtrr { 112 uint64_t def_type; 113 uint64_t fixed4k[8]; 114 uint64_t fixed16k[2]; 115 uint64_t fixed64k; 116 struct { 117 uint64_t base; 118 uint64_t mask; 119 } var[VMM_MTRR_VAR_MAX]; 120 }; 121 122 /* 123 * Initialization: 124 * (a) allocated when vcpu is created 125 * (i) initialized when vcpu is created and when it is reinitialized 126 * (o) initialized the first time the vcpu is created 127 * (x) initialized before use 128 */ 129 struct vcpu { 130 /* (o) protects state, run_state, hostcpu, sipi_vector */ 131 kmutex_t lock; 132 133 enum vcpu_state state; /* (o) vcpu state */ 134 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ 135 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ 136 kcondvar_t state_cv; /* (o) IDLE-transition cv */ 137 int hostcpu; /* (o) vcpu's current host cpu */ 138 int lastloccpu; /* (o) last host cpu localized to */ 139 int reqidle; /* (i) request vcpu to idle */ 140 struct vlapic *vlapic; /* (i) APIC device model */ 141 enum x2apic_state x2apic_state; /* (i) APIC mode */ 142 uint64_t exit_intinfo; /* (i) events pending at VM exit */ 143 uint64_t exc_pending; /* (i) exception pending */ 144 bool nmi_pending; /* (i) NMI pending */ 145 bool extint_pending; /* (i) INTR pending */ 146 147 uint8_t sipi_vector; /* (i) SIPI vector */ 148 hma_fpu_t *guestfpu; /* (a,i) guest fpu state */ 149 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 150 void *stats; /* (a,i) statistics */ 151 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 152 uint64_t nextrip; /* (x) next instruction to execute */ 153 struct vie *vie_ctx; /* (x) instruction emulation context */ 154 vm_client_t *vmclient; /* (a) VM-system client */ 155 uint64_t tsc_offset; /* (x) offset from host TSC */ 156 struct vm_mtrr mtrr; /* (i) 
vcpu's MTRR */ 157 158 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ 159 hrtime_t ustate_when; /* (i) time of last ustate change */ 160 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */ 161 vm_thread_ctx_t vtc; /* (o) thread state for ctxops */ 162 struct ctxop *ctxop; /* (o) ctxop storage for vcpu */ 163 }; 164 165 #define vcpu_lock(v) mutex_enter(&((v)->lock)) 166 #define vcpu_unlock(v) mutex_exit(&((v)->lock)) 167 #define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock))) 168 169 struct mem_seg { 170 size_t len; 171 bool sysmem; 172 vm_object_t *object; 173 }; 174 #define VM_MAX_MEMSEGS 5 175 176 struct mem_map { 177 vm_paddr_t gpa; 178 size_t len; 179 vm_ooffset_t segoff; 180 int segid; 181 int prot; 182 int flags; 183 }; 184 #define VM_MAX_MEMMAPS 8 185 186 /* 187 * Initialization: 188 * (o) initialized the first time the VM is created 189 * (i) initialized when VM is created and when it is reinitialized 190 * (x) initialized before use 191 */ 192 struct vm { 193 void *cookie; /* (i) cpu-specific data */ 194 void *iommu; /* (x) iommu-specific data */ 195 struct vhpet *vhpet; /* (i) virtual HPET */ 196 struct vioapic *vioapic; /* (i) virtual ioapic */ 197 struct vatpic *vatpic; /* (i) virtual atpic */ 198 struct vatpit *vatpit; /* (i) virtual atpit */ 199 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 200 struct vrtc *vrtc; /* (o) virtual RTC */ 201 volatile cpuset_t active_cpus; /* (i) active vcpus */ 202 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */ 203 int suspend; /* (i) stop VM execution */ 204 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 205 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 206 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 207 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 208 struct vmspace *vmspace; /* (o) guest's address space */ 209 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 210 /* The following describe the vm cpu topology */ 211 uint16_t sockets; /* (o) num of sockets */ 212 uint16_t cores; /* (o) num of cores/socket */ 213 uint16_t threads; /* (o) num of threads/core */ 214 uint16_t maxcpus; /* (o) max pluggable cpus */ 215 216 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ 217 hrtime_t boot_hrtime; /* (i) hrtime at VM boot */ 218 219 struct ioport_config ioports; /* (o) ioport handling */ 220 221 bool mem_transient; /* (o) alloc transient memory */ 222 }; 223 224 static int vmm_initialized; 225 226 227 static void 228 nullop_panic(void) 229 { 230 panic("null vmm operation call"); 231 } 232 233 /* Do not allow use of an un-set `ops` to do anything but panic */ 234 static struct vmm_ops vmm_ops_null = { 235 .init = (vmm_init_func_t)nullop_panic, 236 .cleanup = (vmm_cleanup_func_t)nullop_panic, 237 .resume = (vmm_resume_func_t)nullop_panic, 238 .vminit = (vmi_init_func_t)nullop_panic, 239 .vmrun = (vmi_run_func_t)nullop_panic, 240 .vmcleanup = (vmi_cleanup_func_t)nullop_panic, 241 .vmgetreg = (vmi_get_register_t)nullop_panic, 242 .vmsetreg = (vmi_set_register_t)nullop_panic, 243 .vmgetdesc = (vmi_get_desc_t)nullop_panic, 244 .vmsetdesc = (vmi_set_desc_t)nullop_panic, 245 .vmgetcap = (vmi_get_cap_t)nullop_panic, 246 .vmsetcap = (vmi_set_cap_t)nullop_panic, 247 .vlapic_init = (vmi_vlapic_init)nullop_panic, 248 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, 249 .vmsavectx = (vmi_savectx)nullop_panic, 250 .vmrestorectx = (vmi_restorectx)nullop_panic, 251 }; 252 253 static struct vmm_ops *ops = 
&vmm_ops_null; 254 static vmm_pte_ops_t *pte_ops = NULL; 255 256 #define VMM_INIT() ((*ops->init)()) 257 #define VMM_CLEANUP() ((*ops->cleanup)()) 258 #define VMM_RESUME() ((*ops->resume)()) 259 260 #define VMINIT(vm) ((*ops->vminit)(vm)) 261 #define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) 262 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) 263 264 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) 265 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) 266 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc)) 267 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc)) 268 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv)) 269 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val)) 270 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu)) 271 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic)) 272 273 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 274 #define fpu_stop_emulating() clts() 275 276 SDT_PROVIDER_DEFINE(vmm); 277 278 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 279 NULL); 280 281 /* 282 * Halt the guest if all vcpus are executing a HLT instruction with 283 * interrupts disabled. 284 */ 285 static int halt_detection_enabled = 1; 286 287 /* Trap into hypervisor on all guest exceptions and reflect them back */ 288 static int trace_guest_exceptions; 289 290 static void vm_free_memmap(struct vm *vm, int ident); 291 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 292 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); 293 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); 294 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); 295 296 static void vmm_savectx(void *); 297 static void vmm_restorectx(void *); 298 static const struct ctxop_template vmm_ctxop_tpl = { 299 .ct_rev = CTXOP_TPL_REV, 300 .ct_save = vmm_savectx, 301 .ct_restore = vmm_restorectx, 302 }; 303 304 #ifdef KTR 305 static const char * 306 vcpu_state2str(enum vcpu_state state) 307 { 308 309 switch (state) { 310 case VCPU_IDLE: 311 return ("idle"); 312 case VCPU_FROZEN: 313 return ("frozen"); 314 case VCPU_RUNNING: 315 return ("running"); 316 case VCPU_SLEEPING: 317 return ("sleeping"); 318 default: 319 return ("unknown"); 320 } 321 } 322 #endif 323 324 static void 325 vcpu_cleanup(struct vm *vm, int i, bool destroy) 326 { 327 struct vcpu *vcpu = &vm->vcpu[i]; 328 329 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 330 if (destroy) { 331 vmm_stat_free(vcpu->stats); 332 333 hma_fpu_free(vcpu->guestfpu); 334 vcpu->guestfpu = NULL; 335 336 vie_free(vcpu->vie_ctx); 337 vcpu->vie_ctx = NULL; 338 339 vmc_destroy(vcpu->vmclient); 340 vcpu->vmclient = NULL; 341 342 ctxop_free(vcpu->ctxop); 343 mutex_destroy(&vcpu->lock); 344 } 345 } 346 347 static void 348 vcpu_init(struct vm *vm, int vcpu_id, bool create) 349 { 350 struct vcpu *vcpu; 351 352 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, 353 ("vcpu_init: invalid vcpu %d", vcpu_id)); 354 355 vcpu = &vm->vcpu[vcpu_id]; 356 357 if (create) { 358 mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL); 359 360 vcpu->state = VCPU_IDLE; 361 vcpu->hostcpu = NOCPU; 362 vcpu->lastloccpu = NOCPU; 363 vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); 364 vcpu->stats = vmm_stat_alloc(); 365 vcpu->vie_ctx = vie_alloc(); 366 367 vcpu->ustate = VU_INIT; 368 vcpu->ustate_when = gethrtime(); 369 370 vcpu->vtc.vtc_vm = vm; 
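		/*
		 * Stash this vCPU's identity in its thread context so the
		 * vmm_savectx()/vmm_restorectx() ctxops installed below can
		 * find their way back to it across off-cpu/on-cpu switches.
		 */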
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has necessary hardware
 * and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets.  A reading of the TSC is negated to form the boot offset.
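	 *
	 * For example (illustrative numbers only): if the host TSC reads
	 * 1000000 at this point, boot_tsc_offset is stored as
	 * (uint64_t)-1000000, so a guest TSC computed as
	 * host_tsc + boot_tsc_offset counts up from zero at VM boot.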
524 */ 525 const uint64_t boot_tsc = rdtsc_offset(); 526 vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc); 527 528 /* Convert the boot TSC reading to hrtime */ 529 vm->boot_hrtime = (hrtime_t)boot_tsc; 530 scalehrtime(&vm->boot_hrtime); 531 } 532 533 /* 534 * The default CPU topology is a single thread per package. 535 */ 536 uint_t cores_per_package = 1; 537 uint_t threads_per_core = 1; 538 539 /* 540 * Debugging tunable to enable dirty-page-tracking. 541 * (Remains off by default for now) 542 */ 543 bool gpt_track_dirty = false; 544 545 int 546 vm_create(uint64_t flags, struct vm **retvm) 547 { 548 struct vm *vm; 549 struct vmspace *vmspace; 550 551 /* 552 * If vmm.ko could not be successfully initialized then don't attempt 553 * to create the virtual machine. 554 */ 555 if (!vmm_initialized) 556 return (ENXIO); 557 558 vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); 559 if (vmspace == NULL) 560 return (ENOMEM); 561 562 vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP); 563 564 vm->vmspace = vmspace; 565 vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; 566 for (uint_t i = 0; i < VM_MAXCPU; i++) { 567 vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); 568 } 569 570 vm->sockets = 1; 571 vm->cores = cores_per_package; /* XXX backwards compatibility */ 572 vm->threads = threads_per_core; /* XXX backwards compatibility */ 573 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 574 575 vm_init(vm, true); 576 577 *retvm = vm; 578 return (0); 579 } 580 581 void 582 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 583 uint16_t *threads, uint16_t *maxcpus) 584 { 585 *sockets = vm->sockets; 586 *cores = vm->cores; 587 *threads = vm->threads; 588 *maxcpus = vm->maxcpus; 589 } 590 591 uint16_t 592 vm_get_maxcpus(struct vm *vm) 593 { 594 return (vm->maxcpus); 595 } 596 597 int 598 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 599 uint16_t threads, uint16_t maxcpus) 600 { 601 if (maxcpus != 0) 602 return (EINVAL); /* XXX remove when supported */ 603 if ((sockets * cores * threads) > vm->maxcpus) 604 return (EINVAL); 605 /* XXX need to check sockets * cores * threads == vCPU, how? */ 606 vm->sockets = sockets; 607 vm->cores = cores; 608 vm->threads = threads; 609 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 610 return (0); 611 } 612 613 static void 614 vm_cleanup(struct vm *vm, bool destroy) 615 { 616 struct mem_map *mm; 617 int i; 618 619 ppt_unassign_all(vm); 620 621 if (vm->iommu != NULL) 622 iommu_destroy_domain(vm->iommu); 623 624 /* 625 * Devices which attach their own ioport hooks should be cleaned up 626 * first so they can tear down those registrations. 627 */ 628 vpmtmr_cleanup(vm->vpmtmr); 629 630 vm_inout_cleanup(vm, &vm->ioports); 631 632 if (destroy) 633 vrtc_cleanup(vm->vrtc); 634 else 635 vrtc_reset(vm->vrtc); 636 637 vatpit_cleanup(vm->vatpit); 638 vhpet_cleanup(vm->vhpet); 639 vatpic_cleanup(vm->vatpic); 640 vioapic_cleanup(vm->vioapic); 641 642 for (i = 0; i < vm->maxcpus; i++) 643 vcpu_cleanup(vm, i, destroy); 644 645 VMCLEANUP(vm->cookie); 646 647 /* 648 * System memory is removed from the guest address space only when 649 * the VM is destroyed. This is because the mapping remains the same 650 * across VM reset. 651 * 652 * Device memory can be relocated by the guest (e.g. using PCI BARs) 653 * so those mappings are removed on a VM reset. 
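	 *
	 * The backing mem_segs themselves are only released when the VM is
	 * destroyed (see the loop over VM_MAX_MEMSEGS below).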
654 */ 655 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 656 mm = &vm->mem_maps[i]; 657 if (destroy || !sysmem_mapping(vm, mm)) { 658 vm_free_memmap(vm, i); 659 } else { 660 /* 661 * We need to reset the IOMMU flag so this mapping can 662 * be reused when a VM is rebooted. Since the IOMMU 663 * domain has already been destroyed we can just reset 664 * the flag here. 665 */ 666 mm->flags &= ~VM_MEMMAP_F_IOMMU; 667 } 668 } 669 670 if (destroy) { 671 for (i = 0; i < VM_MAX_MEMSEGS; i++) 672 vm_free_memseg(vm, i); 673 674 vmspace_destroy(vm->vmspace); 675 vm->vmspace = NULL; 676 } 677 } 678 679 void 680 vm_destroy(struct vm *vm) 681 { 682 vm_cleanup(vm, true); 683 kmem_free(vm, sizeof (*vm)); 684 } 685 686 int 687 vm_reinit(struct vm *vm, uint64_t flags) 688 { 689 /* A virtual machine can be reset only if all vcpus are suspended. */ 690 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) { 691 if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) { 692 return (EBUSY); 693 } 694 695 /* 696 * Force the VM (and all its vCPUs) into a suspended state. 697 * This should be quick and easy, since the vm_reinit() call is 698 * made while holding the VM write lock, which requires holding 699 * all of the vCPUs in the VCPU_FROZEN state. 700 */ 701 (void) atomic_cmpset_int((uint_t *)&vm->suspend, 0, 702 VM_SUSPEND_RESET); 703 for (uint_t i = 0; i < vm->maxcpus; i++) { 704 struct vcpu *vcpu = &vm->vcpu[i]; 705 706 if (CPU_ISSET(i, &vm->suspended_cpus) || 707 !CPU_ISSET(i, &vm->active_cpus)) { 708 continue; 709 } 710 711 vcpu_lock(vcpu); 712 VERIFY3U(vcpu->state, ==, VCPU_FROZEN); 713 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 714 vcpu_unlock(vcpu); 715 } 716 717 VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus)); 718 } 719 720 vm_cleanup(vm, false); 721 vm_init(vm, false); 722 return (0); 723 } 724 725 int 726 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 727 { 728 vm_object_t *obj; 729 730 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 731 return (ENOMEM); 732 else 733 return (0); 734 } 735 736 int 737 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 738 { 739 return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); 740 } 741 742 /* 743 * Return 'true' if 'gpa' is allocated in the guest address space. 744 * 745 * This function is called in the context of a running vcpu which acts as 746 * an implicit lock on 'vm->mem_maps[]'. 
747 */ 748 bool 749 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 750 { 751 struct mem_map *mm; 752 int i; 753 754 #ifdef INVARIANTS 755 int hostcpu, state; 756 state = vcpu_get_state(vm, vcpuid, &hostcpu); 757 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 758 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 759 #endif 760 761 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 762 mm = &vm->mem_maps[i]; 763 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 764 return (true); /* 'gpa' is sysmem or devmem */ 765 } 766 767 if (ppt_is_mmio(vm, gpa)) 768 return (true); /* 'gpa' is pci passthru mmio */ 769 770 return (false); 771 } 772 773 int 774 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 775 { 776 struct mem_seg *seg; 777 vm_object_t *obj; 778 779 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 780 return (EINVAL); 781 782 if (len == 0 || (len & PAGE_MASK)) 783 return (EINVAL); 784 785 seg = &vm->mem_segs[ident]; 786 if (seg->object != NULL) { 787 if (seg->len == len && seg->sysmem == sysmem) 788 return (EEXIST); 789 else 790 return (EINVAL); 791 } 792 793 obj = vm_object_mem_allocate(len, vm->mem_transient); 794 if (obj == NULL) 795 return (ENOMEM); 796 797 seg->len = len; 798 seg->object = obj; 799 seg->sysmem = sysmem; 800 return (0); 801 } 802 803 int 804 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 805 vm_object_t **objptr) 806 { 807 struct mem_seg *seg; 808 809 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 810 return (EINVAL); 811 812 seg = &vm->mem_segs[ident]; 813 if (len) 814 *len = seg->len; 815 if (sysmem) 816 *sysmem = seg->sysmem; 817 if (objptr) 818 *objptr = seg->object; 819 return (0); 820 } 821 822 void 823 vm_free_memseg(struct vm *vm, int ident) 824 { 825 struct mem_seg *seg; 826 827 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 828 ("%s: invalid memseg ident %d", __func__, ident)); 829 830 seg = &vm->mem_segs[ident]; 831 if (seg->object != NULL) { 832 vm_object_release(seg->object); 833 bzero(seg, sizeof (struct mem_seg)); 834 } 835 } 836 837 int 838 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 839 size_t len, int prot, int flags) 840 { 841 struct mem_seg *seg; 842 struct mem_map *m, *map; 843 vm_ooffset_t last; 844 int i, error; 845 846 if (prot == 0 || (prot & ~(PROT_ALL)) != 0) 847 return (EINVAL); 848 849 if (flags & ~VM_MEMMAP_F_WIRED) 850 return (EINVAL); 851 852 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 853 return (EINVAL); 854 855 seg = &vm->mem_segs[segid]; 856 if (seg->object == NULL) 857 return (EINVAL); 858 859 last = first + len; 860 if (first < 0 || first >= last || last > seg->len) 861 return (EINVAL); 862 863 if ((gpa | first | last) & PAGE_MASK) 864 return (EINVAL); 865 866 map = NULL; 867 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 868 m = &vm->mem_maps[i]; 869 if (m->len == 0) { 870 map = m; 871 break; 872 } 873 } 874 875 if (map == NULL) 876 return (ENOSPC); 877 878 error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); 879 if (error != 0) 880 return (EFAULT); 881 882 vm_object_reference(seg->object); 883 884 if ((flags & VM_MEMMAP_F_WIRED) != 0) { 885 error = vmspace_populate(vm->vmspace, gpa, gpa + len); 886 if (error != 0) { 887 VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len)); 888 return (EFAULT); 889 } 890 } 891 892 map->gpa = gpa; 893 map->len = len; 894 map->segoff = first; 895 map->segid = segid; 896 map->prot = prot; 897 map->flags = flags; 898 return (0); 899 } 900 901 int 902 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t 
len) 903 { 904 struct mem_map *m; 905 int i; 906 907 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 908 m = &vm->mem_maps[i]; 909 if (m->gpa == gpa && m->len == len && 910 (m->flags & VM_MEMMAP_F_IOMMU) == 0) { 911 vm_free_memmap(vm, i); 912 return (0); 913 } 914 } 915 916 return (EINVAL); 917 } 918 919 int 920 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 921 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 922 { 923 struct mem_map *mm, *mmnext; 924 int i; 925 926 mmnext = NULL; 927 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 928 mm = &vm->mem_maps[i]; 929 if (mm->len == 0 || mm->gpa < *gpa) 930 continue; 931 if (mmnext == NULL || mm->gpa < mmnext->gpa) 932 mmnext = mm; 933 } 934 935 if (mmnext != NULL) { 936 *gpa = mmnext->gpa; 937 if (segid) 938 *segid = mmnext->segid; 939 if (segoff) 940 *segoff = mmnext->segoff; 941 if (len) 942 *len = mmnext->len; 943 if (prot) 944 *prot = mmnext->prot; 945 if (flags) 946 *flags = mmnext->flags; 947 return (0); 948 } else { 949 return (ENOENT); 950 } 951 } 952 953 static void 954 vm_free_memmap(struct vm *vm, int ident) 955 { 956 struct mem_map *mm; 957 int error; 958 959 mm = &vm->mem_maps[ident]; 960 if (mm->len) { 961 error = vmspace_unmap(vm->vmspace, mm->gpa, 962 mm->gpa + mm->len); 963 KASSERT(error == 0, ("%s: vmspace_unmap error %d", 964 __func__, error)); 965 bzero(mm, sizeof (struct mem_map)); 966 } 967 } 968 969 static __inline bool 970 sysmem_mapping(struct vm *vm, struct mem_map *mm) 971 { 972 973 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 974 return (true); 975 else 976 return (false); 977 } 978 979 vm_paddr_t 980 vmm_sysmem_maxaddr(struct vm *vm) 981 { 982 struct mem_map *mm; 983 vm_paddr_t maxaddr; 984 int i; 985 986 maxaddr = 0; 987 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 988 mm = &vm->mem_maps[i]; 989 if (sysmem_mapping(vm, mm)) { 990 if (maxaddr < mm->gpa + mm->len) 991 maxaddr = mm->gpa + mm->len; 992 } 993 } 994 return (maxaddr); 995 } 996 997 static void 998 vm_iommu_modify(struct vm *vm, bool map) 999 { 1000 int i, sz; 1001 vm_paddr_t gpa, hpa; 1002 struct mem_map *mm; 1003 vm_client_t *vmc; 1004 1005 sz = PAGE_SIZE; 1006 vmc = vmspace_client_alloc(vm->vmspace); 1007 1008 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 1009 mm = &vm->mem_maps[i]; 1010 if (!sysmem_mapping(vm, mm)) 1011 continue; 1012 1013 if (map) { 1014 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 1015 ("iommu map found invalid memmap %lx/%lx/%x", 1016 mm->gpa, mm->len, mm->flags)); 1017 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 1018 continue; 1019 mm->flags |= VM_MEMMAP_F_IOMMU; 1020 } else { 1021 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 1022 continue; 1023 mm->flags &= ~VM_MEMMAP_F_IOMMU; 1024 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 1025 ("iommu unmap found invalid memmap %lx/%lx/%x", 1026 mm->gpa, mm->len, mm->flags)); 1027 } 1028 1029 gpa = mm->gpa; 1030 while (gpa < mm->gpa + mm->len) { 1031 vm_page_t *vmp; 1032 1033 vmp = vmc_hold(vmc, gpa, PROT_WRITE); 1034 ASSERT(vmp != NULL); 1035 hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); 1036 (void) vmp_release(vmp); 1037 1038 /* 1039 * When originally ported from FreeBSD, the logic for 1040 * adding memory to the guest domain would 1041 * simultaneously remove it from the host domain. The 1042 * justification for that is not clear, and FreeBSD has 1043 * subsequently changed the behavior to not remove the 1044 * memory from the host domain. 
1045 * 1046 * Leaving the guest memory in the host domain for the 1047 * life of the VM is necessary to make it available for 1048 * DMA, such as through viona in the TX path. 1049 */ 1050 if (map) { 1051 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 1052 } else { 1053 iommu_remove_mapping(vm->iommu, gpa, sz); 1054 } 1055 1056 gpa += PAGE_SIZE; 1057 } 1058 } 1059 vmc_destroy(vmc); 1060 1061 /* 1062 * Invalidate the cached translations associated with the domain 1063 * from which pages were removed. 1064 */ 1065 iommu_invalidate_tlb(vm->iommu); 1066 } 1067 1068 int 1069 vm_unassign_pptdev(struct vm *vm, int pptfd) 1070 { 1071 int error; 1072 1073 error = ppt_unassign_device(vm, pptfd); 1074 if (error) 1075 return (error); 1076 1077 if (ppt_assigned_devices(vm) == 0) 1078 vm_iommu_modify(vm, false); 1079 1080 return (0); 1081 } 1082 1083 int 1084 vm_assign_pptdev(struct vm *vm, int pptfd) 1085 { 1086 int error; 1087 vm_paddr_t maxaddr; 1088 1089 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 1090 if (ppt_assigned_devices(vm) == 0) { 1091 KASSERT(vm->iommu == NULL, 1092 ("vm_assign_pptdev: iommu must be NULL")); 1093 maxaddr = vmm_sysmem_maxaddr(vm); 1094 vm->iommu = iommu_create_domain(maxaddr); 1095 if (vm->iommu == NULL) 1096 return (ENXIO); 1097 vm_iommu_modify(vm, true); 1098 } 1099 1100 error = ppt_assign_device(vm, pptfd); 1101 return (error); 1102 } 1103 1104 int 1105 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 1106 { 1107 1108 if (vcpu < 0 || vcpu >= vm->maxcpus) 1109 return (EINVAL); 1110 1111 if (reg >= VM_REG_LAST) 1112 return (EINVAL); 1113 1114 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 1115 } 1116 1117 int 1118 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 1119 { 1120 struct vcpu *vcpu; 1121 int error; 1122 1123 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1124 return (EINVAL); 1125 1126 if (reg >= VM_REG_LAST) 1127 return (EINVAL); 1128 1129 error = VMSETREG(vm->cookie, vcpuid, reg, val); 1130 if (error || reg != VM_REG_GUEST_RIP) 1131 return (error); 1132 1133 /* Set 'nextrip' to match the value of %rip */ 1134 vcpu = &vm->vcpu[vcpuid]; 1135 vcpu->nextrip = val; 1136 return (0); 1137 } 1138 1139 static bool 1140 is_descriptor_table(int reg) 1141 { 1142 switch (reg) { 1143 case VM_REG_GUEST_IDTR: 1144 case VM_REG_GUEST_GDTR: 1145 return (true); 1146 default: 1147 return (false); 1148 } 1149 } 1150 1151 static bool 1152 is_segment_register(int reg) 1153 { 1154 switch (reg) { 1155 case VM_REG_GUEST_ES: 1156 case VM_REG_GUEST_CS: 1157 case VM_REG_GUEST_SS: 1158 case VM_REG_GUEST_DS: 1159 case VM_REG_GUEST_FS: 1160 case VM_REG_GUEST_GS: 1161 case VM_REG_GUEST_TR: 1162 case VM_REG_GUEST_LDTR: 1163 return (true); 1164 default: 1165 return (false); 1166 } 1167 } 1168 1169 int 1170 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) 1171 { 1172 1173 if (vcpu < 0 || vcpu >= vm->maxcpus) 1174 return (EINVAL); 1175 1176 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1177 return (EINVAL); 1178 1179 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 1180 } 1181 1182 int 1183 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) 1184 { 1185 if (vcpu < 0 || vcpu >= vm->maxcpus) 1186 return (EINVAL); 1187 1188 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1189 return (EINVAL); 1190 1191 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 1192 } 1193 1194 static int 1195 translate_hma_xsave_result(hma_fpu_xsave_result_t res) 1196 { 1197 switch (res) { 1198 case 
HFXR_OK: 1199 return (0); 1200 case HFXR_NO_SPACE: 1201 return (ENOSPC); 1202 case HFXR_BAD_ALIGN: 1203 case HFXR_UNSUP_FMT: 1204 case HFXR_UNSUP_FEAT: 1205 case HFXR_INVALID_DATA: 1206 return (EINVAL); 1207 default: 1208 panic("unexpected xsave result"); 1209 } 1210 } 1211 1212 int 1213 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1214 { 1215 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1216 return (EINVAL); 1217 1218 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1219 hma_fpu_xsave_result_t res; 1220 1221 res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); 1222 return (translate_hma_xsave_result(res)); 1223 } 1224 1225 int 1226 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1227 { 1228 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1229 return (EINVAL); 1230 1231 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1232 hma_fpu_xsave_result_t res; 1233 1234 res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); 1235 return (translate_hma_xsave_result(res)); 1236 } 1237 1238 int 1239 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) 1240 { 1241 struct vcpu *vcpu; 1242 1243 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1244 return (EINVAL); 1245 } 1246 1247 vcpu = &vm->vcpu[vcpuid]; 1248 1249 vcpu_lock(vcpu); 1250 *state = vcpu->run_state; 1251 *sipi_vec = vcpu->sipi_vector; 1252 vcpu_unlock(vcpu); 1253 1254 return (0); 1255 } 1256 1257 int 1258 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) 1259 { 1260 struct vcpu *vcpu; 1261 1262 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1263 return (EINVAL); 1264 } 1265 if (!VRS_IS_VALID(state)) { 1266 return (EINVAL); 1267 } 1268 1269 vcpu = &vm->vcpu[vcpuid]; 1270 1271 vcpu_lock(vcpu); 1272 vcpu->run_state = state; 1273 vcpu->sipi_vector = sipi_vec; 1274 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1275 vcpu_unlock(vcpu); 1276 1277 return (0); 1278 } 1279 1280 void 1281 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) 1282 { 1283 vmspace_t *vms = vm_get_vmspace(vm); 1284 vmspace_track_dirty(vms, gpa, len, bitmap); 1285 } 1286 1287 static void 1288 restore_guest_fpustate(struct vcpu *vcpu) 1289 { 1290 /* Save host FPU and restore guest FPU */ 1291 fpu_stop_emulating(); 1292 hma_fpu_start_guest(vcpu->guestfpu); 1293 1294 /* restore guest XCR0 if XSAVE is enabled in the host */ 1295 if (rcr4() & CR4_XSAVE) 1296 load_xcr(0, vcpu->guest_xcr0); 1297 1298 /* 1299 * The FPU is now "dirty" with the guest's state so turn on emulation 1300 * to trap any access to the FPU by the host. 1301 */ 1302 fpu_start_emulating(); 1303 } 1304 1305 static void 1306 save_guest_fpustate(struct vcpu *vcpu) 1307 { 1308 1309 if ((rcr0() & CR0_TS) == 0) 1310 panic("fpu emulation not enabled in host!"); 1311 1312 /* save guest XCR0 and restore host XCR0 */ 1313 if (rcr4() & CR4_XSAVE) { 1314 vcpu->guest_xcr0 = rxcr(0); 1315 load_xcr(0, vmm_get_host_xcr0()); 1316 } 1317 1318 /* save guest FPU and restore host FPU */ 1319 fpu_stop_emulating(); 1320 hma_fpu_stop_guest(vcpu->guestfpu); 1321 /* 1322 * When the host state has been restored, we should not re-enable 1323 * CR0.TS on illumos for eager FPU. 1324 */ 1325 } 1326 1327 static int 1328 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1329 bool from_idle) 1330 { 1331 struct vcpu *vcpu; 1332 int error; 1333 1334 vcpu = &vm->vcpu[vcpuid]; 1335 vcpu_assert_locked(vcpu); 1336 1337 /* 1338 * State transitions from the vmmdev_ioctl() must always begin from 1339 * the VCPU_IDLE state. 
This guarantees that there is only a single 1340 * ioctl() operating on a vcpu at any point. 1341 */ 1342 if (from_idle) { 1343 while (vcpu->state != VCPU_IDLE) { 1344 vcpu->reqidle = 1; 1345 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1346 cv_wait(&vcpu->state_cv, &vcpu->lock); 1347 } 1348 } else { 1349 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1350 "vcpu idle state")); 1351 } 1352 1353 if (vcpu->state == VCPU_RUNNING) { 1354 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1355 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1356 } else { 1357 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1358 "vcpu that is not running", vcpu->hostcpu)); 1359 } 1360 1361 /* 1362 * The following state transitions are allowed: 1363 * IDLE -> FROZEN -> IDLE 1364 * FROZEN -> RUNNING -> FROZEN 1365 * FROZEN -> SLEEPING -> FROZEN 1366 */ 1367 switch (vcpu->state) { 1368 case VCPU_IDLE: 1369 case VCPU_RUNNING: 1370 case VCPU_SLEEPING: 1371 error = (newstate != VCPU_FROZEN); 1372 break; 1373 case VCPU_FROZEN: 1374 error = (newstate == VCPU_FROZEN); 1375 break; 1376 default: 1377 error = 1; 1378 break; 1379 } 1380 1381 if (error) 1382 return (EBUSY); 1383 1384 vcpu->state = newstate; 1385 if (newstate == VCPU_RUNNING) 1386 vcpu->hostcpu = curcpu; 1387 else 1388 vcpu->hostcpu = NOCPU; 1389 1390 if (newstate == VCPU_IDLE) { 1391 cv_broadcast(&vcpu->state_cv); 1392 } 1393 1394 return (0); 1395 } 1396 1397 static void 1398 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1399 { 1400 int error; 1401 1402 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1403 panic("Error %d setting state to %d\n", error, newstate); 1404 } 1405 1406 static void 1407 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1408 { 1409 int error; 1410 1411 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1412 panic("Error %d setting state to %d", error, newstate); 1413 } 1414 1415 /* 1416 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1417 */ 1418 static int 1419 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) 1420 { 1421 struct vcpu *vcpu; 1422 int vcpu_halted, vm_halted; 1423 bool userspace_exit = false; 1424 1425 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1426 1427 vcpu = &vm->vcpu[vcpuid]; 1428 vcpu_halted = 0; 1429 vm_halted = 0; 1430 1431 vcpu_lock(vcpu); 1432 while (1) { 1433 /* 1434 * Do a final check for pending interrupts (including NMI and 1435 * INIT) before putting this thread to sleep. 1436 */ 1437 if (vm_nmi_pending(vm, vcpuid)) 1438 break; 1439 if (vcpu_run_state_pending(vm, vcpuid)) 1440 break; 1441 if (!intr_disabled) { 1442 if (vm_extint_pending(vm, vcpuid) || 1443 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1444 break; 1445 } 1446 } 1447 1448 /* 1449 * Also check for software events which would cause a wake-up. 1450 * This will set the appropriate exitcode directly, rather than 1451 * requiring a trip through VM_RUN(). 1452 */ 1453 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1454 userspace_exit = true; 1455 break; 1456 } 1457 1458 /* 1459 * Some Linux guests implement "halt" by having all vcpus 1460 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1461 * track of the vcpus that have entered this state. When all 1462 * vcpus enter the halted state the virtual machine is halted. 
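		 * (The 'halt_detection_enabled' tunable above can disable this
		 * detection entirely.)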
1463 */ 1464 if (intr_disabled) { 1465 if (!vcpu_halted && halt_detection_enabled) { 1466 vcpu_halted = 1; 1467 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1468 } 1469 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1470 vm_halted = 1; 1471 break; 1472 } 1473 } 1474 1475 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1476 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1477 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1478 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1479 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1480 } 1481 1482 if (vcpu_halted) 1483 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1484 1485 vcpu_unlock(vcpu); 1486 1487 if (vm_halted) { 1488 (void) vm_suspend(vm, VM_SUSPEND_HALT); 1489 } 1490 1491 return (userspace_exit ? -1 : 0); 1492 } 1493 1494 static int 1495 vm_handle_paging(struct vm *vm, int vcpuid) 1496 { 1497 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1498 vm_client_t *vmc = vcpu->vmclient; 1499 struct vm_exit *vme = &vcpu->exitinfo; 1500 int rv, ftype; 1501 1502 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1503 __func__, vme->inst_length)); 1504 1505 ftype = vme->u.paging.fault_type; 1506 KASSERT(ftype == PROT_READ || 1507 ftype == PROT_WRITE || ftype == PROT_EXEC, 1508 ("vm_handle_paging: invalid fault_type %d", ftype)); 1509 1510 rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); 1511 1512 if (rv != 0) 1513 return (EFAULT); 1514 return (0); 1515 } 1516 1517 int 1518 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, 1519 int rsize) 1520 { 1521 int err = ESRCH; 1522 1523 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1524 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1525 1526 err = vlapic_mmio_read(vlapic, gpa, rval, rsize); 1527 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1528 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); 1529 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1530 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); 1531 } 1532 1533 return (err); 1534 } 1535 1536 int 1537 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, 1538 int wsize) 1539 { 1540 int err = ESRCH; 1541 1542 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1543 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1544 1545 err = vlapic_mmio_write(vlapic, gpa, wval, wsize); 1546 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1547 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); 1548 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1549 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); 1550 } 1551 1552 return (err); 1553 } 1554 1555 static int 1556 vm_handle_mmio_emul(struct vm *vm, int vcpuid) 1557 { 1558 struct vie *vie; 1559 struct vcpu *vcpu; 1560 struct vm_exit *vme; 1561 uint64_t inst_addr; 1562 int error, fault, cs_d; 1563 1564 vcpu = &vm->vcpu[vcpuid]; 1565 vme = &vcpu->exitinfo; 1566 vie = vcpu->vie_ctx; 1567 1568 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1569 __func__, vme->inst_length)); 1570 1571 inst_addr = vme->rip + vme->u.mmio_emul.cs_base; 1572 cs_d = vme->u.mmio_emul.cs_d; 1573 1574 /* Fetch the faulting instruction */ 1575 if (vie_needs_fetch(vie)) { 1576 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, 1577 &fault); 1578 if (error != 0) { 1579 return (error); 1580 } else if (fault) { 1581 /* 1582 * If a fault during instruction fetch was encountered, 1583 * it will have asserted that the appropriate exception 1584 * be 
injected at next entry. 1585 * No further work is required. 1586 */ 1587 return (0); 1588 } 1589 } 1590 1591 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1592 /* Dump (unrecognized) instruction bytes in userspace */ 1593 vie_fallback_exitinfo(vie, vme); 1594 return (-1); 1595 } 1596 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && 1597 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { 1598 /* Decoded GLA does not match GLA from VM exit state */ 1599 vie_fallback_exitinfo(vie, vme); 1600 return (-1); 1601 } 1602 1603 repeat: 1604 error = vie_emulate_mmio(vie, vm, vcpuid); 1605 if (error < 0) { 1606 /* 1607 * MMIO not handled by any of the in-kernel-emulated devices, so 1608 * make a trip out to userspace for it. 1609 */ 1610 vie_exitinfo(vie, vme); 1611 } else if (error == EAGAIN) { 1612 /* 1613 * Continue emulating the rep-prefixed instruction, which has 1614 * not completed its iterations. 1615 * 1616 * In case this can be emulated in-kernel and has a high 1617 * repetition count (causing a tight spin), it should be 1618 * deferential to yield conditions. 1619 */ 1620 if (!vcpu_should_yield(vm, vcpuid)) { 1621 goto repeat; 1622 } else { 1623 /* 1624 * Defer to the contending load by making a trip to 1625 * userspace with a no-op (BOGUS) exit reason. 1626 */ 1627 vie_reset(vie); 1628 vme->exitcode = VM_EXITCODE_BOGUS; 1629 return (-1); 1630 } 1631 } else if (error == 0) { 1632 /* Update %rip now that instruction has been emulated */ 1633 vie_advance_pc(vie, &vcpu->nextrip); 1634 } 1635 return (error); 1636 } 1637 1638 static int 1639 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) 1640 { 1641 struct vcpu *vcpu; 1642 struct vie *vie; 1643 int err; 1644 1645 vcpu = &vm->vcpu[vcpuid]; 1646 vie = vcpu->vie_ctx; 1647 1648 repeat: 1649 err = vie_emulate_inout(vie, vm, vcpuid); 1650 1651 if (err < 0) { 1652 /* 1653 * In/out not handled by any of the in-kernel-emulated devices, 1654 * so make a trip out to userspace for it. 1655 */ 1656 vie_exitinfo(vie, vme); 1657 return (err); 1658 } else if (err == EAGAIN) { 1659 /* 1660 * Continue emulating the rep-prefixed ins/outs, which has not 1661 * completed its iterations. 1662 * 1663 * In case this can be emulated in-kernel and has a high 1664 * repetition count (causing a tight spin), it should be 1665 * deferential to yield conditions. 1666 */ 1667 if (!vcpu_should_yield(vm, vcpuid)) { 1668 goto repeat; 1669 } else { 1670 /* 1671 * Defer to the contending load by making a trip to 1672 * userspace with a no-op (BOGUS) exit reason. 1673 */ 1674 vie_reset(vie); 1675 vme->exitcode = VM_EXITCODE_BOGUS; 1676 return (-1); 1677 } 1678 } else if (err != 0) { 1679 /* Emulation failure. Bail all the way out to userspace. 
		 */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry.  No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
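	 * As with the other exit handlers, returning -1 here bails out to
	 * userspace rather than re-entering the guest.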
1786 */ 1787 for (i = 0; i < vm->maxcpus; i++) { 1788 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1789 vcpu_notify_event(vm, i); 1790 } 1791 } 1792 1793 return (-1); 1794 } 1795 1796 static int 1797 vm_handle_reqidle(struct vm *vm, int vcpuid) 1798 { 1799 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1800 1801 vcpu_lock(vcpu); 1802 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1803 vcpu->reqidle = 0; 1804 vcpu_unlock(vcpu); 1805 return (-1); 1806 } 1807 1808 static int 1809 vm_handle_run_state(struct vm *vm, int vcpuid) 1810 { 1811 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1812 bool handled = false; 1813 1814 vcpu_lock(vcpu); 1815 while (1) { 1816 if ((vcpu->run_state & VRS_PEND_INIT) != 0) { 1817 vcpu_unlock(vcpu); 1818 VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); 1819 vcpu_lock(vcpu); 1820 1821 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); 1822 vcpu->run_state |= VRS_INIT; 1823 } 1824 1825 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == 1826 (VRS_INIT | VRS_PEND_SIPI)) { 1827 const uint8_t vector = vcpu->sipi_vector; 1828 1829 vcpu_unlock(vcpu); 1830 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); 1831 vcpu_lock(vcpu); 1832 1833 vcpu->run_state &= ~VRS_PEND_SIPI; 1834 vcpu->run_state |= VRS_RUN; 1835 } 1836 1837 /* 1838 * If the vCPU is now in the running state, there is no need to 1839 * wait for anything prior to re-entry. 1840 */ 1841 if ((vcpu->run_state & VRS_RUN) != 0) { 1842 handled = true; 1843 break; 1844 } 1845 1846 /* 1847 * Also check for software events which would cause a wake-up. 1848 * This will set the appropriate exitcode directly, rather than 1849 * requiring a trip through VM_RUN(). 1850 */ 1851 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1852 break; 1853 } 1854 1855 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1856 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1857 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1858 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1859 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1860 } 1861 vcpu_unlock(vcpu); 1862 1863 return (handled ? 0 : -1); 1864 } 1865 1866 static int 1867 vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val) 1868 { 1869 switch (num) { 1870 case MSR_MTRRcap: 1871 *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; 1872 break; 1873 case MSR_MTRRdefType: 1874 *val = mtrr->def_type; 1875 break; 1876 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1877 *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; 1878 break; 1879 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1880 *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; 1881 break; 1882 case MSR_MTRR64kBase: 1883 *val = mtrr->fixed64k; 1884 break; 1885 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1886 uint_t offset = num - MSR_MTRRVarBase; 1887 if (offset % 2 == 0) { 1888 *val = mtrr->var[offset / 2].base; 1889 } else { 1890 *val = mtrr->var[offset / 2].mask; 1891 } 1892 break; 1893 } 1894 default: 1895 return (-1); 1896 } 1897 1898 return (0); 1899 } 1900 1901 static int 1902 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val) 1903 { 1904 switch (num) { 1905 case MSR_MTRRcap: 1906 /* MTRRCAP is read only */ 1907 return (-1); 1908 case MSR_MTRRdefType: 1909 if (val & ~VMM_MTRR_DEF_MASK) { 1910 /* generate #GP on writes to reserved fields */ 1911 return (-1); 1912 } 1913 mtrr->def_type = val; 1914 break; 1915 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1916 mtrr->fixed4k[num - MSR_MTRR4kBase] = val; 1917 break; 1918 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 1919 mtrr->fixed16k[num - MSR_MTRR16kBase] = val; 1920 break; 1921 case MSR_MTRR64kBase: 1922 mtrr->fixed64k = val; 1923 break; 1924 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1925 uint_t offset = num - MSR_MTRRVarBase; 1926 if (offset % 2 == 0) { 1927 if (val & ~VMM_MTRR_PHYSBASE_MASK) { 1928 /* generate #GP on writes to reserved fields */ 1929 return (-1); 1930 } 1931 mtrr->var[offset / 2].base = val; 1932 } else { 1933 if (val & ~VMM_MTRR_PHYSMASK_MASK) { 1934 /* generate #GP on writes to reserved fields */ 1935 return (-1); 1936 } 1937 mtrr->var[offset / 2].mask = val; 1938 } 1939 break; 1940 } 1941 default: 1942 return (-1); 1943 } 1944 1945 return (0); 1946 } 1947 1948 static int 1949 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1950 { 1951 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1952 const uint32_t code = vme->u.msr.code; 1953 uint64_t val = 0; 1954 1955 switch (code) { 1956 case MSR_MCG_CAP: 1957 case MSR_MCG_STATUS: 1958 val = 0; 1959 break; 1960 1961 case MSR_MTRRcap: 1962 case MSR_MTRRdefType: 1963 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1964 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1965 case MSR_MTRR64kBase: 1966 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 1967 if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0) 1968 vm_inject_gp(vm, vcpuid); 1969 break; 1970 1971 case MSR_TSC: 1972 /* 1973 * In all likelihood, this should always be handled in guest 1974 * context by VMX/SVM rather than taking an exit. (Both VMX and 1975 * SVM pass through read-only access to MSR_TSC to the guest.) 1976 * 1977 * No physical offset is requested of vcpu_tsc_offset() since 1978 * rdtsc_offset() takes care of that instead. 1979 */ 1980 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); 1981 break; 1982 1983 default: 1984 /* 1985 * Anything not handled at this point will be kicked out to 1986 * userspace for attempted processing there. 1987 */ 1988 return (-1); 1989 } 1990 1991 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 1992 val & 0xffffffff)); 1993 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 1994 val >> 32)); 1995 return (0); 1996 } 1997 1998 static int 1999 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 2000 { 2001 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2002 const uint32_t code = vme->u.msr.code; 2003 const uint64_t val = vme->u.msr.wval; 2004 2005 switch (code) { 2006 case MSR_MCG_CAP: 2007 case MSR_MCG_STATUS: 2008 /* Ignore writes */ 2009 break; 2010 2011 case MSR_MTRRcap: 2012 case MSR_MTRRdefType: 2013 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 2014 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 2015 case MSR_MTRR64kBase: 2016 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 2017 if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0) 2018 vm_inject_gp(vm, vcpuid); 2019 break; 2020 2021 case MSR_TSC: 2022 /* 2023 * The effect of writing the TSC MSR is that a subsequent read 2024 * of the TSC would report that value written (plus any time 2025 * elapsed between the write and the read). The guest TSC value 2026 * is calculated from a global offset for the guest (which 2027 * effectively makes its TSC read 0 at guest boot) and a 2028 * per-vCPU offset to handle these writes to the MSR. 
2029 * 2030 * To calculate that per-vCPU offset, we can work backwards from 2031 * the guest value at the time of write: 2032 * 2033 * value = host TSC + VM boot offset + vCPU offset 2034 * 2035 * so therefore: 2036 * 2037 * value - host TSC - VM boot offset = vCPU offset 2038 */ 2039 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 2040 break; 2041 2042 default: 2043 /* 2044 * Anything not handled at this point will be kicked out to 2045 * userspace for attempted processing there. 2046 */ 2047 return (-1); 2048 } 2049 2050 return (0); 2051 } 2052 2053 int 2054 vm_suspend(struct vm *vm, enum vm_suspend_how how) 2055 { 2056 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 2057 return (EINVAL); 2058 2059 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 2060 return (EALREADY); 2061 } 2062 2063 /* 2064 * Notify all active vcpus that they are now suspended. 2065 */ 2066 for (uint_t i = 0; i < vm->maxcpus; i++) { 2067 struct vcpu *vcpu = &vm->vcpu[i]; 2068 2069 vcpu_lock(vcpu); 2070 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 2071 /* 2072 * Any vCPUs not actively running or in HLT can be 2073 * marked as suspended immediately. 2074 */ 2075 if (CPU_ISSET(i, &vm->active_cpus)) { 2076 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 2077 } 2078 } else { 2079 /* 2080 * Those which are running or in HLT will pick up the 2081 * suspended state after notification. 2082 */ 2083 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2084 } 2085 vcpu_unlock(vcpu); 2086 } 2087 return (0); 2088 } 2089 2090 void 2091 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2092 { 2093 struct vm_exit *vmexit; 2094 2095 vmexit = vm_exitinfo(vm, vcpuid); 2096 vmexit->rip = rip; 2097 vmexit->inst_length = 0; 2098 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2099 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2100 } 2101 2102 /* 2103 * Some vmm resources, such as the lapic, may have CPU-specific resources 2104 * allocated to them which would benefit from migration onto the host CPU which 2105 * is processing the vcpu state. 2106 */ 2107 static void 2108 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2109 { 2110 /* 2111 * Localizing cyclic resources requires acquisition of cpu_lock, and 2112 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2113 */ 2114 VERIFY(curthread->t_preempt == 0); 2115 2116 /* 2117 * Do not bother with localization if this vCPU is about to return to 2118 * the host CPU it was last localized to. 2119 */ 2120 if (vcpu->lastloccpu == curcpu) 2121 return; 2122 2123 /* 2124 * Localize system-wide resources to the primary boot vCPU. While any 2125 * of the other vCPUs may access them, it keeps the potential interrupt 2126 * footprint constrained to CPUs involved with this instance. 2127 */ 2128 if (vcpu == &vm->vcpu[0]) { 2129 vhpet_localize_resources(vm->vhpet); 2130 vrtc_localize_resources(vm->vrtc); 2131 vatpit_localize_resources(vm->vatpit); 2132 } 2133 2134 vlapic_localize_resources(vcpu->vlapic); 2135 2136 vcpu->lastloccpu = curcpu; 2137 } 2138 2139 static void 2140 vmm_savectx(void *arg) 2141 { 2142 vm_thread_ctx_t *vtc = arg; 2143 struct vm *vm = vtc->vtc_vm; 2144 const int vcpuid = vtc->vtc_vcpuid; 2145 2146 if (ops->vmsavectx != NULL) { 2147 ops->vmsavectx(vm->cookie, vcpuid); 2148 } 2149 2150 /* 2151 * Account for going off-cpu, unless the vCPU is idled, where being 2152 * off-cpu is the explicit point. 
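	 * The outgoing ustate is stashed in the thread context so that
	 * vmm_restorectx() can reinstate it when the thread comes back on-cpu.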
2153 */ 2154 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2155 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2156 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2157 } 2158 2159 /* 2160 * If the CPU holds the restored guest FPU state, save it and restore 2161 * the host FPU state before this thread goes off-cpu. 2162 */ 2163 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2164 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2165 2166 save_guest_fpustate(vcpu); 2167 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2168 } 2169 } 2170 2171 static void 2172 vmm_restorectx(void *arg) 2173 { 2174 vm_thread_ctx_t *vtc = arg; 2175 struct vm *vm = vtc->vtc_vm; 2176 const int vcpuid = vtc->vtc_vcpuid; 2177 2178 /* Complete microstate accounting for vCPU being off-cpu */ 2179 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2180 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2181 } 2182 2183 /* 2184 * When coming back on-cpu, only restore the guest FPU status if the 2185 * thread is in a context marked as requiring it. This should be rare, 2186 * occurring only when a future logic error results in a voluntary 2187 * sleep during the VMRUN critical section. 2188 * 2189 * The common case will result in elision of the guest FPU state 2190 * restoration, deferring that action until it is clearly necessary 2191 * during vm_run. 2192 */ 2193 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2194 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2195 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2196 2197 restore_guest_fpustate(vcpu); 2198 vtc->vtc_status |= VTCS_FPU_RESTORED; 2199 } 2200 2201 if (ops->vmrestorectx != NULL) { 2202 ops->vmrestorectx(vm->cookie, vcpuid); 2203 } 2204 2205 } 2206 2207 static int 2208 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2209 struct vm_exit *vme) 2210 { 2211 struct vcpu *vcpu; 2212 struct vie *vie; 2213 int err; 2214 2215 vcpu = &vm->vcpu[vcpuid]; 2216 vie = vcpu->vie_ctx; 2217 err = 0; 2218 2219 switch (entry->cmd) { 2220 case VEC_DEFAULT: 2221 return (0); 2222 case VEC_DISCARD_INSTR: 2223 vie_reset(vie); 2224 return (0); 2225 case VEC_FULFILL_MMIO: 2226 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2227 if (err == 0) { 2228 err = vie_emulate_mmio(vie, vm, vcpuid); 2229 if (err == 0) { 2230 vie_advance_pc(vie, &vcpu->nextrip); 2231 } else if (err < 0) { 2232 vie_exitinfo(vie, vme); 2233 } else if (err == EAGAIN) { 2234 /* 2235 * Clear the instruction emulation state in 2236 * order to re-enter VM context and continue 2237 * this 'rep <instruction>' 2238 */ 2239 vie_reset(vie); 2240 err = 0; 2241 } 2242 } 2243 break; 2244 case VEC_FULFILL_INOUT: 2245 err = vie_fulfill_inout(vie, &entry->u.inout); 2246 if (err == 0) { 2247 err = vie_emulate_inout(vie, vm, vcpuid); 2248 if (err == 0) { 2249 vie_advance_pc(vie, &vcpu->nextrip); 2250 } else if (err < 0) { 2251 vie_exitinfo(vie, vme); 2252 } else if (err == EAGAIN) { 2253 /* 2254 * Clear the instruction emulation state in 2255 * order to re-enter VM context and continue 2256 * this 'rep ins/outs' 2257 */ 2258 vie_reset(vie); 2259 err = 0; 2260 } 2261 } 2262 break; 2263 default: 2264 return (EINVAL); 2265 } 2266 return (err); 2267 } 2268 2269 static int 2270 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2271 { 2272 struct vie *vie; 2273 2274 vie = vm->vcpu[vcpuid].vie_ctx; 2275 2276 if (vie_pending(vie)) { 2277 /* 2278 * Userspace has not fulfilled the pending needs of the 2279 * instruction emulation, so bail back out. 
2280 */ 2281 vie_exitinfo(vie, vme); 2282 return (-1); 2283 } 2284 2285 return (0); 2286 } 2287 2288 int 2289 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2290 { 2291 int error; 2292 struct vcpu *vcpu; 2293 struct vm_exit *vme; 2294 bool intr_disabled; 2295 int affinity_type = CPU_CURRENT; 2296 2297 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2298 return (EINVAL); 2299 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2300 return (EINVAL); 2301 2302 vcpu = &vm->vcpu[vcpuid]; 2303 vme = &vcpu->exitinfo; 2304 2305 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2306 2307 vcpu->vtc.vtc_status = 0; 2308 ctxop_attach(curthread, vcpu->ctxop); 2309 2310 error = vm_entry_actions(vm, vcpuid, entry, vme); 2311 if (error != 0) { 2312 goto exit; 2313 } 2314 2315 restart: 2316 error = vm_loop_checks(vm, vcpuid, vme); 2317 if (error != 0) { 2318 goto exit; 2319 } 2320 2321 thread_affinity_set(curthread, affinity_type); 2322 /* 2323 * Resource localization should happen after the CPU affinity for the 2324 * thread has been set to ensure that access from restricted contexts, 2325 * such as VMX-accelerated APIC operations, can occur without inducing 2326 * cyclic cross-calls. 2327 * 2328 * This must be done prior to disabling kpreempt via critical_enter(). 2329 */ 2330 vm_localize_resources(vm, vcpu); 2331 affinity_type = CPU_CURRENT; 2332 critical_enter(); 2333 2334 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2335 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2336 2337 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2338 restore_guest_fpustate(vcpu); 2339 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2340 } 2341 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2342 2343 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2344 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2345 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2346 2347 /* 2348 * Once clear of the delicate contexts comprising the VM_RUN handler, 2349 * thread CPU affinity can be loosened while other processing occurs. 
2350 */ 2351 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2352 thread_affinity_clear(curthread); 2353 critical_exit(); 2354 2355 if (error != 0) { 2356 /* Communicate out any error from VMRUN() above */ 2357 goto exit; 2358 } 2359 2360 vcpu->nextrip = vme->rip + vme->inst_length; 2361 switch (vme->exitcode) { 2362 case VM_EXITCODE_REQIDLE: 2363 error = vm_handle_reqidle(vm, vcpuid); 2364 break; 2365 case VM_EXITCODE_RUN_STATE: 2366 error = vm_handle_run_state(vm, vcpuid); 2367 break; 2368 case VM_EXITCODE_SUSPENDED: 2369 error = vm_handle_suspend(vm, vcpuid); 2370 break; 2371 case VM_EXITCODE_IOAPIC_EOI: 2372 vioapic_process_eoi(vm, vcpuid, 2373 vme->u.ioapic_eoi.vector); 2374 break; 2375 case VM_EXITCODE_HLT: 2376 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2377 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2378 break; 2379 case VM_EXITCODE_PAGING: 2380 error = vm_handle_paging(vm, vcpuid); 2381 break; 2382 case VM_EXITCODE_MMIO_EMUL: 2383 error = vm_handle_mmio_emul(vm, vcpuid); 2384 break; 2385 case VM_EXITCODE_INOUT: 2386 error = vm_handle_inout(vm, vcpuid, vme); 2387 break; 2388 case VM_EXITCODE_INST_EMUL: 2389 error = vm_handle_inst_emul(vm, vcpuid); 2390 break; 2391 case VM_EXITCODE_MONITOR: 2392 case VM_EXITCODE_MWAIT: 2393 case VM_EXITCODE_VMINSN: 2394 vm_inject_ud(vm, vcpuid); 2395 break; 2396 case VM_EXITCODE_RDMSR: 2397 error = vm_handle_rdmsr(vm, vcpuid, vme); 2398 break; 2399 case VM_EXITCODE_WRMSR: 2400 error = vm_handle_wrmsr(vm, vcpuid, vme); 2401 break; 2402 case VM_EXITCODE_HT: 2403 affinity_type = CPU_BEST; 2404 break; 2405 case VM_EXITCODE_MTRAP: 2406 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2407 error = -1; 2408 break; 2409 default: 2410 /* handled in userland */ 2411 error = -1; 2412 break; 2413 } 2414 2415 if (error == 0) { 2416 /* VM exit conditions handled in-kernel, continue running */ 2417 goto restart; 2418 } 2419 2420 exit: 2421 kpreempt_disable(); 2422 ctxop_detach(curthread, vcpu->ctxop); 2423 /* Make sure all of the needed vCPU context state is saved */ 2424 vmm_savectx(&vcpu->vtc); 2425 kpreempt_enable(); 2426 2427 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2428 return (error); 2429 } 2430 2431 int 2432 vm_restart_instruction(void *arg, int vcpuid) 2433 { 2434 struct vm *vm; 2435 struct vcpu *vcpu; 2436 enum vcpu_state state; 2437 uint64_t rip; 2438 int error; 2439 2440 vm = arg; 2441 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2442 return (EINVAL); 2443 2444 vcpu = &vm->vcpu[vcpuid]; 2445 state = vcpu_get_state(vm, vcpuid, NULL); 2446 if (state == VCPU_RUNNING) { 2447 /* 2448 * When a vcpu is "running" the next instruction is determined 2449 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 2450 * Thus setting 'inst_length' to zero will cause the current 2451 * instruction to be restarted. 2452 */ 2453 vcpu->exitinfo.inst_length = 0; 2454 } else if (state == VCPU_FROZEN) { 2455 /* 2456 * When a vcpu is "frozen" it is outside the critical section 2457 * around VMRUN() and 'nextrip' points to the next instruction. 2458 * Thus instruction restart is achieved by setting 'nextrip' 2459 * to the vcpu's %rip. 
2460 */ 2461 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2462 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2463 vcpu->nextrip = rip; 2464 } else { 2465 panic("%s: invalid state %d", __func__, state); 2466 } 2467 return (0); 2468 } 2469 2470 int 2471 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2472 { 2473 struct vcpu *vcpu; 2474 2475 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2476 return (EINVAL); 2477 2478 vcpu = &vm->vcpu[vcpuid]; 2479 2480 if (VM_INTINFO_PENDING(info)) { 2481 const uint32_t type = VM_INTINFO_TYPE(info); 2482 const uint8_t vector = VM_INTINFO_VECTOR(info); 2483 2484 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2485 return (EINVAL); 2486 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2487 return (EINVAL); 2488 if (info & VM_INTINFO_MASK_RSVD) 2489 return (EINVAL); 2490 } else { 2491 info = 0; 2492 } 2493 vcpu->exit_intinfo = info; 2494 return (0); 2495 } 2496 2497 enum exc_class { 2498 EXC_BENIGN, 2499 EXC_CONTRIBUTORY, 2500 EXC_PAGEFAULT 2501 }; 2502 2503 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2504 2505 static enum exc_class 2506 exception_class(uint64_t info) 2507 { 2508 ASSERT(VM_INTINFO_PENDING(info)); 2509 2510 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2511 switch (VM_INTINFO_TYPE(info)) { 2512 case VM_INTINFO_HWINTR: 2513 case VM_INTINFO_SWINTR: 2514 case VM_INTINFO_NMI: 2515 return (EXC_BENIGN); 2516 default: 2517 /* 2518 * Hardware exception. 2519 * 2520 * SVM and VT-x use identical type values to represent NMI, 2521 * hardware interrupt and software interrupt. 2522 * 2523 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2524 * for exceptions except #BP and #OF. #BP and #OF use a type 2525 * value of '5' or '6'. Therefore we don't check for explicit 2526 * values of 'type' to classify 'intinfo' into a hardware 2527 * exception. 2528 */ 2529 break; 2530 } 2531 2532 switch (VM_INTINFO_VECTOR(info)) { 2533 case IDT_PF: 2534 case IDT_VE: 2535 return (EXC_PAGEFAULT); 2536 case IDT_DE: 2537 case IDT_TS: 2538 case IDT_NP: 2539 case IDT_SS: 2540 case IDT_GP: 2541 return (EXC_CONTRIBUTORY); 2542 default: 2543 return (EXC_BENIGN); 2544 } 2545 } 2546 2547 /* 2548 * Fetch event pending injection into the guest, if one exists. 2549 * 2550 * Returns true if an event is to be injected (which is placed in `retinfo`). 2551 */ 2552 bool 2553 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2554 { 2555 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2556 const uint64_t info1 = vcpu->exit_intinfo; 2557 vcpu->exit_intinfo = 0; 2558 const uint64_t info2 = vcpu->exc_pending; 2559 vcpu->exc_pending = 0; 2560 2561 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2562 /* 2563 * If an exception occurs while attempting to call the 2564 * double-fault handler the processor enters shutdown mode 2565 * (aka triple fault). 2566 */ 2567 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2568 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2569 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2570 *retinfo = 0; 2571 return (false); 2572 } 2573 /* 2574 * "Conditions for Generating a Double Fault" 2575 * Intel SDM, Vol3, Table 6-5 2576 */ 2577 const enum exc_class exc1 = exception_class(info1); 2578 const enum exc_class exc2 = exception_class(info2); 2579 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2580 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2581 /* Convert nested fault into a double fault. 
*/ 2582 *retinfo = 2583 VM_INTINFO_VALID | 2584 VM_INTINFO_DEL_ERRCODE | 2585 VM_INTINFO_HWEXCP | 2586 IDT_DF; 2587 } else { 2588 /* Handle exceptions serially */ 2589 vcpu->exit_intinfo = info1; 2590 *retinfo = info2; 2591 } 2592 return (true); 2593 } else if (VM_INTINFO_PENDING(info1)) { 2594 *retinfo = info1; 2595 return (true); 2596 } else if (VM_INTINFO_PENDING(info2)) { 2597 *retinfo = info2; 2598 return (true); 2599 } 2600 2601 return (false); 2602 } 2603 2604 int 2605 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2606 { 2607 struct vcpu *vcpu; 2608 2609 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2610 return (EINVAL); 2611 2612 vcpu = &vm->vcpu[vcpuid]; 2613 *info1 = vcpu->exit_intinfo; 2614 *info2 = vcpu->exc_pending; 2615 return (0); 2616 } 2617 2618 int 2619 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2620 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2621 { 2622 struct vcpu *vcpu; 2623 uint64_t regval; 2624 int error; 2625 2626 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2627 return (EINVAL); 2628 2629 if (vector >= 32) 2630 return (EINVAL); 2631 2632 /* 2633 * NMIs are to be injected via their own specialized path using 2634 * vm_inject_nmi(). 2635 */ 2636 if (vector == IDT_NMI) { 2637 return (EINVAL); 2638 } 2639 2640 /* 2641 * A double fault exception should never be injected directly into 2642 * the guest. It is a derived exception that results from specific 2643 * combinations of nested faults. 2644 */ 2645 if (vector == IDT_DF) { 2646 return (EINVAL); 2647 } 2648 2649 vcpu = &vm->vcpu[vcpuid]; 2650 2651 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2652 /* Unable to inject exception due to one already pending */ 2653 return (EBUSY); 2654 } 2655 2656 if (errcode_valid) { 2657 /* 2658 * Exceptions don't deliver an error code in real mode. 2659 */ 2660 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); 2661 VERIFY0(error); 2662 if ((regval & CR0_PE) == 0) { 2663 errcode_valid = false; 2664 } 2665 } 2666 2667 /* 2668 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2669 * 2670 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2671 * one instruction or incurs an exception. 
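	 *
	 * This is why the interrupt shadow is cleared unconditionally below,
	 * before the exception is queued in 'exc_pending': delivery of the
	 * exception ends any STI/MOV-SS blocking the guest had in effect.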
2672 */ 2673 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2674 VERIFY0(error); 2675 2676 if (restart_instruction) { 2677 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2678 } 2679 2680 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2681 if (errcode_valid) { 2682 val |= VM_INTINFO_DEL_ERRCODE; 2683 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2684 } 2685 vcpu->exc_pending = val; 2686 return (0); 2687 } 2688 2689 void 2690 vm_inject_ud(struct vm *vm, int vcpuid) 2691 { 2692 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2693 } 2694 2695 void 2696 vm_inject_gp(struct vm *vm, int vcpuid) 2697 { 2698 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2699 } 2700 2701 void 2702 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2703 { 2704 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2705 } 2706 2707 void 2708 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2709 { 2710 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2711 } 2712 2713 void 2714 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2715 { 2716 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2717 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2718 } 2719 2720 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2721 2722 int 2723 vm_inject_nmi(struct vm *vm, int vcpuid) 2724 { 2725 struct vcpu *vcpu; 2726 2727 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2728 return (EINVAL); 2729 2730 vcpu = &vm->vcpu[vcpuid]; 2731 2732 vcpu->nmi_pending = true; 2733 vcpu_notify_event(vm, vcpuid); 2734 return (0); 2735 } 2736 2737 bool 2738 vm_nmi_pending(struct vm *vm, int vcpuid) 2739 { 2740 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2741 2742 return (vcpu->nmi_pending); 2743 } 2744 2745 void 2746 vm_nmi_clear(struct vm *vm, int vcpuid) 2747 { 2748 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2749 2750 ASSERT(vcpu->nmi_pending); 2751 2752 vcpu->nmi_pending = false; 2753 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2754 } 2755 2756 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2757 2758 int 2759 vm_inject_extint(struct vm *vm, int vcpuid) 2760 { 2761 struct vcpu *vcpu; 2762 2763 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2764 return (EINVAL); 2765 2766 vcpu = &vm->vcpu[vcpuid]; 2767 2768 vcpu->extint_pending = true; 2769 vcpu_notify_event(vm, vcpuid); 2770 return (0); 2771 } 2772 2773 bool 2774 vm_extint_pending(struct vm *vm, int vcpuid) 2775 { 2776 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2777 2778 return (vcpu->extint_pending); 2779 } 2780 2781 void 2782 vm_extint_clear(struct vm *vm, int vcpuid) 2783 { 2784 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2785 2786 ASSERT(vcpu->extint_pending); 2787 2788 vcpu->extint_pending = false; 2789 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2790 } 2791 2792 int 2793 vm_inject_init(struct vm *vm, int vcpuid) 2794 { 2795 struct vcpu *vcpu; 2796 2797 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2798 return (EINVAL); 2799 2800 vcpu = &vm->vcpu[vcpuid]; 2801 vcpu_lock(vcpu); 2802 vcpu->run_state |= VRS_PEND_INIT; 2803 /* 2804 * As part of queuing the INIT request, clear any pending SIPI. It 2805 * would not otherwise survive across the reset of the vCPU when it 2806 * undergoes the requested INIT. We would not want it to linger when it 2807 * could be mistaken as a subsequent (after the INIT) SIPI request. 
2808 */ 2809 vcpu->run_state &= ~VRS_PEND_SIPI; 2810 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2811 2812 vcpu_unlock(vcpu); 2813 return (0); 2814 } 2815 2816 int 2817 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2818 { 2819 struct vcpu *vcpu; 2820 2821 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2822 return (EINVAL); 2823 2824 vcpu = &vm->vcpu[vcpuid]; 2825 vcpu_lock(vcpu); 2826 vcpu->run_state |= VRS_PEND_SIPI; 2827 vcpu->sipi_vector = vector; 2828 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2829 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2830 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2831 } 2832 vcpu_unlock(vcpu); 2833 return (0); 2834 } 2835 2836 bool 2837 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2838 { 2839 struct vcpu *vcpu; 2840 2841 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2842 vcpu = &vm->vcpu[vcpuid]; 2843 2844 /* Of interest: vCPU not in running state or with pending INIT */ 2845 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2846 } 2847 2848 int 2849 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2850 { 2851 struct seg_desc desc; 2852 const enum vm_reg_name clear_regs[] = { 2853 VM_REG_GUEST_CR2, 2854 VM_REG_GUEST_CR3, 2855 VM_REG_GUEST_CR4, 2856 VM_REG_GUEST_RAX, 2857 VM_REG_GUEST_RBX, 2858 VM_REG_GUEST_RCX, 2859 VM_REG_GUEST_RSI, 2860 VM_REG_GUEST_RDI, 2861 VM_REG_GUEST_RBP, 2862 VM_REG_GUEST_RSP, 2863 VM_REG_GUEST_R8, 2864 VM_REG_GUEST_R9, 2865 VM_REG_GUEST_R10, 2866 VM_REG_GUEST_R11, 2867 VM_REG_GUEST_R12, 2868 VM_REG_GUEST_R13, 2869 VM_REG_GUEST_R14, 2870 VM_REG_GUEST_R15, 2871 VM_REG_GUEST_DR0, 2872 VM_REG_GUEST_DR1, 2873 VM_REG_GUEST_DR2, 2874 VM_REG_GUEST_DR3, 2875 VM_REG_GUEST_EFER, 2876 }; 2877 const enum vm_reg_name data_segs[] = { 2878 VM_REG_GUEST_SS, 2879 VM_REG_GUEST_DS, 2880 VM_REG_GUEST_ES, 2881 VM_REG_GUEST_FS, 2882 VM_REG_GUEST_GS, 2883 }; 2884 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2885 2886 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2887 return (EINVAL); 2888 2889 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2890 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2891 } 2892 2893 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2894 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2896 2897 /* 2898 * The prescribed contents of %rdx differ slightly between the Intel and 2899 * AMD architectural definitions. The former expects the Extended Model 2900 * in bits 16-19 where the latter expects all the Family, Model, and 2901 * Stepping be there. Common boot ROMs appear to disregard this 2902 * anyways, so we stick with a compromise value similar to what is 2903 * spelled out in the Intel SDM. 
2904 */ 2905 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2906 2907 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2908 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2909 2910 /* CS: Present, R/W, Accessed */ 2911 desc.access = 0x0093; 2912 desc.base = 0xffff0000; 2913 desc.limit = 0xffff; 2914 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2915 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2916 2917 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2918 desc.access = 0x0093; 2919 desc.base = 0; 2920 desc.limit = 0xffff; 2921 for (uint_t i = 0; i < nitems(data_segs); i++) { 2922 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2923 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2924 } 2925 2926 /* GDTR, IDTR */ 2927 desc.base = 0; 2928 desc.limit = 0xffff; 2929 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2930 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2931 2932 /* LDTR: Present, LDT */ 2933 desc.access = 0x0082; 2934 desc.base = 0; 2935 desc.limit = 0xffff; 2936 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2937 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2938 2939 /* TR: Present, 32-bit TSS */ 2940 desc.access = 0x008b; 2941 desc.base = 0; 2942 desc.limit = 0xffff; 2943 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2944 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2945 2946 vlapic_reset(vm_lapic(vm, vcpuid)); 2947 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2949 2950 vcpu->exit_intinfo = 0; 2951 vcpu->exc_pending = 0; 2952 vcpu->nmi_pending = false; 2953 vcpu->extint_pending = 0; 2954 2955 /* 2956 * A CPU reset caused by power-on or system reset clears more state than 2957 * one which is trigged from an INIT IPI. 
2958 */ 2959 if (!init_only) { 2960 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2961 (void) hma_fpu_init(vcpu->guestfpu); 2962 2963 /* XXX: clear MSRs and other pieces */ 2964 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 2965 } 2966 2967 return (0); 2968 } 2969 2970 static int 2971 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2972 { 2973 struct seg_desc desc; 2974 2975 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2976 return (EINVAL); 2977 2978 /* CS: Present, R/W, Accessed */ 2979 desc.access = 0x0093; 2980 desc.base = (uint64_t)vector << 12; 2981 desc.limit = 0xffff; 2982 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2983 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 2984 (uint64_t)vector << 8)); 2985 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 2987 2988 return (0); 2989 } 2990 2991 int 2992 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 2993 { 2994 if (vcpu < 0 || vcpu >= vm->maxcpus) 2995 return (EINVAL); 2996 2997 if (type < 0 || type >= VM_CAP_MAX) 2998 return (EINVAL); 2999 3000 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3001 } 3002 3003 int 3004 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3005 { 3006 if (vcpu < 0 || vcpu >= vm->maxcpus) 3007 return (EINVAL); 3008 3009 if (type < 0 || type >= VM_CAP_MAX) 3010 return (EINVAL); 3011 3012 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3013 } 3014 3015 struct vlapic * 3016 vm_lapic(struct vm *vm, int cpu) 3017 { 3018 ASSERT3S(cpu, >=, 0); 3019 ASSERT3S(cpu, <, VM_MAXCPU); 3020 3021 return (vm->vcpu[cpu].vlapic); 3022 } 3023 3024 struct vioapic * 3025 vm_ioapic(struct vm *vm) 3026 { 3027 3028 return (vm->vioapic); 3029 } 3030 3031 struct vhpet * 3032 vm_hpet(struct vm *vm) 3033 { 3034 3035 return (vm->vhpet); 3036 } 3037 3038 void * 3039 vm_iommu_domain(struct vm *vm) 3040 { 3041 3042 return (vm->iommu); 3043 } 3044 3045 int 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3047 bool from_idle) 3048 { 3049 int error; 3050 struct vcpu *vcpu; 3051 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3054 3055 vcpu = &vm->vcpu[vcpuid]; 3056 3057 vcpu_lock(vcpu); 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3059 vcpu_unlock(vcpu); 3060 3061 return (error); 3062 } 3063 3064 enum vcpu_state 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3066 { 3067 struct vcpu *vcpu; 3068 enum vcpu_state state; 3069 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3072 3073 vcpu = &vm->vcpu[vcpuid]; 3074 3075 vcpu_lock(vcpu); 3076 state = vcpu->state; 3077 if (hostcpu != NULL) 3078 *hostcpu = vcpu->hostcpu; 3079 vcpu_unlock(vcpu); 3080 3081 return (state); 3082 } 3083 3084 uint64_t 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3086 { 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3088 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3090 3091 if (phys_adj) { 3092 /* Include any offset for the current physical CPU too */ 3093 extern hrtime_t tsc_gethrtime_tick_delta(void); 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3095 } 3096 3097 return (vcpu_off); 3098 } 3099 3100 /* Normalize hrtime against the boot time for a VM */ 3101 hrtime_t 3102 vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3103 { 3104 /* To avoid underflow/overflow UB, perform math as unsigned */ 3105 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3106 } 3107 3108 /* 
Denormalize hrtime against the boot time for a VM */ 3109 hrtime_t 3110 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3111 { 3112 /* To avoid underflow/overflow UB, perform math as unsigned */ 3113 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3114 } 3115 3116 int 3117 vm_activate_cpu(struct vm *vm, int vcpuid) 3118 { 3119 3120 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3121 return (EINVAL); 3122 3123 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3124 return (EBUSY); 3125 3126 if (vm->suspend != 0) { 3127 return (EBUSY); 3128 } 3129 3130 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3131 3132 /* 3133 * It is possible that this vCPU was undergoing activation at the same 3134 * time that the VM was being suspended. If that happens to be the 3135 * case, it should reflect the suspended state immediately. 3136 */ 3137 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3138 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3139 } 3140 3141 return (0); 3142 } 3143 3144 int 3145 vm_suspend_cpu(struct vm *vm, int vcpuid) 3146 { 3147 int i; 3148 3149 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3150 return (EINVAL); 3151 3152 if (vcpuid == -1) { 3153 vm->debug_cpus = vm->active_cpus; 3154 for (i = 0; i < vm->maxcpus; i++) { 3155 if (CPU_ISSET(i, &vm->active_cpus)) 3156 vcpu_notify_event(vm, i); 3157 } 3158 } else { 3159 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3160 return (EINVAL); 3161 3162 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3163 vcpu_notify_event(vm, vcpuid); 3164 } 3165 return (0); 3166 } 3167 3168 int 3169 vm_resume_cpu(struct vm *vm, int vcpuid) 3170 { 3171 3172 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3173 return (EINVAL); 3174 3175 if (vcpuid == -1) { 3176 CPU_ZERO(&vm->debug_cpus); 3177 } else { 3178 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3179 return (EINVAL); 3180 3181 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3182 } 3183 return (0); 3184 } 3185 3186 static bool 3187 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3188 uint64_t entry_rip) 3189 { 3190 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3191 struct vm_exit *vme = &vcpu->exitinfo; 3192 bool bail = false; 3193 3194 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3195 3196 if (vm->suspend) { 3197 if (on_entry) { 3198 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3199 vm->suspend < VM_SUSPEND_LAST); 3200 3201 vme->exitcode = VM_EXITCODE_SUSPENDED; 3202 vme->u.suspended.how = vm->suspend; 3203 } else { 3204 /* 3205 * Handling VM suspend is complicated, so if that 3206 * condition is detected outside of VM-entry itself, 3207 * just emit a BOGUS exitcode so we take a lap to pick 3208 * up the event during an entry and are directed into 3209 * the vm_handle_suspend() logic. 3210 */ 3211 vme->exitcode = VM_EXITCODE_BOGUS; 3212 } 3213 bail = true; 3214 } 3215 if (vcpu->reqidle) { 3216 vme->exitcode = VM_EXITCODE_REQIDLE; 3217 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3218 3219 if (!on_entry) { 3220 /* 3221 * A reqidle request detected outside of VM-entry can be 3222 * handled directly by clearing the request (and taking 3223 * a lap to userspace). 
3224 */ 3225 vcpu_assert_locked(vcpu); 3226 vcpu->reqidle = 0; 3227 } 3228 bail = true; 3229 } 3230 if (vcpu_should_yield(vm, vcpuid)) { 3231 vme->exitcode = VM_EXITCODE_BOGUS; 3232 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3233 bail = true; 3234 } 3235 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3236 vme->exitcode = VM_EXITCODE_DEBUG; 3237 bail = true; 3238 } 3239 3240 if (bail) { 3241 if (on_entry) { 3242 /* 3243 * If bailing out during VM-entry, the current %rip must 3244 * be recorded in the exitinfo. 3245 */ 3246 vme->rip = entry_rip; 3247 } 3248 vme->inst_length = 0; 3249 } 3250 return (bail); 3251 } 3252 3253 static bool 3254 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3255 { 3256 /* 3257 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or 3258 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3259 * structure, and we would only modify the exitcode. 3260 */ 3261 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3262 } 3263 3264 bool 3265 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3266 { 3267 /* 3268 * Bail-out checks done as part of VM entry require an updated %rip to 3269 * populate the vm_exit struct if any of the conditions of interest are 3270 * matched in the check. 3271 */ 3272 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3273 } 3274 3275 cpuset_t 3276 vm_active_cpus(struct vm *vm) 3277 { 3278 3279 return (vm->active_cpus); 3280 } 3281 3282 cpuset_t 3283 vm_debug_cpus(struct vm *vm) 3284 { 3285 3286 return (vm->debug_cpus); 3287 } 3288 3289 cpuset_t 3290 vm_suspended_cpus(struct vm *vm) 3291 { 3292 3293 return (vm->suspended_cpus); 3294 } 3295 3296 void * 3297 vcpu_stats(struct vm *vm, int vcpuid) 3298 { 3299 3300 return (vm->vcpu[vcpuid].stats); 3301 } 3302 3303 int 3304 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3305 { 3306 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3307 return (EINVAL); 3308 3309 *state = vm->vcpu[vcpuid].x2apic_state; 3310 3311 return (0); 3312 } 3313 3314 int 3315 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3316 { 3317 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3318 return (EINVAL); 3319 3320 if (state >= X2APIC_STATE_LAST) 3321 return (EINVAL); 3322 3323 vm->vcpu[vcpuid].x2apic_state = state; 3324 3325 vlapic_set_x2apic_state(vm, vcpuid, state); 3326 3327 return (0); 3328 } 3329 3330 /* 3331 * This function is called to ensure that a vcpu "sees" a pending event 3332 * as soon as possible: 3333 * - If the vcpu thread is sleeping then it is woken up. 3334 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3335 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3336 */ 3337 static void 3338 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3339 { 3340 int hostcpu; 3341 3342 ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); 3343 3344 hostcpu = vcpu->hostcpu; 3345 if (vcpu->state == VCPU_RUNNING) { 3346 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3347 if (hostcpu != curcpu) { 3348 if (ntype == VCPU_NOTIFY_APIC) { 3349 vlapic_post_intr(vcpu->vlapic, hostcpu); 3350 } else { 3351 poke_cpu(hostcpu); 3352 } 3353 } else { 3354 /* 3355 * If the 'vcpu' is running on 'curcpu' then it must 3356 * be sending a notification to itself (e.g. SELF_IPI). 3357 * The pending event will be picked up when the vcpu 3358 * transitions back to guest context. 
3359 */ 3360 } 3361 } else { 3362 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3363 "with hostcpu %d", vcpu->state, hostcpu)); 3364 if (vcpu->state == VCPU_SLEEPING) { 3365 cv_signal(&vcpu->vcpu_cv); 3366 } 3367 } 3368 } 3369 3370 void 3371 vcpu_notify_event(struct vm *vm, int vcpuid) 3372 { 3373 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3374 3375 vcpu_lock(vcpu); 3376 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3377 vcpu_unlock(vcpu); 3378 } 3379 3380 void 3381 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3382 { 3383 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3384 3385 if (ntype == VCPU_NOTIFY_NONE) { 3386 return; 3387 } 3388 3389 vcpu_lock(vcpu); 3390 vcpu_notify_event_locked(vcpu, ntype); 3391 vcpu_unlock(vcpu); 3392 } 3393 3394 void 3395 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3396 { 3397 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3398 hrtime_t now = gethrtime(); 3399 3400 ASSERT3U(ustate, !=, vcpu->ustate); 3401 ASSERT3S(ustate, <, VU_MAX); 3402 ASSERT3S(ustate, >=, VU_INIT); 3403 3404 hrtime_t delta = now - vcpu->ustate_when; 3405 vcpu->ustate_total[vcpu->ustate] += delta; 3406 3407 membar_producer(); 3408 3409 vcpu->ustate_when = now; 3410 vcpu->ustate = ustate; 3411 } 3412 3413 struct vmspace * 3414 vm_get_vmspace(struct vm *vm) 3415 { 3416 3417 return (vm->vmspace); 3418 } 3419 3420 struct vm_client * 3421 vm_get_vmclient(struct vm *vm, int vcpuid) 3422 { 3423 return (vm->vcpu[vcpuid].vmclient); 3424 } 3425 3426 int 3427 vm_apicid2vcpuid(struct vm *vm, int apicid) 3428 { 3429 /* 3430 * XXX apic id is assumed to be numerically identical to vcpu id 3431 */ 3432 return (apicid); 3433 } 3434 3435 struct vatpic * 3436 vm_atpic(struct vm *vm) 3437 { 3438 return (vm->vatpic); 3439 } 3440 3441 struct vatpit * 3442 vm_atpit(struct vm *vm) 3443 { 3444 return (vm->vatpit); 3445 } 3446 3447 struct vpmtmr * 3448 vm_pmtmr(struct vm *vm) 3449 { 3450 3451 return (vm->vpmtmr); 3452 } 3453 3454 struct vrtc * 3455 vm_rtc(struct vm *vm) 3456 { 3457 3458 return (vm->vrtc); 3459 } 3460 3461 enum vm_reg_name 3462 vm_segment_name(int seg) 3463 { 3464 static enum vm_reg_name seg_names[] = { 3465 VM_REG_GUEST_ES, 3466 VM_REG_GUEST_CS, 3467 VM_REG_GUEST_SS, 3468 VM_REG_GUEST_DS, 3469 VM_REG_GUEST_FS, 3470 VM_REG_GUEST_GS 3471 }; 3472 3473 KASSERT(seg >= 0 && seg < nitems(seg_names), 3474 ("%s: invalid segment encoding %d", __func__, seg)); 3475 return (seg_names[seg]); 3476 } 3477 3478 void 3479 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3480 uint_t num_copyinfo) 3481 { 3482 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3483 if (copyinfo[idx].cookie != NULL) { 3484 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3485 } 3486 } 3487 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3488 } 3489 3490 int 3491 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3492 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3493 uint_t num_copyinfo, int *fault) 3494 { 3495 uint_t idx, nused; 3496 size_t n, off, remaining; 3497 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3498 3499 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3500 3501 nused = 0; 3502 remaining = len; 3503 while (remaining > 0) { 3504 uint64_t gpa; 3505 int error; 3506 3507 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3508 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3509 if (error || *fault) 3510 return (error); 3511 off = gpa & PAGEOFFSET; 3512 n 
= min(remaining, PAGESIZE - off); 3513 copyinfo[nused].gpa = gpa; 3514 copyinfo[nused].len = n; 3515 remaining -= n; 3516 gla += n; 3517 nused++; 3518 } 3519 3520 for (idx = 0; idx < nused; idx++) { 3521 vm_page_t *vmp; 3522 caddr_t hva; 3523 3524 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3525 if (vmp == NULL) { 3526 break; 3527 } 3528 if ((prot & PROT_WRITE) != 0) { 3529 hva = (caddr_t)vmp_get_writable(vmp); 3530 } else { 3531 hva = (caddr_t)vmp_get_readable(vmp); 3532 } 3533 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3534 copyinfo[idx].cookie = vmp; 3535 copyinfo[idx].prot = prot; 3536 } 3537 3538 if (idx != nused) { 3539 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3540 return (EFAULT); 3541 } else { 3542 *fault = 0; 3543 return (0); 3544 } 3545 } 3546 3547 void 3548 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3549 size_t len) 3550 { 3551 char *dst; 3552 int idx; 3553 3554 dst = kaddr; 3555 idx = 0; 3556 while (len > 0) { 3557 ASSERT(copyinfo[idx].prot & PROT_READ); 3558 3559 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3560 len -= copyinfo[idx].len; 3561 dst += copyinfo[idx].len; 3562 idx++; 3563 } 3564 } 3565 3566 void 3567 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3568 struct vm_copyinfo *copyinfo, size_t len) 3569 { 3570 const char *src; 3571 int idx; 3572 3573 src = kaddr; 3574 idx = 0; 3575 while (len > 0) { 3576 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3577 3578 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3579 len -= copyinfo[idx].len; 3580 src += copyinfo[idx].len; 3581 idx++; 3582 } 3583 } 3584 3585 /* 3586 * Return the amount of in-use and wired memory for the VM. Since 3587 * these are global stats, only return the values with for vCPU 0 3588 */ 3589 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3590 3591 static void 3592 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3593 { 3594 if (vcpu == 0) { 3595 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3596 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3597 } 3598 } 3599 3600 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3601 3602 int 3603 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3604 uint8_t bytes, uint32_t *val) 3605 { 3606 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3607 } 3608 3609 /* 3610 * bhyve-internal interfaces to attach or detach IO port handlers. 3611 * Must be called with VM write lock held for safety. 3612 */ 3613 int 3614 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3615 void **cookie) 3616 { 3617 int err; 3618 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3619 if (err == 0) { 3620 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3621 } 3622 return (err); 3623 } 3624 int 3625 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3626 void **old_arg) 3627 { 3628 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3629 int err; 3630 3631 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3632 if (err == 0) { 3633 *cookie = NULL; 3634 } 3635 return (err); 3636 } 3637 3638 /* 3639 * External driver interfaces to attach or detach IO port handlers. 3640 * Must be called with VM write lock held for safety. 
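 *
 * Illustrative only (not part of this module): a client driver would wrap a
 * device's register window roughly as follows, where MYDEV_PORT, mydev_ioport
 * (an ioport_handler_t) and 'sc' are hypothetical names used just for the
 * example:
 *
 *	void *cookie;
 *	if (vm_ioport_hook(vm, MYDEV_PORT, mydev_ioport, sc, &cookie) != 0)
 *		return (error);
 *	...
 *	vm_ioport_unhook(vm, &cookie);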
3641 */ 3642 int 3643 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3644 void *arg, void **cookie) 3645 { 3646 int err; 3647 3648 if (port == 0) { 3649 return (EINVAL); 3650 } 3651 3652 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3653 if (err == 0) { 3654 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3655 } 3656 return (err); 3657 } 3658 void 3659 vm_ioport_unhook(struct vm *vm, void **cookie) 3660 { 3661 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3662 ioport_handler_t old_func; 3663 void *old_arg; 3664 int err; 3665 3666 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3667 3668 /* ioport-hook-using drivers are expected to be well-behaved */ 3669 VERIFY0(err); 3670 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3671 3672 *cookie = NULL; 3673 } 3674 3675 int 3676 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3677 { 3678 struct vm *vm = ksp->ks_private; 3679 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3680 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3681 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3682 3683 ASSERT3U(vcpuid, <, VM_MAXCPU); 3684 3685 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3686 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3687 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3688 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3689 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3690 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3691 3692 return (0); 3693 } 3694 3695 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3696 3697 static inline bool 3698 vmm_data_is_cpu_specific(uint16_t data_class) 3699 { 3700 switch (data_class) { 3701 case VDC_REGISTER: 3702 case VDC_MSR: 3703 case VDC_FPU: 3704 case VDC_LAPIC: 3705 case VDC_VMM_ARCH: 3706 return (true); 3707 default: 3708 return (false); 3709 } 3710 } 3711 3712 static const vmm_data_version_entry_t * 3713 vmm_data_find(const vmm_data_req_t *req, int *err) 3714 { 3715 const vmm_data_version_entry_t **vdpp, *vdp; 3716 SET_FOREACH(vdpp, vmm_data_version_entries) { 3717 vdp = *vdpp; 3718 if (vdp->vdve_class == req->vdr_class && 3719 vdp->vdve_version == req->vdr_version) { 3720 /* 3721 * Enforce any data length expectation expressed by the 3722 * provider for this data. 3723 */ 3724 if (vdp->vdve_len_expect != 0 && 3725 vdp->vdve_len_expect != req->vdr_len) { 3726 *err = ENOSPC; 3727 return (NULL); 3728 } 3729 return (vdp); 3730 } 3731 } 3732 *err = EINVAL; 3733 return (NULL); 3734 } 3735 3736 static void * 3737 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3738 { 3739 switch (req->vdr_class) { 3740 /* per-cpu data/devices */ 3741 case VDC_LAPIC: 3742 return (vm_lapic(vm, vcpuid)); 3743 3744 case VDC_FPU: 3745 case VDC_REGISTER: 3746 case VDC_VMM_ARCH: 3747 case VDC_MSR: 3748 /* 3749 * These have per-CPU handling which is dispatched outside 3750 * vmm_data_version_entries listing. 
3751 */ 3752 return (NULL); 3753 3754 /* system-wide data/devices */ 3755 case VDC_IOAPIC: 3756 return (vm->vioapic); 3757 case VDC_ATPIT: 3758 return (vm->vatpit); 3759 case VDC_ATPIC: 3760 return (vm->vatpic); 3761 case VDC_HPET: 3762 return (vm->vhpet); 3763 case VDC_PM_TIMER: 3764 return (vm->vpmtmr); 3765 case VDC_RTC: 3766 return (vm->vrtc); 3767 3768 default: 3769 /* The data class will have been validated by now */ 3770 panic("Unexpected class %u", req->vdr_class); 3771 } 3772 } 3773 3774 int 3775 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3776 { 3777 int err = 0; 3778 3779 if (vmm_data_is_cpu_specific(req->vdr_class)) { 3780 if (vcpuid >= VM_MAXCPU) { 3781 return (EINVAL); 3782 } 3783 } 3784 3785 const vmm_data_version_entry_t *entry; 3786 entry = vmm_data_find(req, &err); 3787 if (entry == NULL) { 3788 ASSERT(err != 0); 3789 return (err); 3790 } 3791 3792 void *datap = vmm_data_from_class(req, vm, vcpuid); 3793 if (datap != NULL) { 3794 err = entry->vdve_readf(datap, req); 3795 } else { 3796 switch (req->vdr_class) { 3797 case VDC_FPU: 3798 /* TODO: wire up to xsave export via hma_fpu iface */ 3799 err = EINVAL; 3800 break; 3801 case VDC_REGISTER: 3802 case VDC_VMM_ARCH: 3803 case VDC_MSR: 3804 /* TODO: implement */ 3805 err = EINVAL; 3806 break; 3807 default: 3808 err = EINVAL; 3809 break; 3810 } 3811 } 3812 3813 return (err); 3814 } 3815 3816 int 3817 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3818 { 3819 int err = 0; 3820 3821 if (vmm_data_is_cpu_specific(req->vdr_class)) { 3822 if (vcpuid >= VM_MAXCPU) { 3823 return (EINVAL); 3824 } 3825 } 3826 3827 const vmm_data_version_entry_t *entry; 3828 entry = vmm_data_find(req, &err); 3829 if (entry == NULL) { 3830 ASSERT(err != 0); 3831 return (err); 3832 } 3833 3834 void *datap = vmm_data_from_class(req, vm, vcpuid); 3835 if (datap != NULL) { 3836 err = entry->vdve_writef(datap, req); 3837 } else { 3838 switch (req->vdr_class) { 3839 case VDC_FPU: 3840 /* TODO: wire up to xsave import via hma_fpu iface */ 3841 err = EINVAL; 3842 break; 3843 case VDC_REGISTER: 3844 case VDC_VMM_ARCH: 3845 case VDC_MSR: 3846 /* TODO: implement */ 3847 err = EINVAL; 3848 break; 3849 default: 3850 err = EINVAL; 3851 break; 3852 } 3853 } 3854 3855 return (err); 3856 } 3857