/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};
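
/*
 * Note (added for clarity): the fixed-range arrays above follow the x86
 * fixed MTRR layout.  fixed64k is one MSR covering 0x00000-0x7FFFF in
 * 64 KiB ranges, fixed16k[2] covers 0x80000-0xBFFFF in 16 KiB ranges, and
 * fixed4k[8] covers 0xC0000-0xFFFFF in 4 KiB ranges, with eight one-byte
 * memory types packed into each 64-bit MSR.
 */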

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
	uint64_t	exc_pending;	/* (i) exception pending */
	bool		nmi_pending;	/* (i) NMI pending */
	bool		extint_pending;	/* (i) INTR pending */

	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */

	struct ioport_config ioports;		/* (o) ioport handling */

	bool		mem_transient;		/* (o) alloc transient memory */
};

static int vmm_initialized;

static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init		= (vmm_init_func_t)nullop_panic,
	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
	.resume		= (vmm_resume_func_t)nullop_panic,
	.vminit		= (vmi_init_func_t)nullop_panic,
	.vmrun		= (vmi_run_func_t)nullop_panic,
	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg	= (vmi_get_register_t)nullop_panic,
	.vmsetreg	= (vmi_set_register_t)nullop_panic,
	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx	= (vmi_savectx)nullop_panic,
	.vmrestorectx	= (vmi_restorectx)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)		((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= vmm_savectx,
	.ct_restore	= vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{
	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{
	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has the necessary
 * hardware and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for
	 * negative offsets.  A reading of the TSC is negated to form the
	 * boot offset.
	 */
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
}
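
/*
 * Illustrative example (not from the original source): if the host TSC
 * reads 1000 at vm_init(), then boot_tsc_offset = (uint64_t)-1000, i.e.
 * 2^64 - 1000.  A guest TSC computed as host_tsc + boot_tsc_offset wraps
 * modulo 2^64, so a later host reading of 1500 yields a guest TSC of 500:
 * the guest observes a counter that started near 0 at boot.
 */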

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}
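
/*
 * Example (added for illustration): with a VM_MAXCPU of, say, 32, a
 * caller could request sockets=1, cores=4, threads=2 via
 * vm_set_topology(); the product (8) fits within maxcpus, so the call
 * succeeds.  Requesting sockets=2, cores=32, threads=2 (128) would fail
 * with EINVAL.
 */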

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed.  This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted.  Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}
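
/*
 * Usage sketch (illustrative, not part of the original source): guest RAM
 * is typically established by allocating a segment and then mapping it at
 * a guest-physical address, e.g.:
 *
 *	(void) vm_alloc_memseg(vm, 0, 1024 * 1024 * 1024, true);
 *	(void) vm_mmap_memseg(vm, 0, 0, 0, 1024 * 1024 * 1024,
 *	    PROT_ALL, 0);
 *
 * which backs the GPA range [0, 1 GiB) with segment 0 at offset 0.
 */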

int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}
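
/*
 * Iteration sketch (illustrative): vm_mmap_getnext() returns the lowest
 * mapping at or above *gpa, so a caller can walk all mappings in address
 * order by advancing the query point past each result:
 *
 *	vm_paddr_t gpa = 0;
 *	size_t len;
 *	while (vm_mmap_getnext(vm, &gpa, NULL, NULL, &len, NULL,
 *	    NULL) == 0) {
 *		... inspect the mapping at [gpa, gpa + len) ...
 *		gpa += len;
 *	}
 */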

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{
	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			/*
			 * When originally ported from FreeBSD, the logic for
			 * adding memory to the guest domain would
			 * simultaneously remove it from the host domain.  The
			 * justification for that is not clear, and FreeBSD has
			 * subsequently changed the behavior to not remove the
			 * memory from the host domain.
			 *
			 * Leaving the guest memory in the host domain for the
			 * life of the VM is necessary to make it available for
			 * DMA, such as through viona in the TX path.
			 */
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	struct vcpu *vcpu;
	int error;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	error = VMSETREG(vm->cookie, vcpuid, reg, val);
	if (error || reg != VM_REG_GUEST_RIP)
		return (error);

	/* Set 'nextrip' to match the value of %rip */
	vcpu = &vm->vcpu[vcpuid];
	vcpu->nextrip = val;
	return (0);
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
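
/*
 * Usage sketch (illustrative): the register and descriptor accessors
 * above are the common path for ioctl-driven state inspection, e.g.
 * fetching a vCPU's %rip and writing it back:
 *
 *	uint64_t rip;
 *	if (vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip) == 0)
 *		(void) vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, rip);
 *
 * Note that writing VM_REG_GUEST_RIP also refreshes 'nextrip'.
 */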

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{
	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state.  This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}
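
/*
 * Example sequence (illustrative): a VM_RUN ioctl moves the target vCPU
 * IDLE -> FROZEN on entry, FROZEN -> RUNNING for guest execution,
 * RUNNING -> FROZEN on VM exit, and FROZEN -> IDLE on return to
 * userspace.  A HLT-induced sleep instead takes FROZEN -> SLEEPING ->
 * FROZEN within that window.
 */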

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled.  'halted_cpus' keeps
		 * track of the vcpus that have entered this state.  When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	int rv, ftype;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == PROT_READ ||
	    ftype == PROT_WRITE || ftype == PROT_EXEC,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);

	if (rv != 0)
		return (EFAULT);
	return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}
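
/*
 * For reference (added note): the in-kernel MMIO dispatch above is purely
 * range-based.  On x86 these device windows conventionally sit at
 * DEFAULT_APIC_BASE (0xFEE00000, one page), VIOAPIC_BASE (0xFEC00000),
 * and VHPET_BASE (0xFED00000); a GPA outside the registered ranges yields
 * ESRCH so the access can be punted to userspace emulation.
 */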

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;

	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.  No further work is
			 * required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which has not
		 * completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure.  Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry.  No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}
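
/*
 * Decode example (illustrative): the variable-range MTRR MSRs interleave
 * base and mask registers, so for MSR number MSR_MTRRVarBase + 3 the
 * offset is 3, which is odd: the access targets var[1].mask (i.e.
 * IA32_MTRR_PHYSMASK1).  An even offset of 2 would target var[1].base.
 * vm_wrmtrr() below uses the same even/odd split when accepting writes.
 */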

static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit.  (Both VMX
		 * and SVM pass through read-only access to MSR_TSC to the
		 * guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read).  The guest TSC
		 * value is calculated from a global offset for the guest
		 * (which effectively makes its TSC read 0 at guest boot) and
		 * a per-vCPU offset to handle these writes to the MSR.
		 *
		 * To calculate that per-vCPU offset, we can work backwards
		 * from the guest value at the time of write:
		 *
		 *	value = host TSC + VM boot offset + vCPU offset
		 *
		 * so therefore:
		 *
		 *	value - host TSC - VM boot offset = vCPU offset
		 */
		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
		return (EALREADY);
	}

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		vcpu_lock(vcpu);
		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
			/*
			 * Any vCPUs not actively running or in HLT can be
			 * marked as suspended immediately.
			 */
			if (CPU_ISSET(i, &vm->active_cpus)) {
				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			}
		} else {
			/*
			 * Those which are running or in HLT will pick up the
			 * suspended state after notification.
			 */
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
		}
		vcpu_unlock(vcpu);
	}
	return (0);
}

void
vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
}

/*
 * Some vmm resources, such as the lapic, may have CPU-specific resources
 * allocated to them which would benefit from migration onto the host CPU
 * which is processing the vcpu state.
 */
static void
vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
{
	/*
	 * Localizing cyclic resources requires acquisition of cpu_lock, and
	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
	 */
	VERIFY(curthread->t_preempt == 0);

	/*
	 * Do not bother with localization if this vCPU is about to return to
	 * the host CPU it was last localized to.
	 */
	if (vcpu->lastloccpu == curcpu)
		return;

	/*
	 * Localize system-wide resources to the primary boot vCPU.  While any
	 * of the other vCPUs may access them, it keeps the potential interrupt
	 * footprint constrained to CPUs involved with this instance.
	 */
	if (vcpu == &vm->vcpu[0]) {
		vhpet_localize_resources(vm->vhpet);
		vrtc_localize_resources(vm->vrtc);
		vatpit_localize_resources(vm->vatpit);
	}

	vlapic_localize_resources(vcpu->vlapic);

	vcpu->lastloccpu = curcpu;
}

static void
vmm_savectx(void *arg)
{
	vm_thread_ctx_t *vtc = arg;
	struct vm *vm = vtc->vtc_vm;
	const int vcpuid = vtc->vtc_vcpuid;

	if (ops->vmsavectx != NULL) {
		ops->vmsavectx(vm->cookie, vcpuid);
	}

	/*
	 * Account for going off-cpu, unless the vCPU is idled, where being
	 * off-cpu is the explicit point.
2145 */ 2146 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2147 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2148 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2149 } 2150 2151 /* 2152 * If the CPU holds the restored guest FPU state, save it and restore 2153 * the host FPU state before this thread goes off-cpu. 2154 */ 2155 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2156 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2157 2158 save_guest_fpustate(vcpu); 2159 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2160 } 2161 } 2162 2163 static void 2164 vmm_restorectx(void *arg) 2165 { 2166 vm_thread_ctx_t *vtc = arg; 2167 struct vm *vm = vtc->vtc_vm; 2168 const int vcpuid = vtc->vtc_vcpuid; 2169 2170 /* Complete microstate accounting for vCPU being off-cpu */ 2171 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2172 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2173 } 2174 2175 /* 2176 * When coming back on-cpu, only restore the guest FPU status if the 2177 * thread is in a context marked as requiring it. This should be rare, 2178 * occurring only when a future logic error results in a voluntary 2179 * sleep during the VMRUN critical section. 2180 * 2181 * The common case will result in elision of the guest FPU state 2182 * restoration, deferring that action until it is clearly necessary 2183 * during vm_run. 2184 */ 2185 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2186 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2187 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2188 2189 restore_guest_fpustate(vcpu); 2190 vtc->vtc_status |= VTCS_FPU_RESTORED; 2191 } 2192 2193 if (ops->vmrestorectx != NULL) { 2194 ops->vmrestorectx(vm->cookie, vcpuid); 2195 } 2196 2197 } 2198 2199 static int 2200 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2201 struct vm_exit *vme) 2202 { 2203 struct vcpu *vcpu; 2204 struct vie *vie; 2205 int err; 2206 2207 vcpu = &vm->vcpu[vcpuid]; 2208 vie = vcpu->vie_ctx; 2209 err = 0; 2210 2211 switch (entry->cmd) { 2212 case VEC_DEFAULT: 2213 return (0); 2214 case VEC_DISCARD_INSTR: 2215 vie_reset(vie); 2216 return (0); 2217 case VEC_FULFILL_MMIO: 2218 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2219 if (err == 0) { 2220 err = vie_emulate_mmio(vie, vm, vcpuid); 2221 if (err == 0) { 2222 vie_advance_pc(vie, &vcpu->nextrip); 2223 } else if (err < 0) { 2224 vie_exitinfo(vie, vme); 2225 } else if (err == EAGAIN) { 2226 /* 2227 * Clear the instruction emulation state in 2228 * order to re-enter VM context and continue 2229 * this 'rep <instruction>' 2230 */ 2231 vie_reset(vie); 2232 err = 0; 2233 } 2234 } 2235 break; 2236 case VEC_FULFILL_INOUT: 2237 err = vie_fulfill_inout(vie, &entry->u.inout); 2238 if (err == 0) { 2239 err = vie_emulate_inout(vie, vm, vcpuid); 2240 if (err == 0) { 2241 vie_advance_pc(vie, &vcpu->nextrip); 2242 } else if (err < 0) { 2243 vie_exitinfo(vie, vme); 2244 } else if (err == EAGAIN) { 2245 /* 2246 * Clear the instruction emulation state in 2247 * order to re-enter VM context and continue 2248 * this 'rep ins/outs' 2249 */ 2250 vie_reset(vie); 2251 err = 0; 2252 } 2253 } 2254 break; 2255 default: 2256 return (EINVAL); 2257 } 2258 return (err); 2259 } 2260 2261 static int 2262 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2263 { 2264 struct vie *vie; 2265 2266 vie = vm->vcpu[vcpuid].vie_ctx; 2267 2268 if (vie_pending(vie)) { 2269 /* 2270 * Userspace has not fulfilled the pending needs of the 2271 * instruction emulation, so bail back out. 
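* (For example, an MMIO read exit whose VEC_FULFILL_MMIO response has not
* yet been supplied by userspace must be completed before re-entry.)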
2272 */ 2273 vie_exitinfo(vie, vme); 2274 return (-1); 2275 } 2276 2277 return (0); 2278 } 2279 2280 int 2281 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2282 { 2283 int error; 2284 struct vcpu *vcpu; 2285 struct vm_exit *vme; 2286 bool intr_disabled; 2287 int affinity_type = CPU_CURRENT; 2288 2289 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2290 return (EINVAL); 2291 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2292 return (EINVAL); 2293 2294 vcpu = &vm->vcpu[vcpuid]; 2295 vme = &vcpu->exitinfo; 2296 2297 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2298 2299 vcpu->vtc.vtc_status = 0; 2300 ctxop_attach(curthread, vcpu->ctxop); 2301 2302 error = vm_entry_actions(vm, vcpuid, entry, vme); 2303 if (error != 0) { 2304 goto exit; 2305 } 2306 2307 restart: 2308 error = vm_loop_checks(vm, vcpuid, vme); 2309 if (error != 0) { 2310 goto exit; 2311 } 2312 2313 thread_affinity_set(curthread, affinity_type); 2314 /* 2315 * Resource localization should happen after the CPU affinity for the 2316 * thread has been set to ensure that access from restricted contexts, 2317 * such as VMX-accelerated APIC operations, can occur without inducing 2318 * cyclic cross-calls. 2319 * 2320 * This must be done prior to disabling kpreempt via critical_enter(). 2321 */ 2322 vm_localize_resources(vm, vcpu); 2323 affinity_type = CPU_CURRENT; 2324 critical_enter(); 2325 2326 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2327 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2328 2329 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2330 restore_guest_fpustate(vcpu); 2331 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2332 } 2333 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2334 2335 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2336 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2337 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2338 2339 /* 2340 * Once clear of the delicate contexts comprising the VM_RUN handler, 2341 * thread CPU affinity can be loosened while other processing occurs. 
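* (The pinning established ahead of vm_localize_resources() above is no
* longer needed once VMRUN() has returned.)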
2342 */ 2343 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2344 thread_affinity_clear(curthread); 2345 critical_exit(); 2346 2347 if (error != 0) { 2348 /* Communicate out any error from VMRUN() above */ 2349 goto exit; 2350 } 2351 2352 vcpu->nextrip = vme->rip + vme->inst_length; 2353 switch (vme->exitcode) { 2354 case VM_EXITCODE_REQIDLE: 2355 error = vm_handle_reqidle(vm, vcpuid); 2356 break; 2357 case VM_EXITCODE_RUN_STATE: 2358 error = vm_handle_run_state(vm, vcpuid); 2359 break; 2360 case VM_EXITCODE_SUSPENDED: 2361 error = vm_handle_suspend(vm, vcpuid); 2362 break; 2363 case VM_EXITCODE_IOAPIC_EOI: 2364 vioapic_process_eoi(vm, vcpuid, 2365 vme->u.ioapic_eoi.vector); 2366 break; 2367 case VM_EXITCODE_HLT: 2368 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2369 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2370 break; 2371 case VM_EXITCODE_PAGING: 2372 error = vm_handle_paging(vm, vcpuid); 2373 break; 2374 case VM_EXITCODE_MMIO_EMUL: 2375 error = vm_handle_mmio_emul(vm, vcpuid); 2376 break; 2377 case VM_EXITCODE_INOUT: 2378 error = vm_handle_inout(vm, vcpuid, vme); 2379 break; 2380 case VM_EXITCODE_INST_EMUL: 2381 error = vm_handle_inst_emul(vm, vcpuid); 2382 break; 2383 case VM_EXITCODE_MONITOR: 2384 case VM_EXITCODE_MWAIT: 2385 case VM_EXITCODE_VMINSN: 2386 vm_inject_ud(vm, vcpuid); 2387 break; 2388 case VM_EXITCODE_RDMSR: 2389 error = vm_handle_rdmsr(vm, vcpuid, vme); 2390 break; 2391 case VM_EXITCODE_WRMSR: 2392 error = vm_handle_wrmsr(vm, vcpuid, vme); 2393 break; 2394 case VM_EXITCODE_HT: 2395 affinity_type = CPU_BEST; 2396 break; 2397 case VM_EXITCODE_MTRAP: 2398 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2399 error = -1; 2400 break; 2401 default: 2402 /* handled in userland */ 2403 error = -1; 2404 break; 2405 } 2406 2407 if (error == 0) { 2408 /* VM exit conditions handled in-kernel, continue running */ 2409 goto restart; 2410 } 2411 2412 exit: 2413 kpreempt_disable(); 2414 ctxop_detach(curthread, vcpu->ctxop); 2415 /* Make sure all of the needed vCPU context state is saved */ 2416 vmm_savectx(&vcpu->vtc); 2417 kpreempt_enable(); 2418 2419 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2420 return (error); 2421 } 2422 2423 int 2424 vm_restart_instruction(void *arg, int vcpuid) 2425 { 2426 struct vm *vm; 2427 struct vcpu *vcpu; 2428 enum vcpu_state state; 2429 uint64_t rip; 2430 int error; 2431 2432 vm = arg; 2433 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2434 return (EINVAL); 2435 2436 vcpu = &vm->vcpu[vcpuid]; 2437 state = vcpu_get_state(vm, vcpuid, NULL); 2438 if (state == VCPU_RUNNING) { 2439 /* 2440 * When a vcpu is "running" the next instruction is determined 2441 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 2442 * Thus setting 'inst_length' to zero will cause the current 2443 * instruction to be restarted. 2444 */ 2445 vcpu->exitinfo.inst_length = 0; 2446 } else if (state == VCPU_FROZEN) { 2447 /* 2448 * When a vcpu is "frozen" it is outside the critical section 2449 * around VMRUN() and 'nextrip' points to the next instruction. 2450 * Thus instruction restart is achieved by setting 'nextrip' 2451 * to the vcpu's %rip. 
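* (%rip is re-read from the register state below rather than derived from
* 'exitinfo', since no VM exit is in flight for a frozen vCPU.)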
2452 */ 2453 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2454 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2455 vcpu->nextrip = rip; 2456 } else { 2457 panic("%s: invalid state %d", __func__, state); 2458 } 2459 return (0); 2460 } 2461 2462 int 2463 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2464 { 2465 struct vcpu *vcpu; 2466 2467 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2468 return (EINVAL); 2469 2470 vcpu = &vm->vcpu[vcpuid]; 2471 2472 if (VM_INTINFO_PENDING(info)) { 2473 const uint32_t type = VM_INTINFO_TYPE(info); 2474 const uint8_t vector = VM_INTINFO_VECTOR(info); 2475 2476 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2477 return (EINVAL); 2478 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2479 return (EINVAL); 2480 if (info & VM_INTINFO_MASK_RSVD) 2481 return (EINVAL); 2482 } else { 2483 info = 0; 2484 } 2485 vcpu->exit_intinfo = info; 2486 return (0); 2487 } 2488 2489 enum exc_class { 2490 EXC_BENIGN, 2491 EXC_CONTRIBUTORY, 2492 EXC_PAGEFAULT 2493 }; 2494 2495 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2496 2497 static enum exc_class 2498 exception_class(uint64_t info) 2499 { 2500 ASSERT(VM_INTINFO_PENDING(info)); 2501 2502 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2503 switch (VM_INTINFO_TYPE(info)) { 2504 case VM_INTINFO_HWINTR: 2505 case VM_INTINFO_SWINTR: 2506 case VM_INTINFO_NMI: 2507 return (EXC_BENIGN); 2508 default: 2509 /* 2510 * Hardware exception. 2511 * 2512 * SVM and VT-x use identical type values to represent NMI, 2513 * hardware interrupt and software interrupt. 2514 * 2515 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2516 * for exceptions except #BP and #OF. #BP and #OF use a type 2517 * value of '5' or '6'. Therefore we don't check for explicit 2518 * values of 'type' to classify 'intinfo' into a hardware 2519 * exception. 2520 */ 2521 break; 2522 } 2523 2524 switch (VM_INTINFO_VECTOR(info)) { 2525 case IDT_PF: 2526 case IDT_VE: 2527 return (EXC_PAGEFAULT); 2528 case IDT_DE: 2529 case IDT_TS: 2530 case IDT_NP: 2531 case IDT_SS: 2532 case IDT_GP: 2533 return (EXC_CONTRIBUTORY); 2534 default: 2535 return (EXC_BENIGN); 2536 } 2537 } 2538 2539 /* 2540 * Fetch event pending injection into the guest, if one exists. 2541 * 2542 * Returns true if an event is to be injected (which is placed in `retinfo`). 2543 */ 2544 bool 2545 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2546 { 2547 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2548 const uint64_t info1 = vcpu->exit_intinfo; 2549 vcpu->exit_intinfo = 0; 2550 const uint64_t info2 = vcpu->exc_pending; 2551 vcpu->exc_pending = 0; 2552 2553 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2554 /* 2555 * If an exception occurs while attempting to call the 2556 * double-fault handler the processor enters shutdown mode 2557 * (aka triple fault). 2558 */ 2559 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2560 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2561 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2562 *retinfo = 0; 2563 return (false); 2564 } 2565 /* 2566 * "Conditions for Generating a Double Fault" 2567 * Intel SDM, Vol3, Table 6-5 2568 */ 2569 const enum exc_class exc1 = exception_class(info1); 2570 const enum exc_class exc2 = exception_class(info2); 2571 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2572 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2573 /* Convert nested fault into a double fault. 
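(For instance, a #GP raised while delivering a #NP, both contributory exceptions, is promoted to #DF per the SDM table cited above.)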
*/ 2574 *retinfo = 2575 VM_INTINFO_VALID | 2576 VM_INTINFO_DEL_ERRCODE | 2577 VM_INTINFO_HWEXCP | 2578 IDT_DF; 2579 } else { 2580 /* Handle exceptions serially */ 2581 vcpu->exit_intinfo = info1; 2582 *retinfo = info2; 2583 } 2584 return (true); 2585 } else if (VM_INTINFO_PENDING(info1)) { 2586 *retinfo = info1; 2587 return (true); 2588 } else if (VM_INTINFO_PENDING(info2)) { 2589 *retinfo = info2; 2590 return (true); 2591 } 2592 2593 return (false); 2594 } 2595 2596 int 2597 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2598 { 2599 struct vcpu *vcpu; 2600 2601 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2602 return (EINVAL); 2603 2604 vcpu = &vm->vcpu[vcpuid]; 2605 *info1 = vcpu->exit_intinfo; 2606 *info2 = vcpu->exc_pending; 2607 return (0); 2608 } 2609 2610 int 2611 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2612 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2613 { 2614 struct vcpu *vcpu; 2615 uint64_t regval; 2616 int error; 2617 2618 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2619 return (EINVAL); 2620 2621 if (vector >= 32) 2622 return (EINVAL); 2623 2624 /* 2625 * NMIs are to be injected via their own specialized path using 2626 * vm_inject_nmi(). 2627 */ 2628 if (vector == IDT_NMI) { 2629 return (EINVAL); 2630 } 2631 2632 /* 2633 * A double fault exception should never be injected directly into 2634 * the guest. It is a derived exception that results from specific 2635 * combinations of nested faults. 2636 */ 2637 if (vector == IDT_DF) { 2638 return (EINVAL); 2639 } 2640 2641 vcpu = &vm->vcpu[vcpuid]; 2642 2643 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2644 /* Unable to inject exception due to one already pending */ 2645 return (EBUSY); 2646 } 2647 2648 if (errcode_valid) { 2649 /* 2650 * Exceptions don't deliver an error code in real mode. 2651 */ 2652 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); 2653 VERIFY0(error); 2654 if ((regval & CR0_PE) == 0) { 2655 errcode_valid = false; 2656 } 2657 } 2658 2659 /* 2660 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2661 * 2662 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2663 * one instruction or incurs an exception.
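*
* (This is why the interrupt shadow is cleared below before the exception
* is queued for injection.)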
2664 */ 2665 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2666 VERIFY0(error); 2667 2668 if (restart_instruction) { 2669 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2670 } 2671 2672 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2673 if (errcode_valid) { 2674 val |= VM_INTINFO_DEL_ERRCODE; 2675 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2676 } 2677 vcpu->exc_pending = val; 2678 return (0); 2679 } 2680 2681 void 2682 vm_inject_ud(struct vm *vm, int vcpuid) 2683 { 2684 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2685 } 2686 2687 void 2688 vm_inject_gp(struct vm *vm, int vcpuid) 2689 { 2690 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2691 } 2692 2693 void 2694 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2695 { 2696 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2697 } 2698 2699 void 2700 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2701 { 2702 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2703 } 2704 2705 void 2706 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2707 { 2708 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2709 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2710 } 2711 2712 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2713 2714 int 2715 vm_inject_nmi(struct vm *vm, int vcpuid) 2716 { 2717 struct vcpu *vcpu; 2718 2719 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2720 return (EINVAL); 2721 2722 vcpu = &vm->vcpu[vcpuid]; 2723 2724 vcpu->nmi_pending = true; 2725 vcpu_notify_event(vm, vcpuid); 2726 return (0); 2727 } 2728 2729 bool 2730 vm_nmi_pending(struct vm *vm, int vcpuid) 2731 { 2732 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2733 2734 return (vcpu->nmi_pending); 2735 } 2736 2737 void 2738 vm_nmi_clear(struct vm *vm, int vcpuid) 2739 { 2740 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2741 2742 ASSERT(vcpu->nmi_pending); 2743 2744 vcpu->nmi_pending = false; 2745 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2746 } 2747 2748 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2749 2750 int 2751 vm_inject_extint(struct vm *vm, int vcpuid) 2752 { 2753 struct vcpu *vcpu; 2754 2755 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2756 return (EINVAL); 2757 2758 vcpu = &vm->vcpu[vcpuid]; 2759 2760 vcpu->extint_pending = true; 2761 vcpu_notify_event(vm, vcpuid); 2762 return (0); 2763 } 2764 2765 bool 2766 vm_extint_pending(struct vm *vm, int vcpuid) 2767 { 2768 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2769 2770 return (vcpu->extint_pending); 2771 } 2772 2773 void 2774 vm_extint_clear(struct vm *vm, int vcpuid) 2775 { 2776 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2777 2778 ASSERT(vcpu->extint_pending); 2779 2780 vcpu->extint_pending = false; 2781 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2782 } 2783 2784 int 2785 vm_inject_init(struct vm *vm, int vcpuid) 2786 { 2787 struct vcpu *vcpu; 2788 2789 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2790 return (EINVAL); 2791 2792 vcpu = &vm->vcpu[vcpuid]; 2793 vcpu_lock(vcpu); 2794 vcpu->run_state |= VRS_PEND_INIT; 2795 /* 2796 * As part of queuing the INIT request, clear any pending SIPI. It 2797 * would not otherwise survive across the reset of the vCPU when it 2798 * undergoes the requested INIT. We would not want it to linger when it 2799 * could be mistaken as a subsequent (after the INIT) SIPI request. 
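* (In the conventional INIT-SIPI-SIPI bring-up sequence, only a SIPI that
* arrives after the INIT has been processed should start the vCPU.)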
2800 */ 2801 vcpu->run_state &= ~VRS_PEND_SIPI; 2802 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2803 2804 vcpu_unlock(vcpu); 2805 return (0); 2806 } 2807 2808 int 2809 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2810 { 2811 struct vcpu *vcpu; 2812 2813 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2814 return (EINVAL); 2815 2816 vcpu = &vm->vcpu[vcpuid]; 2817 vcpu_lock(vcpu); 2818 vcpu->run_state |= VRS_PEND_SIPI; 2819 vcpu->sipi_vector = vector; 2820 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2821 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2822 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2823 } 2824 vcpu_unlock(vcpu); 2825 return (0); 2826 } 2827 2828 bool 2829 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2830 { 2831 struct vcpu *vcpu; 2832 2833 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2834 vcpu = &vm->vcpu[vcpuid]; 2835 2836 /* Of interest: vCPU not in running state or with pending INIT */ 2837 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2838 } 2839 2840 int 2841 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2842 { 2843 struct seg_desc desc; 2844 const enum vm_reg_name clear_regs[] = { 2845 VM_REG_GUEST_CR2, 2846 VM_REG_GUEST_CR3, 2847 VM_REG_GUEST_CR4, 2848 VM_REG_GUEST_RAX, 2849 VM_REG_GUEST_RBX, 2850 VM_REG_GUEST_RCX, 2851 VM_REG_GUEST_RSI, 2852 VM_REG_GUEST_RDI, 2853 VM_REG_GUEST_RBP, 2854 VM_REG_GUEST_RSP, 2855 VM_REG_GUEST_R8, 2856 VM_REG_GUEST_R9, 2857 VM_REG_GUEST_R10, 2858 VM_REG_GUEST_R11, 2859 VM_REG_GUEST_R12, 2860 VM_REG_GUEST_R13, 2861 VM_REG_GUEST_R14, 2862 VM_REG_GUEST_R15, 2863 VM_REG_GUEST_DR0, 2864 VM_REG_GUEST_DR1, 2865 VM_REG_GUEST_DR2, 2866 VM_REG_GUEST_DR3, 2867 VM_REG_GUEST_EFER, 2868 }; 2869 const enum vm_reg_name data_segs[] = { 2870 VM_REG_GUEST_SS, 2871 VM_REG_GUEST_DS, 2872 VM_REG_GUEST_ES, 2873 VM_REG_GUEST_FS, 2874 VM_REG_GUEST_GS, 2875 }; 2876 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2877 2878 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2879 return (EINVAL); 2880 2881 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2882 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2883 } 2884 2885 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2886 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2887 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2888 2889 /* 2890 * The prescribed contents of %rdx differ slightly between the Intel and 2891 * AMD architectural definitions. The former expects the Extended Model 2892 * in bits 16-19 where the latter expects all of the Family, Model, and 2893 * Stepping to be there. Common boot ROMs appear to disregard this 2894 * anyway, so we stick with a compromise value similar to what is 2895 * spelled out in the Intel SDM.
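* (The 0x600 written below encodes Family 6 in bits 8-11, with Model and
* Stepping left at zero.)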
2896 */ 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2898 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2900 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2901 2902 /* CS: Present, R/W, Accessed */ 2903 desc.access = 0x0093; 2904 desc.base = 0xffff0000; 2905 desc.limit = 0xffff; 2906 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2907 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2908 2909 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2910 desc.access = 0x0093; 2911 desc.base = 0; 2912 desc.limit = 0xffff; 2913 for (uint_t i = 0; i < nitems(data_segs); i++) { 2914 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2915 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2916 } 2917 2918 /* GDTR, IDTR */ 2919 desc.base = 0; 2920 desc.limit = 0xffff; 2921 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2922 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2923 2924 /* LDTR: Present, LDT */ 2925 desc.access = 0x0082; 2926 desc.base = 0; 2927 desc.limit = 0xffff; 2928 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2929 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2930 2931 /* TR: Present, 32-bit TSS */ 2932 desc.access = 0x008b; 2933 desc.base = 0; 2934 desc.limit = 0xffff; 2935 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2936 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2937 2938 vlapic_reset(vm_lapic(vm, vcpuid)); 2939 2940 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2941 2942 vcpu->exit_intinfo = 0; 2943 vcpu->exc_pending = 0; 2944 vcpu->nmi_pending = false; 2945 vcpu->extint_pending = false; 2946 2947 /* 2948 * A CPU reset caused by power-on or system reset clears more state than 2949 * one which is triggered from an INIT IPI.
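* (An INIT, for example, leaves the guest FPU and MTRR contents intact;
* only the full-reset path below reinitializes them.)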
2950 */ 2951 if (!init_only) { 2952 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2953 (void) hma_fpu_init(vcpu->guestfpu); 2954 2955 /* XXX: clear MSRs and other pieces */ 2956 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 2957 } 2958 2959 return (0); 2960 } 2961 2962 static int 2963 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2964 { 2965 struct seg_desc desc; 2966 2967 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2968 return (EINVAL); 2969 2970 /* CS: Present, R/W, Accessed */ 2971 desc.access = 0x0093; 2972 desc.base = (uint64_t)vector << 12; 2973 desc.limit = 0xffff; 2974 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2975 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 2976 (uint64_t)vector << 8)); 2977 2978 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 2979 2980 return (0); 2981 } 2982 2983 int 2984 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 2985 { 2986 if (vcpu < 0 || vcpu >= vm->maxcpus) 2987 return (EINVAL); 2988 2989 if (type < 0 || type >= VM_CAP_MAX) 2990 return (EINVAL); 2991 2992 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 2993 } 2994 2995 int 2996 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 2997 { 2998 if (vcpu < 0 || vcpu >= vm->maxcpus) 2999 return (EINVAL); 3000 3001 if (type < 0 || type >= VM_CAP_MAX) 3002 return (EINVAL); 3003 3004 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3005 } 3006 3007 struct vlapic * 3008 vm_lapic(struct vm *vm, int cpu) 3009 { 3010 return (vm->vcpu[cpu].vlapic); 3011 } 3012 3013 struct vioapic * 3014 vm_ioapic(struct vm *vm) 3015 { 3016 3017 return (vm->vioapic); 3018 } 3019 3020 struct vhpet * 3021 vm_hpet(struct vm *vm) 3022 { 3023 3024 return (vm->vhpet); 3025 } 3026 3027 void * 3028 vm_iommu_domain(struct vm *vm) 3029 { 3030 3031 return (vm->iommu); 3032 } 3033 3034 int 3035 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3036 bool from_idle) 3037 { 3038 int error; 3039 struct vcpu *vcpu; 3040 3041 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3042 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3043 3044 vcpu = &vm->vcpu[vcpuid]; 3045 3046 vcpu_lock(vcpu); 3047 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3048 vcpu_unlock(vcpu); 3049 3050 return (error); 3051 } 3052 3053 enum vcpu_state 3054 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3055 { 3056 struct vcpu *vcpu; 3057 enum vcpu_state state; 3058 3059 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3060 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3061 3062 vcpu = &vm->vcpu[vcpuid]; 3063 3064 vcpu_lock(vcpu); 3065 state = vcpu->state; 3066 if (hostcpu != NULL) 3067 *hostcpu = vcpu->hostcpu; 3068 vcpu_unlock(vcpu); 3069 3070 return (state); 3071 } 3072 3073 uint64_t 3074 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3075 { 3076 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3077 3078 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3079 3080 if (phys_adj) { 3081 /* Include any offset for the current physical CPU too */ 3082 extern hrtime_t tsc_gethrtime_tick_delta(void); 3083 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3084 } 3085 3086 return (vcpu_off); 3087 } 3088 3089 int 3090 vm_activate_cpu(struct vm *vm, int vcpuid) 3091 { 3092 3093 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3094 return (EINVAL); 3095 3096 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3097 return (EBUSY); 3098 3099 if (vm->suspend != 0) { 3100 return (EBUSY); 3101 } 3102 3103 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3104 3105 /* 3106 * It is possible 
that this vCPU was undergoing activation at the same 3107 time that the VM was being suspended. If that happens to be the 3108 case, it should reflect the suspended state immediately. 3109 */ 3110 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3111 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3112 } 3113 3114 return (0); 3115 } 3116 3117 int 3118 vm_suspend_cpu(struct vm *vm, int vcpuid) 3119 { 3120 int i; 3121 3122 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3123 return (EINVAL); 3124 3125 if (vcpuid == -1) { 3126 vm->debug_cpus = vm->active_cpus; 3127 for (i = 0; i < vm->maxcpus; i++) { 3128 if (CPU_ISSET(i, &vm->active_cpus)) 3129 vcpu_notify_event(vm, i); 3130 } 3131 } else { 3132 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3133 return (EINVAL); 3134 3135 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3136 vcpu_notify_event(vm, vcpuid); 3137 } 3138 return (0); 3139 } 3140 3141 int 3142 vm_resume_cpu(struct vm *vm, int vcpuid) 3143 { 3144 3145 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3146 return (EINVAL); 3147 3148 if (vcpuid == -1) { 3149 CPU_ZERO(&vm->debug_cpus); 3150 } else { 3151 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3152 return (EINVAL); 3153 3154 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3155 } 3156 return (0); 3157 } 3158 3159 static bool 3160 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3161 uint64_t entry_rip) 3162 { 3163 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3164 struct vm_exit *vme = &vcpu->exitinfo; 3165 bool bail = false; 3166 3167 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3168 3169 if (vm->suspend) { 3170 if (on_entry) { 3171 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3172 vm->suspend < VM_SUSPEND_LAST); 3173 3174 vme->exitcode = VM_EXITCODE_SUSPENDED; 3175 vme->u.suspended.how = vm->suspend; 3176 } else { 3177 /* 3178 * Handling VM suspend is complicated, so if that 3179 * condition is detected outside of VM-entry itself, 3180 * just emit a BOGUS exitcode so we take a lap to pick 3181 * up the event during an entry and are directed into 3182 * the vm_handle_suspend() logic. 3183 */ 3184 vme->exitcode = VM_EXITCODE_BOGUS; 3185 } 3186 bail = true; 3187 } 3188 if (vcpu->reqidle) { 3189 vme->exitcode = VM_EXITCODE_REQIDLE; 3190 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3191 3192 if (!on_entry) { 3193 /* 3194 * A reqidle request detected outside of VM-entry can be 3195 * handled directly by clearing the request (and taking 3196 * a lap to userspace). 3197 */ 3198 vcpu_assert_locked(vcpu); 3199 vcpu->reqidle = 0; 3200 } 3201 bail = true; 3202 } 3203 if (vcpu_should_yield(vm, vcpuid)) { 3204 vme->exitcode = VM_EXITCODE_BOGUS; 3205 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3206 bail = true; 3207 } 3208 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3209 vme->exitcode = VM_EXITCODE_DEBUG; 3210 bail = true; 3211 } 3212 3213 if (bail) { 3214 if (on_entry) { 3215 /* 3216 * If bailing out during VM-entry, the current %rip must 3217 * be recorded in the exitinfo. 3218 */ 3219 vme->rip = entry_rip; 3220 } 3221 vme->inst_length = 0; 3222 } 3223 return (bail); 3224 } 3225 3226 static bool 3227 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3228 { 3229 /* 3230 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or 3231 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3232 * structure, and we would only modify the exitcode.
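* (Hence 'on_entry' is passed as false below, and the entry_rip argument
* of 0 is ignored.)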
3233 */ 3234 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3235 } 3236 3237 bool 3238 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3239 { 3240 /* 3241 * Bail-out checks done as part of VM entry require an updated %rip to 3242 * populate the vm_exit struct if any of the conditions of interest are 3243 * matched in the check. 3244 */ 3245 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3246 } 3247 3248 cpuset_t 3249 vm_active_cpus(struct vm *vm) 3250 { 3251 3252 return (vm->active_cpus); 3253 } 3254 3255 cpuset_t 3256 vm_debug_cpus(struct vm *vm) 3257 { 3258 3259 return (vm->debug_cpus); 3260 } 3261 3262 cpuset_t 3263 vm_suspended_cpus(struct vm *vm) 3264 { 3265 3266 return (vm->suspended_cpus); 3267 } 3268 3269 void * 3270 vcpu_stats(struct vm *vm, int vcpuid) 3271 { 3272 3273 return (vm->vcpu[vcpuid].stats); 3274 } 3275 3276 int 3277 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3278 { 3279 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3280 return (EINVAL); 3281 3282 *state = vm->vcpu[vcpuid].x2apic_state; 3283 3284 return (0); 3285 } 3286 3287 int 3288 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3289 { 3290 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3291 return (EINVAL); 3292 3293 if (state >= X2APIC_STATE_LAST) 3294 return (EINVAL); 3295 3296 vm->vcpu[vcpuid].x2apic_state = state; 3297 3298 vlapic_set_x2apic_state(vm, vcpuid, state); 3299 3300 return (0); 3301 } 3302 3303 /* 3304 * This function is called to ensure that a vcpu "sees" a pending event 3305 * as soon as possible: 3306 * - If the vcpu thread is sleeping then it is woken up. 3307 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3308 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3309 */ 3310 static void 3311 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3312 { 3313 int hostcpu; 3314 3315 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT); 3316 3317 hostcpu = vcpu->hostcpu; 3318 if (vcpu->state == VCPU_RUNNING) { 3319 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3320 if (hostcpu != curcpu) { 3321 if (ntype == VCPU_NOTIFY_APIC) { 3322 vlapic_post_intr(vcpu->vlapic, hostcpu); 3323 } else { 3324 poke_cpu(hostcpu); 3325 } 3326 } else { 3327 /* 3328 * If the 'vcpu' is running on 'curcpu' then it must 3329 * be sending a notification to itself (e.g. SELF_IPI). 3330 * The pending event will be picked up when the vcpu 3331 * transitions back to guest context.
3332 */ 3333 } 3334 } else { 3335 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3336 "with hostcpu %d", vcpu->state, hostcpu)); 3337 if (vcpu->state == VCPU_SLEEPING) { 3338 cv_signal(&vcpu->vcpu_cv); 3339 } 3340 } 3341 } 3342 3343 void 3344 vcpu_notify_event(struct vm *vm, int vcpuid) 3345 { 3346 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3347 3348 vcpu_lock(vcpu); 3349 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3350 vcpu_unlock(vcpu); 3351 } 3352 3353 void 3354 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3355 { 3356 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3357 3358 if (ntype == VCPU_NOTIFY_NONE) { 3359 return; 3360 } 3361 3362 vcpu_lock(vcpu); 3363 vcpu_notify_event_locked(vcpu, ntype); 3364 vcpu_unlock(vcpu); 3365 } 3366 3367 void 3368 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3369 { 3370 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3371 hrtime_t now = gethrtime(); 3372 3373 ASSERT3U(ustate, !=, vcpu->ustate); 3374 ASSERT3S(ustate, <, VU_MAX); 3375 ASSERT3S(ustate, >=, VU_INIT); 3376 3377 hrtime_t delta = now - vcpu->ustate_when; 3378 vcpu->ustate_total[vcpu->ustate] += delta; 3379 3380 membar_producer(); 3381 3382 vcpu->ustate_when = now; 3383 vcpu->ustate = ustate; 3384 } 3385 3386 struct vmspace * 3387 vm_get_vmspace(struct vm *vm) 3388 { 3389 3390 return (vm->vmspace); 3391 } 3392 3393 struct vm_client * 3394 vm_get_vmclient(struct vm *vm, int vcpuid) 3395 { 3396 return (vm->vcpu[vcpuid].vmclient); 3397 } 3398 3399 int 3400 vm_apicid2vcpuid(struct vm *vm, int apicid) 3401 { 3402 /* 3403 * XXX apic id is assumed to be numerically identical to vcpu id 3404 */ 3405 return (apicid); 3406 } 3407 3408 struct vatpic * 3409 vm_atpic(struct vm *vm) 3410 { 3411 return (vm->vatpic); 3412 } 3413 3414 struct vatpit * 3415 vm_atpit(struct vm *vm) 3416 { 3417 return (vm->vatpit); 3418 } 3419 3420 struct vpmtmr * 3421 vm_pmtmr(struct vm *vm) 3422 { 3423 3424 return (vm->vpmtmr); 3425 } 3426 3427 struct vrtc * 3428 vm_rtc(struct vm *vm) 3429 { 3430 3431 return (vm->vrtc); 3432 } 3433 3434 enum vm_reg_name 3435 vm_segment_name(int seg) 3436 { 3437 static enum vm_reg_name seg_names[] = { 3438 VM_REG_GUEST_ES, 3439 VM_REG_GUEST_CS, 3440 VM_REG_GUEST_SS, 3441 VM_REG_GUEST_DS, 3442 VM_REG_GUEST_FS, 3443 VM_REG_GUEST_GS 3444 }; 3445 3446 KASSERT(seg >= 0 && seg < nitems(seg_names), 3447 ("%s: invalid segment encoding %d", __func__, seg)); 3448 return (seg_names[seg]); 3449 } 3450 3451 void 3452 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3453 uint_t num_copyinfo) 3454 { 3455 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3456 if (copyinfo[idx].cookie != NULL) { 3457 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3458 } 3459 } 3460 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3461 } 3462 3463 int 3464 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3465 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3466 uint_t num_copyinfo, int *fault) 3467 { 3468 uint_t idx, nused; 3469 size_t n, off, remaining; 3470 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3471 3472 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3473 3474 nused = 0; 3475 remaining = len; 3476 while (remaining > 0) { 3477 uint64_t gpa; 3478 int error; 3479 3480 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3481 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3482 if (error || *fault) 3483 return (error); 3484 off = gpa & PAGEOFFSET; 3485 n 
= min(remaining, PAGESIZE - off); 3486 copyinfo[nused].gpa = gpa; 3487 copyinfo[nused].len = n; 3488 remaining -= n; 3489 gla += n; 3490 nused++; 3491 } 3492 3493 for (idx = 0; idx < nused; idx++) { 3494 vm_page_t *vmp; 3495 caddr_t hva; 3496 3497 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3498 if (vmp == NULL) { 3499 break; 3500 } 3501 if ((prot & PROT_WRITE) != 0) { 3502 hva = (caddr_t)vmp_get_writable(vmp); 3503 } else { 3504 hva = (caddr_t)vmp_get_readable(vmp); 3505 } 3506 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3507 copyinfo[idx].cookie = vmp; 3508 copyinfo[idx].prot = prot; 3509 } 3510 3511 if (idx != nused) { 3512 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3513 return (EFAULT); 3514 } else { 3515 *fault = 0; 3516 return (0); 3517 } 3518 } 3519 3520 void 3521 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3522 size_t len) 3523 { 3524 char *dst; 3525 int idx; 3526 3527 dst = kaddr; 3528 idx = 0; 3529 while (len > 0) { 3530 ASSERT(copyinfo[idx].prot & PROT_READ); 3531 3532 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3533 len -= copyinfo[idx].len; 3534 dst += copyinfo[idx].len; 3535 idx++; 3536 } 3537 } 3538 3539 void 3540 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3541 struct vm_copyinfo *copyinfo, size_t len) 3542 { 3543 const char *src; 3544 int idx; 3545 3546 src = kaddr; 3547 idx = 0; 3548 while (len > 0) { 3549 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3550 3551 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3552 len -= copyinfo[idx].len; 3553 src += copyinfo[idx].len; 3554 idx++; 3555 } 3556 } 3557 3558 /* 3559 * Return the amount of in-use (resident) memory for the VM. Since 3560 * these are global stats, only return the values for vCPU 0. 3561 */ 3562 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3563 3564 static void 3565 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3566 { 3567 if (vcpu == 0) { 3568 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3569 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3570 } 3571 } 3572 3573 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3574 3575 int 3576 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3577 uint8_t bytes, uint32_t *val) 3578 { 3579 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3580 } 3581 3582 /* 3583 * bhyve-internal interfaces to attach or detach IO port handlers. 3584 * Must be called with VM write lock held for safety. 3585 */ 3586 int 3587 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3588 void **cookie) 3589 { 3590 int err; 3591 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3592 if (err == 0) { 3593 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3594 } 3595 return (err); 3596 } 3597 int 3598 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3599 void **old_arg) 3600 { 3601 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3602 int err; 3603 3604 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3605 if (err == 0) { 3606 *cookie = NULL; 3607 } 3608 return (err); 3609 } 3610 3611 /* 3612 * External driver interfaces to attach or detach IO port handlers. 3613 * Must be called with VM write lock held for safety.
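*
* A minimal usage sketch (handler and state names are hypothetical, not
* part of this file):
*
*	void *cookie;
*	if (vm_ioport_hook(vm, 0x92, mydrv_handler, mydrv_state,
*	    &cookie) == 0) {
*		... port accesses now trap to mydrv_handler ...
*		vm_ioport_unhook(vm, &cookie);
*	}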
3614 */ 3615 int 3616 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3617 void *arg, void **cookie) 3618 { 3619 int err; 3620 3621 if (port == 0) { 3622 return (EINVAL); 3623 } 3624 3625 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3626 if (err == 0) { 3627 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3628 } 3629 return (err); 3630 } 3631 void 3632 vm_ioport_unhook(struct vm *vm, void **cookie) 3633 { 3634 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3635 ioport_handler_t old_func; 3636 void *old_arg; 3637 int err; 3638 3639 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3640 3641 /* ioport-hook-using drivers are expected to be well-behaved */ 3642 VERIFY0(err); 3643 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3644 3645 *cookie = NULL; 3646 } 3647 3648 int 3649 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3650 { 3651 struct vm *vm = ksp->ks_private; 3652 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3653 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3654 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3655 3656 ASSERT3U(vcpuid, <, VM_MAXCPU); 3657 3658 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3659 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3660 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3661 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3662 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3663 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3664 3665 return (0); 3666 } 3667
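/*
 * Illustrative userland sketch (not part of this file) for consuming the
 * per-vCPU microstate kstats populated by vmm_kstat_update_vcpu() above.
 * The module/name/statistic strings ("vmm", "vcpu0", "time_run") are
 * assumptions for the example; consult the kstat creation site for the
 * real identifiers.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	if (kc != NULL) {
 *		kstat_t *ksp = kstat_lookup(kc, "vmm", 0, "vcpu0");
 *		if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *			kstat_named_t *kn =
 *			    kstat_data_lookup(ksp, "time_run");
 *			if (kn != NULL)
 *				(void) printf("%llu\n", kn->value.ui64);
 *		}
 *		(void) kstat_close(kc);
 *	}
 */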