/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1	/* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2	/* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int		exception_pending; /* (i) exception pending */
	int		exc_vector;	/* (x) exception collateral */
	int		exc_errcode_valid;
	uint32_t	exc_errcode;
	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
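/* The lock assertion below uses ASSERT() and is only active on DEBUG kernels. */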
#define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock))) 153 154 struct mem_seg { 155 size_t len; 156 bool sysmem; 157 vm_object_t *object; 158 }; 159 #define VM_MAX_MEMSEGS 4 160 161 struct mem_map { 162 vm_paddr_t gpa; 163 size_t len; 164 vm_ooffset_t segoff; 165 int segid; 166 int prot; 167 int flags; 168 }; 169 #define VM_MAX_MEMMAPS 8 170 171 /* 172 * Initialization: 173 * (o) initialized the first time the VM is created 174 * (i) initialized when VM is created and when it is reinitialized 175 * (x) initialized before use 176 */ 177 struct vm { 178 void *cookie; /* (i) cpu-specific data */ 179 void *iommu; /* (x) iommu-specific data */ 180 struct vhpet *vhpet; /* (i) virtual HPET */ 181 struct vioapic *vioapic; /* (i) virtual ioapic */ 182 struct vatpic *vatpic; /* (i) virtual atpic */ 183 struct vatpit *vatpit; /* (i) virtual atpit */ 184 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 185 struct vrtc *vrtc; /* (o) virtual RTC */ 186 volatile cpuset_t active_cpus; /* (i) active vcpus */ 187 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */ 188 int suspend; /* (i) stop VM execution */ 189 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 190 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 191 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 192 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 193 struct vmspace *vmspace; /* (o) guest's address space */ 194 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 195 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 196 /* The following describe the vm cpu topology */ 197 uint16_t sockets; /* (o) num of sockets */ 198 uint16_t cores; /* (o) num of cores/socket */ 199 uint16_t threads; /* (o) num of threads/core */ 200 uint16_t maxcpus; /* (o) max pluggable cpus */ 201 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ 202 203 struct ioport_config ioports; /* (o) ioport handling */ 204 205 bool mem_transient; /* (o) alloc transient memory */ 206 }; 207 208 static int vmm_initialized; 209 210 211 static void 212 nullop_panic(void) 213 { 214 panic("null vmm operation call"); 215 } 216 217 /* Do not allow use of an un-set `ops` to do anything but panic */ 218 static struct vmm_ops vmm_ops_null = { 219 .init = (vmm_init_func_t)nullop_panic, 220 .cleanup = (vmm_cleanup_func_t)nullop_panic, 221 .resume = (vmm_resume_func_t)nullop_panic, 222 .vminit = (vmi_init_func_t)nullop_panic, 223 .vmrun = (vmi_run_func_t)nullop_panic, 224 .vmcleanup = (vmi_cleanup_func_t)nullop_panic, 225 .vmgetreg = (vmi_get_register_t)nullop_panic, 226 .vmsetreg = (vmi_set_register_t)nullop_panic, 227 .vmgetdesc = (vmi_get_desc_t)nullop_panic, 228 .vmsetdesc = (vmi_set_desc_t)nullop_panic, 229 .vmgetcap = (vmi_get_cap_t)nullop_panic, 230 .vmsetcap = (vmi_set_cap_t)nullop_panic, 231 .vlapic_init = (vmi_vlapic_init)nullop_panic, 232 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, 233 .vmsavectx = (vmi_savectx)nullop_panic, 234 .vmrestorectx = (vmi_restorectx)nullop_panic, 235 }; 236 237 static struct vmm_ops *ops = &vmm_ops_null; 238 static vmm_pte_ops_t *pte_ops = NULL; 239 240 #define VMM_INIT() ((*ops->init)()) 241 #define VMM_CLEANUP() ((*ops->cleanup)()) 242 #define VMM_RESUME() ((*ops->resume)()) 243 244 #define VMINIT(vm) ((*ops->vminit)(vm)) 245 #define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) 246 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) 247 248 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) 249 #define 
VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) 250 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc)) 251 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc)) 252 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv)) 253 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val)) 254 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu)) 255 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic)) 256 257 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 258 #define fpu_stop_emulating() clts() 259 260 SDT_PROVIDER_DEFINE(vmm); 261 262 static MALLOC_DEFINE(M_VM, "vm", "vm"); 263 264 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 265 NULL); 266 267 /* 268 * Halt the guest if all vcpus are executing a HLT instruction with 269 * interrupts disabled. 270 */ 271 static int halt_detection_enabled = 1; 272 273 /* Trap into hypervisor on all guest exceptions and reflect them back */ 274 static int trace_guest_exceptions; 275 276 static void vm_free_memmap(struct vm *vm, int ident); 277 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 278 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); 279 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); 280 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); 281 282 static void vmm_savectx(void *); 283 static void vmm_restorectx(void *); 284 static const struct ctxop_template vmm_ctxop_tpl = { 285 .ct_rev = CTXOP_TPL_REV, 286 .ct_save = vmm_savectx, 287 .ct_restore = vmm_restorectx, 288 }; 289 290 #ifdef KTR 291 static const char * 292 vcpu_state2str(enum vcpu_state state) 293 { 294 295 switch (state) { 296 case VCPU_IDLE: 297 return ("idle"); 298 case VCPU_FROZEN: 299 return ("frozen"); 300 case VCPU_RUNNING: 301 return ("running"); 302 case VCPU_SLEEPING: 303 return ("sleeping"); 304 default: 305 return ("unknown"); 306 } 307 } 308 #endif 309 310 static void 311 vcpu_cleanup(struct vm *vm, int i, bool destroy) 312 { 313 struct vcpu *vcpu = &vm->vcpu[i]; 314 315 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 316 if (destroy) { 317 vmm_stat_free(vcpu->stats); 318 319 hma_fpu_free(vcpu->guestfpu); 320 vcpu->guestfpu = NULL; 321 322 vie_free(vcpu->vie_ctx); 323 vcpu->vie_ctx = NULL; 324 325 vmc_destroy(vcpu->vmclient); 326 vcpu->vmclient = NULL; 327 328 ctxop_free(vcpu->ctxop); 329 mutex_destroy(&vcpu->lock); 330 } 331 } 332 333 static void 334 vcpu_init(struct vm *vm, int vcpu_id, bool create) 335 { 336 struct vcpu *vcpu; 337 338 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, 339 ("vcpu_init: invalid vcpu %d", vcpu_id)); 340 341 vcpu = &vm->vcpu[vcpu_id]; 342 343 if (create) { 344 mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL); 345 346 vcpu->state = VCPU_IDLE; 347 vcpu->hostcpu = NOCPU; 348 vcpu->lastloccpu = NOCPU; 349 vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); 350 vcpu->stats = vmm_stat_alloc(); 351 vcpu->vie_ctx = vie_alloc(); 352 353 vcpu->ustate = VU_INIT; 354 vcpu->ustate_when = gethrtime(); 355 356 vcpu->vtc.vtc_vm = vm; 357 vcpu->vtc.vtc_vcpuid = vcpu_id; 358 vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc); 359 } else { 360 vie_reset(vcpu->vie_ctx); 361 bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); 362 if (vcpu->ustate != VU_INIT) { 363 vcpu_ustate_change(vm, vcpu_id, VU_INIT); 364 } 365 } 366 367 vcpu->run_state = VRS_HALT; 368 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); 369 (void) 
	    vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	iommu_cleanup();
	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets.  A reading of the TSC is negated to form the boot offset.
	 */
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
}

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(const char *name, uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
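	 * (vmm_initialized is set by vmm_mod_load() above once the backend
	 * VMM_INIT() call has succeeded.)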
517 */ 518 if (!vmm_initialized) 519 return (ENXIO); 520 521 /* Name validation has already occurred */ 522 VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN); 523 524 vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); 525 if (vmspace == NULL) 526 return (ENOMEM); 527 528 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO); 529 (void) strlcpy(vm->name, name, sizeof (vm->name)); 530 531 vm->vmspace = vmspace; 532 vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; 533 for (uint_t i = 0; i < VM_MAXCPU; i++) { 534 vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); 535 } 536 537 vm->sockets = 1; 538 vm->cores = cores_per_package; /* XXX backwards compatibility */ 539 vm->threads = threads_per_core; /* XXX backwards compatibility */ 540 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 541 542 vm_init(vm, true); 543 544 *retvm = vm; 545 return (0); 546 } 547 548 void 549 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 550 uint16_t *threads, uint16_t *maxcpus) 551 { 552 *sockets = vm->sockets; 553 *cores = vm->cores; 554 *threads = vm->threads; 555 *maxcpus = vm->maxcpus; 556 } 557 558 uint16_t 559 vm_get_maxcpus(struct vm *vm) 560 { 561 return (vm->maxcpus); 562 } 563 564 int 565 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 566 uint16_t threads, uint16_t maxcpus) 567 { 568 if (maxcpus != 0) 569 return (EINVAL); /* XXX remove when supported */ 570 if ((sockets * cores * threads) > vm->maxcpus) 571 return (EINVAL); 572 /* XXX need to check sockets * cores * threads == vCPU, how? */ 573 vm->sockets = sockets; 574 vm->cores = cores; 575 vm->threads = threads; 576 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 577 return (0); 578 } 579 580 static void 581 vm_cleanup(struct vm *vm, bool destroy) 582 { 583 struct mem_map *mm; 584 int i; 585 586 ppt_unassign_all(vm); 587 588 if (vm->iommu != NULL) 589 iommu_destroy_domain(vm->iommu); 590 591 /* 592 * Devices which attach their own ioport hooks should be cleaned up 593 * first so they can tear down those registrations. 594 */ 595 vpmtmr_cleanup(vm->vpmtmr); 596 597 vm_inout_cleanup(vm, &vm->ioports); 598 599 if (destroy) 600 vrtc_cleanup(vm->vrtc); 601 else 602 vrtc_reset(vm->vrtc); 603 604 vatpit_cleanup(vm->vatpit); 605 vhpet_cleanup(vm->vhpet); 606 vatpic_cleanup(vm->vatpic); 607 vioapic_cleanup(vm->vioapic); 608 609 for (i = 0; i < vm->maxcpus; i++) 610 vcpu_cleanup(vm, i, destroy); 611 612 VMCLEANUP(vm->cookie); 613 614 /* 615 * System memory is removed from the guest address space only when 616 * the VM is destroyed. This is because the mapping remains the same 617 * across VM reset. 618 * 619 * Device memory can be relocated by the guest (e.g. using PCI BARs) 620 * so those mappings are removed on a VM reset. 621 */ 622 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 623 mm = &vm->mem_maps[i]; 624 if (destroy || !sysmem_mapping(vm, mm)) { 625 vm_free_memmap(vm, i); 626 } else { 627 /* 628 * We need to reset the IOMMU flag so this mapping can 629 * be reused when a VM is rebooted. Since the IOMMU 630 * domain has already been destroyed we can just reset 631 * the flag here. 
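			 * (VM_MEMMAP_F_IOMMU is set and cleared by
			 * vm_iommu_modify() as passthru devices are assigned
			 * and unassigned.)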
632 */ 633 mm->flags &= ~VM_MEMMAP_F_IOMMU; 634 } 635 } 636 637 if (destroy) { 638 for (i = 0; i < VM_MAX_MEMSEGS; i++) 639 vm_free_memseg(vm, i); 640 641 vmspace_destroy(vm->vmspace); 642 vm->vmspace = NULL; 643 } 644 } 645 646 void 647 vm_destroy(struct vm *vm) 648 { 649 vm_cleanup(vm, true); 650 free(vm, M_VM); 651 } 652 653 int 654 vm_reinit(struct vm *vm, uint64_t flags) 655 { 656 /* A virtual machine can be reset only if all vcpus are suspended. */ 657 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) { 658 if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) { 659 return (EBUSY); 660 } 661 662 /* 663 * Force the VM (and all its vCPUs) into a suspended state. 664 * This should be quick and easy, since the vm_reinit() call is 665 * made while holding the VM write lock, which requires holding 666 * all of the vCPUs in the VCPU_FROZEN state. 667 */ 668 (void) atomic_cmpset_int((uint_t *)&vm->suspend, 0, 669 VM_SUSPEND_RESET); 670 for (uint_t i = 0; i < vm->maxcpus; i++) { 671 struct vcpu *vcpu = &vm->vcpu[i]; 672 673 if (CPU_ISSET(i, &vm->suspended_cpus) || 674 !CPU_ISSET(i, &vm->active_cpus)) { 675 continue; 676 } 677 678 vcpu_lock(vcpu); 679 VERIFY3U(vcpu->state, ==, VCPU_FROZEN); 680 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 681 vcpu_unlock(vcpu); 682 } 683 684 VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus)); 685 } 686 687 vm_cleanup(vm, false); 688 vm_init(vm, false); 689 return (0); 690 } 691 692 const char * 693 vm_name(struct vm *vm) 694 { 695 return (vm->name); 696 } 697 698 int 699 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 700 { 701 vm_object_t *obj; 702 703 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 704 return (ENOMEM); 705 else 706 return (0); 707 } 708 709 int 710 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 711 { 712 return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); 713 } 714 715 /* 716 * Return 'true' if 'gpa' is allocated in the guest address space. 717 * 718 * This function is called in the context of a running vcpu which acts as 719 * an implicit lock on 'vm->mem_maps[]'. 
720 */ 721 bool 722 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 723 { 724 struct mem_map *mm; 725 int i; 726 727 #ifdef INVARIANTS 728 int hostcpu, state; 729 state = vcpu_get_state(vm, vcpuid, &hostcpu); 730 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 731 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 732 #endif 733 734 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 735 mm = &vm->mem_maps[i]; 736 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 737 return (true); /* 'gpa' is sysmem or devmem */ 738 } 739 740 if (ppt_is_mmio(vm, gpa)) 741 return (true); /* 'gpa' is pci passthru mmio */ 742 743 return (false); 744 } 745 746 int 747 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 748 { 749 struct mem_seg *seg; 750 vm_object_t *obj; 751 752 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 753 return (EINVAL); 754 755 if (len == 0 || (len & PAGE_MASK)) 756 return (EINVAL); 757 758 seg = &vm->mem_segs[ident]; 759 if (seg->object != NULL) { 760 if (seg->len == len && seg->sysmem == sysmem) 761 return (EEXIST); 762 else 763 return (EINVAL); 764 } 765 766 obj = vm_object_mem_allocate(len, vm->mem_transient); 767 if (obj == NULL) 768 return (ENOMEM); 769 770 seg->len = len; 771 seg->object = obj; 772 seg->sysmem = sysmem; 773 return (0); 774 } 775 776 int 777 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 778 vm_object_t **objptr) 779 { 780 struct mem_seg *seg; 781 782 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 783 return (EINVAL); 784 785 seg = &vm->mem_segs[ident]; 786 if (len) 787 *len = seg->len; 788 if (sysmem) 789 *sysmem = seg->sysmem; 790 if (objptr) 791 *objptr = seg->object; 792 return (0); 793 } 794 795 void 796 vm_free_memseg(struct vm *vm, int ident) 797 { 798 struct mem_seg *seg; 799 800 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 801 ("%s: invalid memseg ident %d", __func__, ident)); 802 803 seg = &vm->mem_segs[ident]; 804 if (seg->object != NULL) { 805 vm_object_release(seg->object); 806 bzero(seg, sizeof (struct mem_seg)); 807 } 808 } 809 810 int 811 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 812 size_t len, int prot, int flags) 813 { 814 struct mem_seg *seg; 815 struct mem_map *m, *map; 816 vm_ooffset_t last; 817 int i, error; 818 819 if (prot == 0 || (prot & ~(PROT_ALL)) != 0) 820 return (EINVAL); 821 822 if (flags & ~VM_MEMMAP_F_WIRED) 823 return (EINVAL); 824 825 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 826 return (EINVAL); 827 828 seg = &vm->mem_segs[segid]; 829 if (seg->object == NULL) 830 return (EINVAL); 831 832 last = first + len; 833 if (first < 0 || first >= last || last > seg->len) 834 return (EINVAL); 835 836 if ((gpa | first | last) & PAGE_MASK) 837 return (EINVAL); 838 839 map = NULL; 840 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 841 m = &vm->mem_maps[i]; 842 if (m->len == 0) { 843 map = m; 844 break; 845 } 846 } 847 848 if (map == NULL) 849 return (ENOSPC); 850 851 error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); 852 if (error != 0) 853 return (EFAULT); 854 855 vm_object_reference(seg->object); 856 857 if ((flags & VM_MEMMAP_F_WIRED) != 0) { 858 error = vmspace_populate(vm->vmspace, gpa, gpa + len); 859 if (error != 0) { 860 VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len)); 861 return (EFAULT); 862 } 863 } 864 865 map->gpa = gpa; 866 map->len = len; 867 map->segoff = first; 868 map->segid = segid; 869 map->prot = prot; 870 map->flags = flags; 871 return (0); 872 } 873 874 int 875 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t 
len) 876 { 877 struct mem_map *m; 878 int i; 879 880 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 881 m = &vm->mem_maps[i]; 882 if (m->gpa == gpa && m->len == len && 883 (m->flags & VM_MEMMAP_F_IOMMU) == 0) { 884 vm_free_memmap(vm, i); 885 return (0); 886 } 887 } 888 889 return (EINVAL); 890 } 891 892 int 893 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 894 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 895 { 896 struct mem_map *mm, *mmnext; 897 int i; 898 899 mmnext = NULL; 900 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 901 mm = &vm->mem_maps[i]; 902 if (mm->len == 0 || mm->gpa < *gpa) 903 continue; 904 if (mmnext == NULL || mm->gpa < mmnext->gpa) 905 mmnext = mm; 906 } 907 908 if (mmnext != NULL) { 909 *gpa = mmnext->gpa; 910 if (segid) 911 *segid = mmnext->segid; 912 if (segoff) 913 *segoff = mmnext->segoff; 914 if (len) 915 *len = mmnext->len; 916 if (prot) 917 *prot = mmnext->prot; 918 if (flags) 919 *flags = mmnext->flags; 920 return (0); 921 } else { 922 return (ENOENT); 923 } 924 } 925 926 static void 927 vm_free_memmap(struct vm *vm, int ident) 928 { 929 struct mem_map *mm; 930 int error; 931 932 mm = &vm->mem_maps[ident]; 933 if (mm->len) { 934 error = vmspace_unmap(vm->vmspace, mm->gpa, 935 mm->gpa + mm->len); 936 KASSERT(error == 0, ("%s: vmspace_unmap error %d", 937 __func__, error)); 938 bzero(mm, sizeof (struct mem_map)); 939 } 940 } 941 942 static __inline bool 943 sysmem_mapping(struct vm *vm, struct mem_map *mm) 944 { 945 946 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 947 return (true); 948 else 949 return (false); 950 } 951 952 vm_paddr_t 953 vmm_sysmem_maxaddr(struct vm *vm) 954 { 955 struct mem_map *mm; 956 vm_paddr_t maxaddr; 957 int i; 958 959 maxaddr = 0; 960 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 961 mm = &vm->mem_maps[i]; 962 if (sysmem_mapping(vm, mm)) { 963 if (maxaddr < mm->gpa + mm->len) 964 maxaddr = mm->gpa + mm->len; 965 } 966 } 967 return (maxaddr); 968 } 969 970 static void 971 vm_iommu_modify(struct vm *vm, bool map) 972 { 973 int i, sz; 974 vm_paddr_t gpa, hpa; 975 struct mem_map *mm; 976 #ifdef __FreeBSD__ 977 void *vp, *cookie, *host_domain; 978 #endif 979 vm_client_t *vmc; 980 981 sz = PAGE_SIZE; 982 #ifdef __FreeBSD__ 983 host_domain = iommu_host_domain(); 984 #endif 985 vmc = vmspace_client_alloc(vm->vmspace); 986 987 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 988 mm = &vm->mem_maps[i]; 989 if (!sysmem_mapping(vm, mm)) 990 continue; 991 992 if (map) { 993 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 994 ("iommu map found invalid memmap %lx/%lx/%x", 995 mm->gpa, mm->len, mm->flags)); 996 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 997 continue; 998 mm->flags |= VM_MEMMAP_F_IOMMU; 999 } else { 1000 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 1001 continue; 1002 mm->flags &= ~VM_MEMMAP_F_IOMMU; 1003 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 1004 ("iommu unmap found invalid memmap %lx/%lx/%x", 1005 mm->gpa, mm->len, mm->flags)); 1006 } 1007 1008 gpa = mm->gpa; 1009 while (gpa < mm->gpa + mm->len) { 1010 vm_page_t *vmp; 1011 1012 vmp = vmc_hold(vmc, gpa, PROT_WRITE); 1013 ASSERT(vmp != NULL); 1014 hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); 1015 (void) vmp_release(vmp); 1016 1017 if (map) { 1018 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 1019 #ifdef __FreeBSD__ 1020 iommu_remove_mapping(host_domain, hpa, sz); 1021 #endif 1022 } else { 1023 iommu_remove_mapping(vm->iommu, gpa, sz); 1024 #ifdef __FreeBSD__ 1025 iommu_create_mapping(host_domain, hpa, hpa, sz); 1026 #endif 1027 } 1028 1029 gpa += PAGE_SIZE; 1030 } 1031 } 1032 
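	/* All sysmem mappings handled; release the temporary page-hold client. */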
vmc_destroy(vmc); 1033 1034 /* 1035 * Invalidate the cached translations associated with the domain 1036 * from which pages were removed. 1037 */ 1038 #ifdef __FreeBSD__ 1039 if (map) 1040 iommu_invalidate_tlb(host_domain); 1041 else 1042 iommu_invalidate_tlb(vm->iommu); 1043 #else 1044 iommu_invalidate_tlb(vm->iommu); 1045 #endif 1046 } 1047 1048 int 1049 vm_unassign_pptdev(struct vm *vm, int pptfd) 1050 { 1051 int error; 1052 1053 error = ppt_unassign_device(vm, pptfd); 1054 if (error) 1055 return (error); 1056 1057 if (ppt_assigned_devices(vm) == 0) 1058 vm_iommu_modify(vm, false); 1059 1060 return (0); 1061 } 1062 1063 int 1064 vm_assign_pptdev(struct vm *vm, int pptfd) 1065 { 1066 int error; 1067 vm_paddr_t maxaddr; 1068 1069 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 1070 if (ppt_assigned_devices(vm) == 0) { 1071 KASSERT(vm->iommu == NULL, 1072 ("vm_assign_pptdev: iommu must be NULL")); 1073 maxaddr = vmm_sysmem_maxaddr(vm); 1074 vm->iommu = iommu_create_domain(maxaddr); 1075 if (vm->iommu == NULL) 1076 return (ENXIO); 1077 vm_iommu_modify(vm, true); 1078 } 1079 1080 error = ppt_assign_device(vm, pptfd); 1081 return (error); 1082 } 1083 1084 int 1085 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 1086 { 1087 1088 if (vcpu < 0 || vcpu >= vm->maxcpus) 1089 return (EINVAL); 1090 1091 if (reg >= VM_REG_LAST) 1092 return (EINVAL); 1093 1094 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 1095 } 1096 1097 int 1098 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 1099 { 1100 struct vcpu *vcpu; 1101 int error; 1102 1103 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1104 return (EINVAL); 1105 1106 if (reg >= VM_REG_LAST) 1107 return (EINVAL); 1108 1109 error = VMSETREG(vm->cookie, vcpuid, reg, val); 1110 if (error || reg != VM_REG_GUEST_RIP) 1111 return (error); 1112 1113 /* Set 'nextrip' to match the value of %rip */ 1114 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val); 1115 vcpu = &vm->vcpu[vcpuid]; 1116 vcpu->nextrip = val; 1117 return (0); 1118 } 1119 1120 static bool 1121 is_descriptor_table(int reg) 1122 { 1123 switch (reg) { 1124 case VM_REG_GUEST_IDTR: 1125 case VM_REG_GUEST_GDTR: 1126 return (true); 1127 default: 1128 return (false); 1129 } 1130 } 1131 1132 static bool 1133 is_segment_register(int reg) 1134 { 1135 switch (reg) { 1136 case VM_REG_GUEST_ES: 1137 case VM_REG_GUEST_CS: 1138 case VM_REG_GUEST_SS: 1139 case VM_REG_GUEST_DS: 1140 case VM_REG_GUEST_FS: 1141 case VM_REG_GUEST_GS: 1142 case VM_REG_GUEST_TR: 1143 case VM_REG_GUEST_LDTR: 1144 return (true); 1145 default: 1146 return (false); 1147 } 1148 } 1149 1150 int 1151 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) 1152 { 1153 1154 if (vcpu < 0 || vcpu >= vm->maxcpus) 1155 return (EINVAL); 1156 1157 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1158 return (EINVAL); 1159 1160 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 1161 } 1162 1163 int 1164 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) 1165 { 1166 if (vcpu < 0 || vcpu >= vm->maxcpus) 1167 return (EINVAL); 1168 1169 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1170 return (EINVAL); 1171 1172 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 1173 } 1174 1175 static int 1176 translate_hma_xsave_result(hma_fpu_xsave_result_t res) 1177 { 1178 switch (res) { 1179 case HFXR_OK: 1180 return (0); 1181 case HFXR_NO_SPACE: 1182 return (ENOSPC); 1183 case HFXR_BAD_ALIGN: 1184 case HFXR_UNSUP_FMT: 1185 case HFXR_UNSUP_FEAT: 1186 case 
HFXR_INVALID_DATA: 1187 return (EINVAL); 1188 default: 1189 panic("unexpected xsave result"); 1190 } 1191 } 1192 1193 int 1194 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1195 { 1196 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1197 return (EINVAL); 1198 1199 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1200 hma_fpu_xsave_result_t res; 1201 1202 res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); 1203 return (translate_hma_xsave_result(res)); 1204 } 1205 1206 int 1207 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1208 { 1209 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1210 return (EINVAL); 1211 1212 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1213 hma_fpu_xsave_result_t res; 1214 1215 res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); 1216 return (translate_hma_xsave_result(res)); 1217 } 1218 1219 int 1220 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) 1221 { 1222 struct vcpu *vcpu; 1223 1224 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1225 return (EINVAL); 1226 } 1227 1228 vcpu = &vm->vcpu[vcpuid]; 1229 1230 vcpu_lock(vcpu); 1231 *state = vcpu->run_state; 1232 *sipi_vec = vcpu->sipi_vector; 1233 vcpu_unlock(vcpu); 1234 1235 return (0); 1236 } 1237 1238 int 1239 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) 1240 { 1241 struct vcpu *vcpu; 1242 1243 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1244 return (EINVAL); 1245 } 1246 if (!VRS_IS_VALID(state)) { 1247 return (EINVAL); 1248 } 1249 1250 vcpu = &vm->vcpu[vcpuid]; 1251 1252 vcpu_lock(vcpu); 1253 vcpu->run_state = state; 1254 vcpu->sipi_vector = sipi_vec; 1255 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1256 vcpu_unlock(vcpu); 1257 1258 return (0); 1259 } 1260 1261 void 1262 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) 1263 { 1264 vmspace_t *vms = vm_get_vmspace(vm); 1265 vmspace_track_dirty(vms, gpa, len, bitmap); 1266 } 1267 1268 static void 1269 restore_guest_fpustate(struct vcpu *vcpu) 1270 { 1271 /* Save host FPU and restore guest FPU */ 1272 fpu_stop_emulating(); 1273 hma_fpu_start_guest(vcpu->guestfpu); 1274 1275 /* restore guest XCR0 if XSAVE is enabled in the host */ 1276 if (rcr4() & CR4_XSAVE) 1277 load_xcr(0, vcpu->guest_xcr0); 1278 1279 /* 1280 * The FPU is now "dirty" with the guest's state so turn on emulation 1281 * to trap any access to the FPU by the host. 1282 */ 1283 fpu_start_emulating(); 1284 } 1285 1286 static void 1287 save_guest_fpustate(struct vcpu *vcpu) 1288 { 1289 1290 if ((rcr0() & CR0_TS) == 0) 1291 panic("fpu emulation not enabled in host!"); 1292 1293 /* save guest XCR0 and restore host XCR0 */ 1294 if (rcr4() & CR4_XSAVE) { 1295 vcpu->guest_xcr0 = rxcr(0); 1296 load_xcr(0, vmm_get_host_xcr0()); 1297 } 1298 1299 /* save guest FPU and restore host FPU */ 1300 fpu_stop_emulating(); 1301 hma_fpu_stop_guest(vcpu->guestfpu); 1302 /* 1303 * When the host state has been restored, we should not re-enable 1304 * CR0.TS on illumos for eager FPU. 1305 */ 1306 } 1307 1308 static int 1309 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1310 bool from_idle) 1311 { 1312 struct vcpu *vcpu; 1313 int error; 1314 1315 vcpu = &vm->vcpu[vcpuid]; 1316 vcpu_assert_locked(vcpu); 1317 1318 /* 1319 * State transitions from the vmmdev_ioctl() must always begin from 1320 * the VCPU_IDLE state. This guarantees that there is only a single 1321 * ioctl() operating on a vcpu at any point. 
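	 * (Callers not entering via an ioctl pass from_idle == false and must
	 * already hold the vcpu out of the IDLE state.)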
1322 */ 1323 if (from_idle) { 1324 while (vcpu->state != VCPU_IDLE) { 1325 vcpu->reqidle = 1; 1326 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1327 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " 1328 "idle requested", vcpu_state2str(vcpu->state)); 1329 cv_wait(&vcpu->state_cv, &vcpu->lock); 1330 } 1331 } else { 1332 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1333 "vcpu idle state")); 1334 } 1335 1336 if (vcpu->state == VCPU_RUNNING) { 1337 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1338 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1339 } else { 1340 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1341 "vcpu that is not running", vcpu->hostcpu)); 1342 } 1343 1344 /* 1345 * The following state transitions are allowed: 1346 * IDLE -> FROZEN -> IDLE 1347 * FROZEN -> RUNNING -> FROZEN 1348 * FROZEN -> SLEEPING -> FROZEN 1349 */ 1350 switch (vcpu->state) { 1351 case VCPU_IDLE: 1352 case VCPU_RUNNING: 1353 case VCPU_SLEEPING: 1354 error = (newstate != VCPU_FROZEN); 1355 break; 1356 case VCPU_FROZEN: 1357 error = (newstate == VCPU_FROZEN); 1358 break; 1359 default: 1360 error = 1; 1361 break; 1362 } 1363 1364 if (error) 1365 return (EBUSY); 1366 1367 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", 1368 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1369 1370 vcpu->state = newstate; 1371 if (newstate == VCPU_RUNNING) 1372 vcpu->hostcpu = curcpu; 1373 else 1374 vcpu->hostcpu = NOCPU; 1375 1376 if (newstate == VCPU_IDLE) { 1377 cv_broadcast(&vcpu->state_cv); 1378 } 1379 1380 return (0); 1381 } 1382 1383 static void 1384 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1385 { 1386 int error; 1387 1388 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1389 panic("Error %d setting state to %d\n", error, newstate); 1390 } 1391 1392 static void 1393 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1394 { 1395 int error; 1396 1397 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1398 panic("Error %d setting state to %d", error, newstate); 1399 } 1400 1401 /* 1402 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1403 */ 1404 static int 1405 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) 1406 { 1407 struct vcpu *vcpu; 1408 int vcpu_halted, vm_halted; 1409 bool userspace_exit = false; 1410 1411 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1412 1413 vcpu = &vm->vcpu[vcpuid]; 1414 vcpu_halted = 0; 1415 vm_halted = 0; 1416 1417 vcpu_lock(vcpu); 1418 while (1) { 1419 /* 1420 * Do a final check for pending interrupts (including NMI and 1421 * INIT) before putting this thread to sleep. 1422 */ 1423 if (vm_nmi_pending(vm, vcpuid)) 1424 break; 1425 if (vcpu_run_state_pending(vm, vcpuid)) 1426 break; 1427 if (!intr_disabled) { 1428 if (vm_extint_pending(vm, vcpuid) || 1429 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1430 break; 1431 } 1432 } 1433 1434 /* 1435 * Also check for software events which would cause a wake-up. 1436 * This will set the appropriate exitcode directly, rather than 1437 * requiring a trip through VM_RUN(). 1438 */ 1439 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1440 userspace_exit = true; 1441 break; 1442 } 1443 1444 /* 1445 * Some Linux guests implement "halt" by having all vcpus 1446 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1447 * track of the vcpus that have entered this state. 
When all 1448 * vcpus enter the halted state the virtual machine is halted. 1449 */ 1450 if (intr_disabled) { 1451 if (!vcpu_halted && halt_detection_enabled) { 1452 vcpu_halted = 1; 1453 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1454 } 1455 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1456 vm_halted = 1; 1457 break; 1458 } 1459 } 1460 1461 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1462 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1463 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1464 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1465 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1466 } 1467 1468 if (vcpu_halted) 1469 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1470 1471 vcpu_unlock(vcpu); 1472 1473 if (vm_halted) { 1474 (void) vm_suspend(vm, VM_SUSPEND_HALT); 1475 } 1476 1477 return (userspace_exit ? -1 : 0); 1478 } 1479 1480 static int 1481 vm_handle_paging(struct vm *vm, int vcpuid) 1482 { 1483 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1484 vm_client_t *vmc = vcpu->vmclient; 1485 struct vm_exit *vme = &vcpu->exitinfo; 1486 int rv, ftype; 1487 1488 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1489 __func__, vme->inst_length)); 1490 1491 ftype = vme->u.paging.fault_type; 1492 KASSERT(ftype == PROT_READ || 1493 ftype == PROT_WRITE || ftype == PROT_EXEC, 1494 ("vm_handle_paging: invalid fault_type %d", ftype)); 1495 1496 rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); 1497 1498 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, " 1499 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1500 1501 if (rv != 0) 1502 return (EFAULT); 1503 return (0); 1504 } 1505 1506 int 1507 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, 1508 int rsize) 1509 { 1510 int err = ESRCH; 1511 1512 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1513 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1514 1515 err = vlapic_mmio_read(vlapic, gpa, rval, rsize); 1516 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1517 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); 1518 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1519 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); 1520 } 1521 1522 return (err); 1523 } 1524 1525 int 1526 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, 1527 int wsize) 1528 { 1529 int err = ESRCH; 1530 1531 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1532 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1533 1534 err = vlapic_mmio_write(vlapic, gpa, wval, wsize); 1535 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1536 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); 1537 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1538 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); 1539 } 1540 1541 return (err); 1542 } 1543 1544 static int 1545 vm_handle_mmio_emul(struct vm *vm, int vcpuid) 1546 { 1547 struct vie *vie; 1548 struct vcpu *vcpu; 1549 struct vm_exit *vme; 1550 uint64_t inst_addr; 1551 int error, fault, cs_d; 1552 1553 vcpu = &vm->vcpu[vcpuid]; 1554 vme = &vcpu->exitinfo; 1555 vie = vcpu->vie_ctx; 1556 1557 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1558 __func__, vme->inst_length)); 1559 1560 inst_addr = vme->rip + vme->u.mmio_emul.cs_base; 1561 cs_d = vme->u.mmio_emul.cs_d; 1562 1563 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx", 1564 vme->u.mmio_emul.gpa); 1565 1566 /* Fetch the faulting instruction */ 1567 if 
(vie_needs_fetch(vie)) { 1568 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, 1569 &fault); 1570 if (error != 0) { 1571 return (error); 1572 } else if (fault) { 1573 /* 1574 * If a fault during instruction fetch was encountered, 1575 * it will have asserted that the appropriate exception 1576 * be injected at next entry. 1577 * No further work is required. 1578 */ 1579 return (0); 1580 } 1581 } 1582 1583 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1584 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx", 1585 inst_addr); 1586 /* Dump (unrecognized) instruction bytes in userspace */ 1587 vie_fallback_exitinfo(vie, vme); 1588 return (-1); 1589 } 1590 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && 1591 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { 1592 /* Decoded GLA does not match GLA from VM exit state */ 1593 vie_fallback_exitinfo(vie, vme); 1594 return (-1); 1595 } 1596 1597 repeat: 1598 error = vie_emulate_mmio(vie, vm, vcpuid); 1599 if (error < 0) { 1600 /* 1601 * MMIO not handled by any of the in-kernel-emulated devices, so 1602 * make a trip out to userspace for it. 1603 */ 1604 vie_exitinfo(vie, vme); 1605 } else if (error == EAGAIN) { 1606 /* 1607 * Continue emulating the rep-prefixed instruction, which has 1608 * not completed its iterations. 1609 * 1610 * In case this can be emulated in-kernel and has a high 1611 * repetition count (causing a tight spin), it should be 1612 * deferential to yield conditions. 1613 */ 1614 if (!vcpu_should_yield(vm, vcpuid)) { 1615 goto repeat; 1616 } else { 1617 /* 1618 * Defer to the contending load by making a trip to 1619 * userspace with a no-op (BOGUS) exit reason. 1620 */ 1621 vie_reset(vie); 1622 vme->exitcode = VM_EXITCODE_BOGUS; 1623 return (-1); 1624 } 1625 } else if (error == 0) { 1626 /* Update %rip now that instruction has been emulated */ 1627 vie_advance_pc(vie, &vcpu->nextrip); 1628 } 1629 return (error); 1630 } 1631 1632 static int 1633 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) 1634 { 1635 struct vcpu *vcpu; 1636 struct vie *vie; 1637 int err; 1638 1639 vcpu = &vm->vcpu[vcpuid]; 1640 vie = vcpu->vie_ctx; 1641 1642 repeat: 1643 err = vie_emulate_inout(vie, vm, vcpuid); 1644 1645 if (err < 0) { 1646 /* 1647 * In/out not handled by any of the in-kernel-emulated devices, 1648 * so make a trip out to userspace for it. 1649 */ 1650 vie_exitinfo(vie, vme); 1651 return (err); 1652 } else if (err == EAGAIN) { 1653 /* 1654 * Continue emulating the rep-prefixed ins/outs, which has not 1655 * completed its iterations. 1656 * 1657 * In case this can be emulated in-kernel and has a high 1658 * repetition count (causing a tight spin), it should be 1659 * deferential to yield conditions. 1660 */ 1661 if (!vcpu_should_yield(vm, vcpuid)) { 1662 goto repeat; 1663 } else { 1664 /* 1665 * Defer to the contending load by making a trip to 1666 * userspace with a no-op (BOGUS) exit reason. 1667 */ 1668 vie_reset(vie); 1669 vme->exitcode = VM_EXITCODE_BOGUS; 1670 return (-1); 1671 } 1672 } else if (err != 0) { 1673 /* Emulation failure. Bail all the way out to userspace. 
		 */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry.  No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
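	 * (The -1 return below propagates out of vm_run(), leaving the
	 * SUSPENDED exit information intact for userspace.)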
1781 */ 1782 for (i = 0; i < vm->maxcpus; i++) { 1783 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1784 vcpu_notify_event(vm, i); 1785 } 1786 } 1787 1788 return (-1); 1789 } 1790 1791 static int 1792 vm_handle_reqidle(struct vm *vm, int vcpuid) 1793 { 1794 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1795 1796 vcpu_lock(vcpu); 1797 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1798 vcpu->reqidle = 0; 1799 vcpu_unlock(vcpu); 1800 return (-1); 1801 } 1802 1803 static int 1804 vm_handle_run_state(struct vm *vm, int vcpuid) 1805 { 1806 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1807 bool handled = false; 1808 1809 vcpu_lock(vcpu); 1810 while (1) { 1811 if ((vcpu->run_state & VRS_PEND_INIT) != 0) { 1812 vcpu_unlock(vcpu); 1813 VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); 1814 vcpu_lock(vcpu); 1815 1816 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); 1817 vcpu->run_state |= VRS_INIT; 1818 } 1819 1820 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == 1821 (VRS_INIT | VRS_PEND_SIPI)) { 1822 const uint8_t vector = vcpu->sipi_vector; 1823 1824 vcpu_unlock(vcpu); 1825 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); 1826 vcpu_lock(vcpu); 1827 1828 vcpu->run_state &= ~VRS_PEND_SIPI; 1829 vcpu->run_state |= VRS_RUN; 1830 } 1831 1832 /* 1833 * If the vCPU is now in the running state, there is no need to 1834 * wait for anything prior to re-entry. 1835 */ 1836 if ((vcpu->run_state & VRS_RUN) != 0) { 1837 handled = true; 1838 break; 1839 } 1840 1841 /* 1842 * Also check for software events which would cause a wake-up. 1843 * This will set the appropriate exitcode directly, rather than 1844 * requiring a trip through VM_RUN(). 1845 */ 1846 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1847 break; 1848 } 1849 1850 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1851 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1852 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1853 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1854 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1855 } 1856 vcpu_unlock(vcpu); 1857 1858 return (handled ? 0 : -1); 1859 } 1860 1861 static int 1862 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1863 { 1864 const uint32_t code = vme->u.msr.code; 1865 uint64_t val = 0; 1866 1867 switch (code) { 1868 case MSR_MCG_CAP: 1869 case MSR_MCG_STATUS: 1870 val = 0; 1871 break; 1872 1873 case MSR_MTRRcap: 1874 case MSR_MTRRdefType: 1875 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: 1876 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1877 case MSR_MTRR64kBase: 1878 val = 0; 1879 break; 1880 1881 case MSR_TSC: 1882 /* 1883 * In all likelihood, this should always be handled in guest 1884 * context by VMX/SVM rather than taking an exit. (Both VMX and 1885 * SVM pass through read-only access to MSR_TSC to the guest.) 1886 * 1887 * No physical offset is requested of vcpu_tsc_offset() since 1888 * rdtsc_offset() takes care of that instead. 1889 */ 1890 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); 1891 break; 1892 1893 default: 1894 /* 1895 * Anything not handled at this point will be kicked out to 1896 * userspace for attempted processing there. 
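		 * (The -1 return propagates the RDMSR exit to userspace
		 * unchanged.)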
1897 */ 1898 return (-1); 1899 } 1900 1901 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 1902 val & 0xffffffff)); 1903 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 1904 val >> 32)); 1905 return (0); 1906 } 1907 1908 static int 1909 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1910 { 1911 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1912 const uint32_t code = vme->u.msr.code; 1913 const uint64_t val = vme->u.msr.wval; 1914 1915 switch (code) { 1916 case MSR_MCG_CAP: 1917 case MSR_MCG_STATUS: 1918 /* Ignore writes */ 1919 break; 1920 1921 case MSR_MTRRcap: 1922 vm_inject_gp(vm, vcpuid); 1923 break; 1924 case MSR_MTRRdefType: 1925 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: 1926 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1927 case MSR_MTRR64kBase: 1928 /* Ignore writes */ 1929 break; 1930 1931 case MSR_TSC: 1932 /* 1933 * The effect of writing the TSC MSR is that a subsequent read 1934 * of the TSC would report that value written (plus any time 1935 * elapsed between the write and the read). The guest TSC value 1936 * is calculated from a global offset for the guest (which 1937 * effectively makes its TSC read 0 at guest boot) and a 1938 * per-vCPU offset to handle these writes to the MSR. 1939 * 1940 * To calculate that per-vCPU offset, we can work backwards from 1941 * the guest value at the time of write: 1942 * 1943 * value = host TSC + VM boot offset + vCPU offset 1944 * 1945 * so therefore: 1946 * 1947 * value - host TSC - VM boot offset = vCPU offset 1948 */ 1949 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 1950 break; 1951 1952 default: 1953 /* 1954 * Anything not handled at this point will be kicked out to 1955 * userspace for attempted processing there. 1956 */ 1957 return (-1); 1958 } 1959 1960 return (0); 1961 } 1962 1963 int 1964 vm_suspend(struct vm *vm, enum vm_suspend_how how) 1965 { 1966 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1967 return (EINVAL); 1968 1969 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 1970 return (EALREADY); 1971 } 1972 1973 /* 1974 * Notify all active vcpus that they are now suspended. 1975 */ 1976 for (uint_t i = 0; i < vm->maxcpus; i++) { 1977 struct vcpu *vcpu = &vm->vcpu[i]; 1978 1979 vcpu_lock(vcpu); 1980 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 1981 /* 1982 * Any vCPUs not actively running or in HLT can be 1983 * marked as suspended immediately. 1984 */ 1985 if (CPU_ISSET(i, &vm->active_cpus)) { 1986 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 1987 } 1988 } else { 1989 /* 1990 * Those which are running or in HLT will pick up the 1991 * suspended state after notification. 1992 */ 1993 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1994 } 1995 vcpu_unlock(vcpu); 1996 } 1997 return (0); 1998 } 1999 2000 void 2001 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2002 { 2003 struct vm_exit *vmexit; 2004 2005 vmexit = vm_exitinfo(vm, vcpuid); 2006 vmexit->rip = rip; 2007 vmexit->inst_length = 0; 2008 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2009 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2010 } 2011 2012 /* 2013 * Some vmm resources, such as the lapic, may have CPU-specific resources 2014 * allocated to them which would benefit from migration onto the host CPU which 2015 * is processing the vcpu state. 
2016 */ 2017 static void 2018 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2019 { 2020 /* 2021 * Localizing cyclic resources requires acquisition of cpu_lock, and 2022 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2023 */ 2024 VERIFY(curthread->t_preempt == 0); 2025 2026 /* 2027 * Do not bother with localization if this vCPU is about to return to 2028 * the host CPU it was last localized to. 2029 */ 2030 if (vcpu->lastloccpu == curcpu) 2031 return; 2032 2033 /* 2034 * Localize system-wide resources to the primary boot vCPU. While any 2035 * of the other vCPUs may access them, it keeps the potential interrupt 2036 * footprint constrained to CPUs involved with this instance. 2037 */ 2038 if (vcpu == &vm->vcpu[0]) { 2039 vhpet_localize_resources(vm->vhpet); 2040 vrtc_localize_resources(vm->vrtc); 2041 vatpit_localize_resources(vm->vatpit); 2042 } 2043 2044 vlapic_localize_resources(vcpu->vlapic); 2045 2046 vcpu->lastloccpu = curcpu; 2047 } 2048 2049 static void 2050 vmm_savectx(void *arg) 2051 { 2052 vm_thread_ctx_t *vtc = arg; 2053 struct vm *vm = vtc->vtc_vm; 2054 const int vcpuid = vtc->vtc_vcpuid; 2055 2056 if (ops->vmsavectx != NULL) { 2057 ops->vmsavectx(vm->cookie, vcpuid); 2058 } 2059 2060 /* 2061 * Account for going off-cpu, unless the vCPU is idled, where being 2062 * off-cpu is the explicit point. 2063 */ 2064 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2065 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2066 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2067 } 2068 2069 /* 2070 * If the CPU holds the restored guest FPU state, save it and restore 2071 * the host FPU state before this thread goes off-cpu. 2072 */ 2073 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2074 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2075 2076 save_guest_fpustate(vcpu); 2077 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2078 } 2079 } 2080 2081 static void 2082 vmm_restorectx(void *arg) 2083 { 2084 vm_thread_ctx_t *vtc = arg; 2085 struct vm *vm = vtc->vtc_vm; 2086 const int vcpuid = vtc->vtc_vcpuid; 2087 2088 /* Complete microstate accounting for vCPU being off-cpu */ 2089 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2090 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2091 } 2092 2093 /* 2094 * When coming back on-cpu, only restore the guest FPU status if the 2095 * thread is in a context marked as requiring it. This should be rare, 2096 * occurring only when a future logic error results in a voluntary 2097 * sleep during the VMRUN critical section. 2098 * 2099 * The common case will result in elision of the guest FPU state 2100 * restoration, deferring that action until it is clearly necessary 2101 * during vm_run. 
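	 * (vm_run() performs that eager guest-FPU restore itself after
	 * entering its critical section.)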
2102 */ 2103 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2104 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2105 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2106 2107 restore_guest_fpustate(vcpu); 2108 vtc->vtc_status |= VTCS_FPU_RESTORED; 2109 } 2110 2111 if (ops->vmrestorectx != NULL) { 2112 ops->vmrestorectx(vm->cookie, vcpuid); 2113 } 2114 2115 } 2116 2117 static int 2118 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2119 struct vm_exit *vme) 2120 { 2121 struct vcpu *vcpu; 2122 struct vie *vie; 2123 int err; 2124 2125 vcpu = &vm->vcpu[vcpuid]; 2126 vie = vcpu->vie_ctx; 2127 err = 0; 2128 2129 switch (entry->cmd) { 2130 case VEC_DEFAULT: 2131 return (0); 2132 case VEC_DISCARD_INSTR: 2133 vie_reset(vie); 2134 return (0); 2135 case VEC_FULFILL_MMIO: 2136 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2137 if (err == 0) { 2138 err = vie_emulate_mmio(vie, vm, vcpuid); 2139 if (err == 0) { 2140 vie_advance_pc(vie, &vcpu->nextrip); 2141 } else if (err < 0) { 2142 vie_exitinfo(vie, vme); 2143 } else if (err == EAGAIN) { 2144 /* 2145 * Clear the instruction emulation state in 2146 * order to re-enter VM context and continue 2147 * this 'rep <instruction>' 2148 */ 2149 vie_reset(vie); 2150 err = 0; 2151 } 2152 } 2153 break; 2154 case VEC_FULFILL_INOUT: 2155 err = vie_fulfill_inout(vie, &entry->u.inout); 2156 if (err == 0) { 2157 err = vie_emulate_inout(vie, vm, vcpuid); 2158 if (err == 0) { 2159 vie_advance_pc(vie, &vcpu->nextrip); 2160 } else if (err < 0) { 2161 vie_exitinfo(vie, vme); 2162 } else if (err == EAGAIN) { 2163 /* 2164 * Clear the instruction emulation state in 2165 * order to re-enter VM context and continue 2166 * this 'rep ins/outs' 2167 */ 2168 vie_reset(vie); 2169 err = 0; 2170 } 2171 } 2172 break; 2173 default: 2174 return (EINVAL); 2175 } 2176 return (err); 2177 } 2178 2179 static int 2180 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2181 { 2182 struct vie *vie; 2183 2184 vie = vm->vcpu[vcpuid].vie_ctx; 2185 2186 if (vie_pending(vie)) { 2187 /* 2188 * Userspace has not fulfilled the pending needs of the 2189 * instruction emulation, so bail back out. 2190 */ 2191 vie_exitinfo(vie, vme); 2192 return (-1); 2193 } 2194 2195 return (0); 2196 } 2197 2198 int 2199 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2200 { 2201 int error; 2202 struct vcpu *vcpu; 2203 struct vm_exit *vme; 2204 bool intr_disabled; 2205 int affinity_type = CPU_CURRENT; 2206 2207 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2208 return (EINVAL); 2209 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2210 return (EINVAL); 2211 2212 vcpu = &vm->vcpu[vcpuid]; 2213 vme = &vcpu->exitinfo; 2214 2215 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2216 2217 vcpu->vtc.vtc_status = 0; 2218 ctxop_attach(curthread, vcpu->ctxop); 2219 2220 error = vm_entry_actions(vm, vcpuid, entry, vme); 2221 if (error != 0) { 2222 goto exit; 2223 } 2224 2225 restart: 2226 error = vm_loop_checks(vm, vcpuid, vme); 2227 if (error != 0) { 2228 goto exit; 2229 } 2230 2231 thread_affinity_set(curthread, affinity_type); 2232 /* 2233 * Resource localization should happen after the CPU affinity for the 2234 * thread has been set to ensure that access from restricted contexts, 2235 * such as VMX-accelerated APIC operations, can occur without inducing 2236 * cyclic cross-calls. 2237 * 2238 * This must be done prior to disabling kpreempt via critical_enter(). 
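 * (vm_localize_resources() itself asserts that preemption has not yet been
 * disabled.)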
2239 */ 2240 vm_localize_resources(vm, vcpu); 2241 affinity_type = CPU_CURRENT; 2242 critical_enter(); 2243 2244 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2245 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2246 2247 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2248 restore_guest_fpustate(vcpu); 2249 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2250 } 2251 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2252 2253 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2254 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2255 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2256 2257 /* 2258 * Once clear of the delicate contexts comprising the VM_RUN handler, 2259 * thread CPU affinity can be loosened while other processing occurs. 2260 */ 2261 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2262 thread_affinity_clear(curthread); 2263 critical_exit(); 2264 2265 if (error != 0) { 2266 /* Communicate out any error from VMRUN() above */ 2267 goto exit; 2268 } 2269 2270 vcpu->nextrip = vme->rip + vme->inst_length; 2271 switch (vme->exitcode) { 2272 case VM_EXITCODE_REQIDLE: 2273 error = vm_handle_reqidle(vm, vcpuid); 2274 break; 2275 case VM_EXITCODE_RUN_STATE: 2276 error = vm_handle_run_state(vm, vcpuid); 2277 break; 2278 case VM_EXITCODE_SUSPENDED: 2279 error = vm_handle_suspend(vm, vcpuid); 2280 break; 2281 case VM_EXITCODE_IOAPIC_EOI: 2282 vioapic_process_eoi(vm, vcpuid, 2283 vme->u.ioapic_eoi.vector); 2284 break; 2285 case VM_EXITCODE_HLT: 2286 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2287 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2288 break; 2289 case VM_EXITCODE_PAGING: 2290 error = vm_handle_paging(vm, vcpuid); 2291 break; 2292 case VM_EXITCODE_MMIO_EMUL: 2293 error = vm_handle_mmio_emul(vm, vcpuid); 2294 break; 2295 case VM_EXITCODE_INOUT: 2296 error = vm_handle_inout(vm, vcpuid, vme); 2297 break; 2298 case VM_EXITCODE_INST_EMUL: 2299 error = vm_handle_inst_emul(vm, vcpuid); 2300 break; 2301 case VM_EXITCODE_MONITOR: 2302 case VM_EXITCODE_MWAIT: 2303 case VM_EXITCODE_VMINSN: 2304 vm_inject_ud(vm, vcpuid); 2305 break; 2306 case VM_EXITCODE_RDMSR: 2307 error = vm_handle_rdmsr(vm, vcpuid, vme); 2308 break; 2309 case VM_EXITCODE_WRMSR: 2310 error = vm_handle_wrmsr(vm, vcpuid, vme); 2311 break; 2312 case VM_EXITCODE_HT: 2313 affinity_type = CPU_BEST; 2314 break; 2315 case VM_EXITCODE_MTRAP: 2316 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2317 error = -1; 2318 break; 2319 default: 2320 /* handled in userland */ 2321 error = -1; 2322 break; 2323 } 2324 2325 if (error == 0) { 2326 /* VM exit conditions handled in-kernel, continue running */ 2327 goto restart; 2328 } 2329 2330 exit: 2331 kpreempt_disable(); 2332 ctxop_detach(curthread, vcpu->ctxop); 2333 /* Make sure all of the needed vCPU context state is saved */ 2334 vmm_savectx(&vcpu->vtc); 2335 kpreempt_enable(); 2336 2337 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); 2338 2339 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2340 return (error); 2341 } 2342 2343 int 2344 vm_restart_instruction(void *arg, int vcpuid) 2345 { 2346 struct vm *vm; 2347 struct vcpu *vcpu; 2348 enum vcpu_state state; 2349 uint64_t rip; 2350 int error; 2351 2352 vm = arg; 2353 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2354 return (EINVAL); 2355 2356 vcpu = &vm->vcpu[vcpuid]; 2357 state = vcpu_get_state(vm, vcpuid, NULL); 2358 if (state == VCPU_RUNNING) { 2359 /* 2360 * When a vcpu is "running" the next instruction is determined 2361 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
2362 * Thus setting 'inst_length' to zero will cause the current 2363 * instruction to be restarted. 2364 */ 2365 vcpu->exitinfo.inst_length = 0; 2366 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by " 2367 "setting inst_length to zero", vcpu->exitinfo.rip); 2368 } else if (state == VCPU_FROZEN) { 2369 /* 2370 * When a vcpu is "frozen" it is outside the critical section 2371 * around VMRUN() and 'nextrip' points to the next instruction. 2372 * Thus instruction restart is achieved by setting 'nextrip' 2373 * to the vcpu's %rip. 2374 */ 2375 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2376 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2377 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " 2378 "nextrip from %lx to %lx", vcpu->nextrip, rip); 2379 vcpu->nextrip = rip; 2380 } else { 2381 panic("%s: invalid state %d", __func__, state); 2382 } 2383 return (0); 2384 } 2385 2386 int 2387 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2388 { 2389 struct vcpu *vcpu; 2390 int type, vector; 2391 2392 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2393 return (EINVAL); 2394 2395 vcpu = &vm->vcpu[vcpuid]; 2396 2397 if (info & VM_INTINFO_VALID) { 2398 type = info & VM_INTINFO_TYPE; 2399 vector = info & 0xff; 2400 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2401 return (EINVAL); 2402 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 2403 return (EINVAL); 2404 if (info & VM_INTINFO_RSVD) 2405 return (EINVAL); 2406 } else { 2407 info = 0; 2408 } 2409 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info); 2410 vcpu->exitintinfo = info; 2411 return (0); 2412 } 2413 2414 enum exc_class { 2415 EXC_BENIGN, 2416 EXC_CONTRIBUTORY, 2417 EXC_PAGEFAULT 2418 }; 2419 2420 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2421 2422 static enum exc_class 2423 exception_class(uint64_t info) 2424 { 2425 int type, vector; 2426 2427 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); 2428 type = info & VM_INTINFO_TYPE; 2429 vector = info & 0xff; 2430 2431 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2432 switch (type) { 2433 case VM_INTINFO_HWINTR: 2434 case VM_INTINFO_SWINTR: 2435 case VM_INTINFO_NMI: 2436 return (EXC_BENIGN); 2437 default: 2438 /* 2439 * Hardware exception. 2440 * 2441 * SVM and VT-x use identical type values to represent NMI, 2442 * hardware interrupt and software interrupt. 2443 * 2444 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2445 * for exceptions except #BP and #OF. #BP and #OF use a type 2446 * value of '5' or '6'. Therefore we don't check for explicit 2447 * values of 'type' to classify 'intinfo' into a hardware 2448 * exception. 2449 */ 2450 break; 2451 } 2452 2453 switch (vector) { 2454 case IDT_PF: 2455 case IDT_VE: 2456 return (EXC_PAGEFAULT); 2457 case IDT_DE: 2458 case IDT_TS: 2459 case IDT_NP: 2460 case IDT_SS: 2461 case IDT_GP: 2462 return (EXC_CONTRIBUTORY); 2463 default: 2464 return (EXC_BENIGN); 2465 } 2466 } 2467 2468 static int 2469 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 2470 uint64_t *retinfo) 2471 { 2472 enum exc_class exc1, exc2; 2473 int type1, vector1; 2474 2475 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); 2476 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); 2477 2478 /* 2479 * If an exception occurs while attempting to call the double-fault 2480 * handler the processor enters shutdown mode (aka triple fault). 
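 * That condition is detected here as a pending #DF in 'info1' and is handled
 * by suspending the VM with VM_SUSPEND_TRIPLEFAULT.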
2481 */ 2482 type1 = info1 & VM_INTINFO_TYPE; 2483 vector1 = info1 & 0xff; 2484 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 2485 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)", 2486 info1, info2); 2487 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2488 *retinfo = 0; 2489 return (0); 2490 } 2491 2492 /* 2493 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 2494 */ 2495 exc1 = exception_class(info1); 2496 exc2 = exception_class(info2); 2497 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2498 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2499 /* Convert nested fault into a double fault. */ 2500 *retinfo = IDT_DF; 2501 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 2502 *retinfo |= VM_INTINFO_DEL_ERRCODE; 2503 } else { 2504 /* Handle exceptions serially */ 2505 *retinfo = info2; 2506 } 2507 return (1); 2508 } 2509 2510 static uint64_t 2511 vcpu_exception_intinfo(struct vcpu *vcpu) 2512 { 2513 uint64_t info = 0; 2514 2515 if (vcpu->exception_pending) { 2516 info = vcpu->exc_vector & 0xff; 2517 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 2518 if (vcpu->exc_errcode_valid) { 2519 info |= VM_INTINFO_DEL_ERRCODE; 2520 info |= (uint64_t)vcpu->exc_errcode << 32; 2521 } 2522 } 2523 return (info); 2524 } 2525 2526 int 2527 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2528 { 2529 struct vcpu *vcpu; 2530 uint64_t info1, info2; 2531 int valid; 2532 2533 KASSERT(vcpuid >= 0 && 2534 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); 2535 2536 vcpu = &vm->vcpu[vcpuid]; 2537 2538 info1 = vcpu->exitintinfo; 2539 vcpu->exitintinfo = 0; 2540 2541 info2 = 0; 2542 if (vcpu->exception_pending) { 2543 info2 = vcpu_exception_intinfo(vcpu); 2544 vcpu->exception_pending = 0; 2545 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx", 2546 vcpu->exc_vector, info2); 2547 } 2548 2549 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 2550 valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 2551 } else if (info1 & VM_INTINFO_VALID) { 2552 *retinfo = info1; 2553 valid = 1; 2554 } else if (info2 & VM_INTINFO_VALID) { 2555 *retinfo = info2; 2556 valid = 1; 2557 } else { 2558 valid = 0; 2559 } 2560 2561 if (valid) { 2562 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), " 2563 "retinfo(%lx)", __func__, info1, info2, *retinfo); 2564 } 2565 2566 return (valid); 2567 } 2568 2569 int 2570 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2571 { 2572 struct vcpu *vcpu; 2573 2574 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2575 return (EINVAL); 2576 2577 vcpu = &vm->vcpu[vcpuid]; 2578 *info1 = vcpu->exitintinfo; 2579 *info2 = vcpu_exception_intinfo(vcpu); 2580 return (0); 2581 } 2582 2583 int 2584 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, 2585 uint32_t errcode, int restart_instruction) 2586 { 2587 struct vcpu *vcpu; 2588 uint64_t regval; 2589 int error; 2590 2591 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2592 return (EINVAL); 2593 2594 if (vector < 0 || vector >= 32) 2595 return (EINVAL); 2596 2597 /* 2598 * NMIs (which bear an exception vector of 2) are to be injected via 2599 * their own specialized path using vm_inject_nmi(). 2600 */ 2601 if (vector == 2) { 2602 return (EINVAL); 2603 } 2604 2605 /* 2606 * A double fault exception should never be injected directly into 2607 * the guest. It is a derived exception that results from specific 2608 * combinations of nested faults. 
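 * (See nested_fault(), which synthesizes the #DF in those cases.)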
2609 */ 2610 if (vector == IDT_DF) 2611 return (EINVAL); 2612 2613 vcpu = &vm->vcpu[vcpuid]; 2614 2615 if (vcpu->exception_pending) { 2616 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 2617 "pending exception %d", vector, vcpu->exc_vector); 2618 return (EBUSY); 2619 } 2620 2621 if (errcode_valid) { 2622 /* 2623 * Exceptions don't deliver an error code in real mode. 2624 */ 2625 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); 2626 KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); 2627 if (!(regval & CR0_PE)) 2628 errcode_valid = 0; 2629 } 2630 2631 /* 2632 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2633 * 2634 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2635 * one instruction or incurs an exception. 2636 */ 2637 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2638 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 2639 __func__, error)); 2640 2641 if (restart_instruction) { 2642 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2643 } 2644 2645 vcpu->exception_pending = 1; 2646 vcpu->exc_vector = vector; 2647 vcpu->exc_errcode = errcode; 2648 vcpu->exc_errcode_valid = errcode_valid; 2649 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); 2650 return (0); 2651 } 2652 2653 void 2654 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid, 2655 int errcode) 2656 { 2657 int error; 2658 2659 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, 2660 errcode, 1); 2661 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 2662 } 2663 2664 void 2665 vm_inject_ud(struct vm *vm, int vcpuid) 2666 { 2667 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); 2668 } 2669 2670 void 2671 vm_inject_gp(struct vm *vm, int vcpuid) 2672 { 2673 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); 2674 } 2675 2676 void 2677 vm_inject_ac(struct vm *vm, int vcpuid, int errcode) 2678 { 2679 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); 2680 } 2681 2682 void 2683 vm_inject_ss(struct vm *vm, int vcpuid, int errcode) 2684 { 2685 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); 2686 } 2687 2688 void 2689 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) 2690 { 2691 int error; 2692 2693 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx", 2694 error_code, cr2); 2695 2696 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 2697 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 2698 2699 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 2700 } 2701 2702 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2703 2704 int 2705 vm_inject_nmi(struct vm *vm, int vcpuid) 2706 { 2707 struct vcpu *vcpu; 2708 2709 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2710 return (EINVAL); 2711 2712 vcpu = &vm->vcpu[vcpuid]; 2713 2714 vcpu->nmi_pending = 1; 2715 vcpu_notify_event(vm, vcpuid); 2716 return (0); 2717 } 2718 2719 int 2720 vm_nmi_pending(struct vm *vm, int vcpuid) 2721 { 2722 struct vcpu *vcpu; 2723 2724 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2725 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2726 2727 vcpu = &vm->vcpu[vcpuid]; 2728 2729 return (vcpu->nmi_pending); 2730 } 2731 2732 void 2733 vm_nmi_clear(struct vm *vm, int vcpuid) 2734 { 2735 struct vcpu *vcpu; 2736 2737 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2738 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2739 2740 vcpu = &vm->vcpu[vcpuid]; 2741 2742 if (vcpu->nmi_pending == 0) 2743 panic("vm_nmi_clear: inconsistent nmi_pending state"); 2744 2745
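	/* Consume the pending NMI and account for its delivery. */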
vcpu->nmi_pending = 0; 2746 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2747 } 2748 2749 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2750 2751 int 2752 vm_inject_extint(struct vm *vm, int vcpuid) 2753 { 2754 struct vcpu *vcpu; 2755 2756 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2757 return (EINVAL); 2758 2759 vcpu = &vm->vcpu[vcpuid]; 2760 2761 vcpu->extint_pending = 1; 2762 vcpu_notify_event(vm, vcpuid); 2763 return (0); 2764 } 2765 2766 int 2767 vm_extint_pending(struct vm *vm, int vcpuid) 2768 { 2769 struct vcpu *vcpu; 2770 2771 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2772 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2773 2774 vcpu = &vm->vcpu[vcpuid]; 2775 2776 return (vcpu->extint_pending); 2777 } 2778 2779 void 2780 vm_extint_clear(struct vm *vm, int vcpuid) 2781 { 2782 struct vcpu *vcpu; 2783 2784 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2785 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2786 2787 vcpu = &vm->vcpu[vcpuid]; 2788 2789 if (vcpu->extint_pending == 0) 2790 panic("vm_extint_clear: inconsistent extint_pending state"); 2791 2792 vcpu->extint_pending = 0; 2793 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2794 } 2795 2796 int 2797 vm_inject_init(struct vm *vm, int vcpuid) 2798 { 2799 struct vcpu *vcpu; 2800 2801 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2802 return (EINVAL); 2803 2804 vcpu = &vm->vcpu[vcpuid]; 2805 vcpu_lock(vcpu); 2806 vcpu->run_state |= VRS_PEND_INIT; 2807 /* 2808 * As part of queuing the INIT request, clear any pending SIPI. It 2809 * would not otherwise survive across the reset of the vCPU when it 2810 * undergoes the requested INIT. We would not want it to linger when it 2811 * could be mistaken as a subsequent (after the INIT) SIPI request. 2812 */ 2813 vcpu->run_state &= ~VRS_PEND_SIPI; 2814 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2815 2816 vcpu_unlock(vcpu); 2817 return (0); 2818 } 2819 2820 int 2821 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2822 { 2823 struct vcpu *vcpu; 2824 2825 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2826 return (EINVAL); 2827 2828 vcpu = &vm->vcpu[vcpuid]; 2829 vcpu_lock(vcpu); 2830 vcpu->run_state |= VRS_PEND_SIPI; 2831 vcpu->sipi_vector = vector; 2832 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2833 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2834 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2835 } 2836 vcpu_unlock(vcpu); 2837 return (0); 2838 } 2839 2840 bool 2841 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2842 { 2843 struct vcpu *vcpu; 2844 2845 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2846 vcpu = &vm->vcpu[vcpuid]; 2847 2848 /* Of interest: vCPU not in running state or with pending INIT */ 2849 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2850 } 2851 2852 int 2853 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2854 { 2855 struct seg_desc desc; 2856 const enum vm_reg_name clear_regs[] = { 2857 VM_REG_GUEST_CR2, 2858 VM_REG_GUEST_CR3, 2859 VM_REG_GUEST_CR4, 2860 VM_REG_GUEST_RAX, 2861 VM_REG_GUEST_RBX, 2862 VM_REG_GUEST_RCX, 2863 VM_REG_GUEST_RSI, 2864 VM_REG_GUEST_RDI, 2865 VM_REG_GUEST_RBP, 2866 VM_REG_GUEST_RSP, 2867 VM_REG_GUEST_R8, 2868 VM_REG_GUEST_R9, 2869 VM_REG_GUEST_R10, 2870 VM_REG_GUEST_R11, 2871 VM_REG_GUEST_R12, 2872 VM_REG_GUEST_R13, 2873 VM_REG_GUEST_R14, 2874 VM_REG_GUEST_R15, 2875 VM_REG_GUEST_DR0, 2876 VM_REG_GUEST_DR1, 2877 VM_REG_GUEST_DR2, 2878 VM_REG_GUEST_DR3, 2879 VM_REG_GUEST_EFER, 2880 }; 2881 const enum vm_reg_name data_segs[] = { 
2882 VM_REG_GUEST_SS, 2883 VM_REG_GUEST_DS, 2884 VM_REG_GUEST_ES, 2885 VM_REG_GUEST_FS, 2886 VM_REG_GUEST_GS, 2887 }; 2888 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2889 2890 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2891 return (EINVAL); 2892 2893 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2894 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2895 } 2896 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2898 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2899 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2900 2901 /* 2902 * The prescribed contents of %rdx differ slightly between the Intel and 2903 * AMD architectural definitions. The former expects the Extended Model 2904 * in bits 16-19 where the latter expects all of the Family, Model, and 2905 * Stepping to be there. Common boot ROMs appear to disregard this 2906 * anyway, so we stick with a compromise value similar to what is 2907 * spelled out in the Intel SDM. 2908 */ 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2910 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2913 2914 /* CS: Present, R/W, Accessed */ 2915 desc.access = 0x0093; 2916 desc.base = 0xffff0000; 2917 desc.limit = 0xffff; 2918 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2919 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2920 2921 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2922 desc.access = 0x0093; 2923 desc.base = 0; 2924 desc.limit = 0xffff; 2925 for (uint_t i = 0; i < nitems(data_segs); i++) { 2926 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2927 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2928 } 2929 2930 /* GDTR, IDTR */ 2931 desc.base = 0; 2932 desc.limit = 0xffff; 2933 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2934 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2935 2936 /* LDTR: Present, LDT */ 2937 desc.access = 0x0082; 2938 desc.base = 0; 2939 desc.limit = 0xffff; 2940 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2941 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2942 2943 /* TR: Present, 32-bit TSS */ 2944 desc.access = 0x008b; 2945 desc.base = 0; 2946 desc.limit = 0xffff; 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2948 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2949 2950 vlapic_reset(vm_lapic(vm, vcpuid)); 2951 2952 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2953 2954 vcpu->exitintinfo = 0; 2955 vcpu->exception_pending = 0; 2956 vcpu->nmi_pending = 0; 2957 vcpu->extint_pending = 0; 2958 2959 /* 2960 * A CPU reset caused by power-on or system reset clears more state than 2961 * one which is triggered from an INIT IPI.
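 * In particular, the guest FPU contents and %xcr0 are reinitialized only for
 * the former case (see below).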
2962 */ 2963 if (!init_only) { 2964 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2965 (void) hma_fpu_init(vcpu->guestfpu); 2966 2967 /* XXX: clear MSRs and other pieces */ 2968 } 2969 2970 return (0); 2971 } 2972 2973 static int 2974 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2975 { 2976 struct seg_desc desc; 2977 2978 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2979 return (EINVAL); 2980 2981 /* CS: Present, R/W, Accessed */ 2982 desc.access = 0x0093; 2983 desc.base = (uint64_t)vector << 12; 2984 desc.limit = 0xffff; 2985 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2986 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 2987 (uint64_t)vector << 8)); 2988 2989 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 2990 2991 return (0); 2992 } 2993 2994 int 2995 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 2996 { 2997 if (vcpu < 0 || vcpu >= vm->maxcpus) 2998 return (EINVAL); 2999 3000 if (type < 0 || type >= VM_CAP_MAX) 3001 return (EINVAL); 3002 3003 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3004 } 3005 3006 int 3007 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3008 { 3009 if (vcpu < 0 || vcpu >= vm->maxcpus) 3010 return (EINVAL); 3011 3012 if (type < 0 || type >= VM_CAP_MAX) 3013 return (EINVAL); 3014 3015 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3016 } 3017 3018 struct vlapic * 3019 vm_lapic(struct vm *vm, int cpu) 3020 { 3021 return (vm->vcpu[cpu].vlapic); 3022 } 3023 3024 struct vioapic * 3025 vm_ioapic(struct vm *vm) 3026 { 3027 3028 return (vm->vioapic); 3029 } 3030 3031 struct vhpet * 3032 vm_hpet(struct vm *vm) 3033 { 3034 3035 return (vm->vhpet); 3036 } 3037 3038 void * 3039 vm_iommu_domain(struct vm *vm) 3040 { 3041 3042 return (vm->iommu); 3043 } 3044 3045 int 3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3047 bool from_idle) 3048 { 3049 int error; 3050 struct vcpu *vcpu; 3051 3052 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3053 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3054 3055 vcpu = &vm->vcpu[vcpuid]; 3056 3057 vcpu_lock(vcpu); 3058 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3059 vcpu_unlock(vcpu); 3060 3061 return (error); 3062 } 3063 3064 enum vcpu_state 3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3066 { 3067 struct vcpu *vcpu; 3068 enum vcpu_state state; 3069 3070 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3071 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3072 3073 vcpu = &vm->vcpu[vcpuid]; 3074 3075 vcpu_lock(vcpu); 3076 state = vcpu->state; 3077 if (hostcpu != NULL) 3078 *hostcpu = vcpu->hostcpu; 3079 vcpu_unlock(vcpu); 3080 3081 return (state); 3082 } 3083 3084 uint64_t 3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3086 { 3087 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3088 3089 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3090 3091 if (phys_adj) { 3092 /* Include any offset for the current physical CPU too */ 3093 extern hrtime_t tsc_gethrtime_tick_delta(void); 3094 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3095 } 3096 3097 return (vcpu_off); 3098 } 3099 3100 int 3101 vm_activate_cpu(struct vm *vm, int vcpuid) 3102 { 3103 3104 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3105 return (EINVAL); 3106 3107 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3108 return (EBUSY); 3109 3110 if (vm->suspend != 0) { 3111 return (EBUSY); 3112 } 3113 3114 VCPU_CTR0(vm, vcpuid, "activated"); 3115 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3116 3117 /* 3118 * It is possible that 
this vCPU was undergoing activation at the same 3119 * time that the VM was being suspended. If that happens to be the 3120 * case, it should reflect the suspended state immediately. 3121 */ 3122 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3123 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3124 } 3125 3126 return (0); 3127 } 3128 3129 int 3130 vm_suspend_cpu(struct vm *vm, int vcpuid) 3131 { 3132 int i; 3133 3134 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3135 return (EINVAL); 3136 3137 if (vcpuid == -1) { 3138 vm->debug_cpus = vm->active_cpus; 3139 for (i = 0; i < vm->maxcpus; i++) { 3140 if (CPU_ISSET(i, &vm->active_cpus)) 3141 vcpu_notify_event(vm, i); 3142 } 3143 } else { 3144 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3145 return (EINVAL); 3146 3147 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3148 vcpu_notify_event(vm, vcpuid); 3149 } 3150 return (0); 3151 } 3152 3153 int 3154 vm_resume_cpu(struct vm *vm, int vcpuid) 3155 { 3156 3157 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3158 return (EINVAL); 3159 3160 if (vcpuid == -1) { 3161 CPU_ZERO(&vm->debug_cpus); 3162 } else { 3163 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3164 return (EINVAL); 3165 3166 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3167 } 3168 return (0); 3169 } 3170 3171 static bool 3172 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3173 uint64_t entry_rip) 3174 { 3175 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3176 struct vm_exit *vme = &vcpu->exitinfo; 3177 bool bail = false; 3178 3179 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3180 3181 if (vm->suspend) { 3182 if (on_entry) { 3183 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3184 vm->suspend < VM_SUSPEND_LAST); 3185 3186 vme->exitcode = VM_EXITCODE_SUSPENDED; 3187 vme->u.suspended.how = vm->suspend; 3188 } else { 3189 /* 3190 * Handling VM suspend is complicated, so if that 3191 * condition is detected outside of VM-entry itself, 3192 * just emit a BOGUS exitcode so we take a lap to pick 3193 * up the event during an entry and are directed into 3194 * the vm_handle_suspend() logic. 3195 */ 3196 vme->exitcode = VM_EXITCODE_BOGUS; 3197 } 3198 bail = true; 3199 } 3200 if (vcpu->reqidle) { 3201 vme->exitcode = VM_EXITCODE_REQIDLE; 3202 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3203 3204 if (!on_entry) { 3205 /* 3206 * A reqidle request detected outside of VM-entry can be 3207 * handled directly by clearing the request (and taking 3208 * a lap to userspace). 3209 */ 3210 vcpu_assert_locked(vcpu); 3211 vcpu->reqidle = 0; 3212 } 3213 bail = true; 3214 } 3215 if (vcpu_should_yield(vm, vcpuid)) { 3216 vme->exitcode = VM_EXITCODE_BOGUS; 3217 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3218 bail = true; 3219 } 3220 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3221 vme->exitcode = VM_EXITCODE_DEBUG; 3222 bail = true; 3223 } 3224 3225 if (bail) { 3226 if (on_entry) { 3227 /* 3228 * If bailing out during VM-entry, the current %rip must 3229 * be recorded in the exitinfo. 3230 */ 3231 vme->rip = entry_rip; 3232 } 3233 vme->inst_length = 0; 3234 } 3235 return (bail); 3236 } 3237 3238 static bool 3239 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3240 { 3241 /* 3242 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or 3243 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3244 * structure, and we would only modify the exitcode. 
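 * Hence 'on_entry' is passed as false and the entry_rip argument is unused.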
3245 */ 3246 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3247 } 3248 3249 bool 3250 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3251 { 3252 /* 3253 * Bail-out checks done as part of VM entry require an updated %rip to 3254 * populate the vm_exit struct if any of the conditions of interest are 3255 * matched in the check. 3256 */ 3257 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3258 } 3259 3260 cpuset_t 3261 vm_active_cpus(struct vm *vm) 3262 { 3263 3264 return (vm->active_cpus); 3265 } 3266 3267 cpuset_t 3268 vm_debug_cpus(struct vm *vm) 3269 { 3270 3271 return (vm->debug_cpus); 3272 } 3273 3274 cpuset_t 3275 vm_suspended_cpus(struct vm *vm) 3276 { 3277 3278 return (vm->suspended_cpus); 3279 } 3280 3281 void * 3282 vcpu_stats(struct vm *vm, int vcpuid) 3283 { 3284 3285 return (vm->vcpu[vcpuid].stats); 3286 } 3287 3288 int 3289 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3290 { 3291 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3292 return (EINVAL); 3293 3294 *state = vm->vcpu[vcpuid].x2apic_state; 3295 3296 return (0); 3297 } 3298 3299 int 3300 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3301 { 3302 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3303 return (EINVAL); 3304 3305 if (state >= X2APIC_STATE_LAST) 3306 return (EINVAL); 3307 3308 vm->vcpu[vcpuid].x2apic_state = state; 3309 3310 vlapic_set_x2apic_state(vm, vcpuid, state); 3311 3312 return (0); 3313 } 3314 3315 /* 3316 * This function is called to ensure that a vcpu "sees" a pending event 3317 * as soon as possible: 3318 * - If the vcpu thread is sleeping then it is woken up. 3319 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3320 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3321 */ 3322 static void 3323 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3324 { 3325 int hostcpu; 3326 3327 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT); 3328 3329 hostcpu = vcpu->hostcpu; 3330 if (vcpu->state == VCPU_RUNNING) { 3331 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3332 if (hostcpu != curcpu) { 3333 if (ntype == VCPU_NOTIFY_APIC) { 3334 vlapic_post_intr(vcpu->vlapic, hostcpu); 3335 } else { 3336 poke_cpu(hostcpu); 3337 } 3338 } else { 3339 /* 3340 * If the 'vcpu' is running on 'curcpu' then it must 3341 * be sending a notification to itself (e.g. SELF_IPI). 3342 * The pending event will be picked up when the vcpu 3343 * transitions back to guest context.
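 * No IPI is necessary in that case.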
3344 */ 3345 } 3346 } else { 3347 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3348 "with hostcpu %d", vcpu->state, hostcpu)); 3349 if (vcpu->state == VCPU_SLEEPING) { 3350 cv_signal(&vcpu->vcpu_cv); 3351 } 3352 } 3353 } 3354 3355 void 3356 vcpu_notify_event(struct vm *vm, int vcpuid) 3357 { 3358 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3359 3360 vcpu_lock(vcpu); 3361 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3362 vcpu_unlock(vcpu); 3363 } 3364 3365 void 3366 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3367 { 3368 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3369 3370 if (ntype == VCPU_NOTIFY_NONE) { 3371 return; 3372 } 3373 3374 vcpu_lock(vcpu); 3375 vcpu_notify_event_locked(vcpu, ntype); 3376 vcpu_unlock(vcpu); 3377 } 3378 3379 void 3380 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3381 { 3382 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3383 hrtime_t now = gethrtime(); 3384 3385 ASSERT3U(ustate, !=, vcpu->ustate); 3386 ASSERT3S(ustate, <, VU_MAX); 3387 ASSERT3S(ustate, >=, VU_INIT); 3388 3389 hrtime_t delta = now - vcpu->ustate_when; 3390 vcpu->ustate_total[vcpu->ustate] += delta; 3391 3392 membar_producer(); 3393 3394 vcpu->ustate_when = now; 3395 vcpu->ustate = ustate; 3396 } 3397 3398 struct vmspace * 3399 vm_get_vmspace(struct vm *vm) 3400 { 3401 3402 return (vm->vmspace); 3403 } 3404 3405 struct vm_client * 3406 vm_get_vmclient(struct vm *vm, int vcpuid) 3407 { 3408 return (vm->vcpu[vcpuid].vmclient); 3409 } 3410 3411 int 3412 vm_apicid2vcpuid(struct vm *vm, int apicid) 3413 { 3414 /* 3415 * XXX apic id is assumed to be numerically identical to vcpu id 3416 */ 3417 return (apicid); 3418 } 3419 3420 struct vatpic * 3421 vm_atpic(struct vm *vm) 3422 { 3423 return (vm->vatpic); 3424 } 3425 3426 struct vatpit * 3427 vm_atpit(struct vm *vm) 3428 { 3429 return (vm->vatpit); 3430 } 3431 3432 struct vpmtmr * 3433 vm_pmtmr(struct vm *vm) 3434 { 3435 3436 return (vm->vpmtmr); 3437 } 3438 3439 struct vrtc * 3440 vm_rtc(struct vm *vm) 3441 { 3442 3443 return (vm->vrtc); 3444 } 3445 3446 enum vm_reg_name 3447 vm_segment_name(int seg) 3448 { 3449 static enum vm_reg_name seg_names[] = { 3450 VM_REG_GUEST_ES, 3451 VM_REG_GUEST_CS, 3452 VM_REG_GUEST_SS, 3453 VM_REG_GUEST_DS, 3454 VM_REG_GUEST_FS, 3455 VM_REG_GUEST_GS 3456 }; 3457 3458 KASSERT(seg >= 0 && seg < nitems(seg_names), 3459 ("%s: invalid segment encoding %d", __func__, seg)); 3460 return (seg_names[seg]); 3461 } 3462 3463 void 3464 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3465 uint_t num_copyinfo) 3466 { 3467 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3468 if (copyinfo[idx].cookie != NULL) { 3469 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3470 } 3471 } 3472 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3473 } 3474 3475 int 3476 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3477 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3478 uint_t num_copyinfo, int *fault) 3479 { 3480 uint_t idx, nused; 3481 size_t n, off, remaining; 3482 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3483 3484 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3485 3486 nused = 0; 3487 remaining = len; 3488 while (remaining > 0) { 3489 uint64_t gpa; 3490 int error; 3491 3492 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3493 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3494 if (error || *fault) 3495 return (error); 3496 off = gpa & PAGEOFFSET; 3497 n 
= min(remaining, PAGESIZE - off); 3498 copyinfo[nused].gpa = gpa; 3499 copyinfo[nused].len = n; 3500 remaining -= n; 3501 gla += n; 3502 nused++; 3503 } 3504 3505 for (idx = 0; idx < nused; idx++) { 3506 vm_page_t *vmp; 3507 caddr_t hva; 3508 3509 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3510 if (vmp == NULL) { 3511 break; 3512 } 3513 if ((prot & PROT_WRITE) != 0) { 3514 hva = (caddr_t)vmp_get_writable(vmp); 3515 } else { 3516 hva = (caddr_t)vmp_get_readable(vmp); 3517 } 3518 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3519 copyinfo[idx].cookie = vmp; 3520 copyinfo[idx].prot = prot; 3521 } 3522 3523 if (idx != nused) { 3524 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3525 return (EFAULT); 3526 } else { 3527 *fault = 0; 3528 return (0); 3529 } 3530 } 3531 3532 void 3533 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3534 size_t len) 3535 { 3536 char *dst; 3537 int idx; 3538 3539 dst = kaddr; 3540 idx = 0; 3541 while (len > 0) { 3542 ASSERT(copyinfo[idx].prot & PROT_READ); 3543 3544 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3545 len -= copyinfo[idx].len; 3546 dst += copyinfo[idx].len; 3547 idx++; 3548 } 3549 } 3550 3551 void 3552 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3553 struct vm_copyinfo *copyinfo, size_t len) 3554 { 3555 const char *src; 3556 int idx; 3557 3558 src = kaddr; 3559 idx = 0; 3560 while (len > 0) { 3561 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3562 3563 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3564 len -= copyinfo[idx].len; 3565 src += copyinfo[idx].len; 3566 idx++; 3567 } 3568 } 3569 3570 /* 3571 * Return the amount of in-use and wired memory for the VM. Since 3572 * these are global stats, only return the values for vCPU 0. 3573 */ 3574 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3575 3576 static void 3577 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3578 { 3579 if (vcpu == 0) { 3580 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3581 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3582 } 3583 } 3584 3585 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3586 3587 int 3588 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3589 uint8_t bytes, uint32_t *val) 3590 { 3591 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3592 } 3593 3594 /* 3595 * bhyve-internal interfaces to attach or detach IO port handlers. 3596 * Must be called with VM write lock held for safety. 3597 */ 3598 int 3599 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3600 void **cookie) 3601 { 3602 int err; 3603 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3604 if (err == 0) { 3605 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3606 } 3607 return (err); 3608 } 3609 int 3610 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3611 void **old_arg) 3612 { 3613 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3614 int err; 3615 3616 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3617 if (err == 0) { 3618 *cookie = NULL; 3619 } 3620 return (err); 3621 } 3622 3623 /* 3624 * External driver interfaces to attach or detach IO port handlers. 3625 * Must be called with VM write lock held for safety.
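 * Unlike the bhyve-internal interfaces above, these register with
 * IOPF_DRV_HOOK, and hook consumers are expected to be well-behaved (see
 * vm_ioport_unhook()).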
3626 */ 3627 int 3628 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3629 void *arg, void **cookie) 3630 { 3631 int err; 3632 3633 if (port == 0) { 3634 return (EINVAL); 3635 } 3636 3637 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3638 if (err == 0) { 3639 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3640 } 3641 return (err); 3642 } 3643 void 3644 vm_ioport_unhook(struct vm *vm, void **cookie) 3645 { 3646 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3647 ioport_handler_t old_func; 3648 void *old_arg; 3649 int err; 3650 3651 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3652 3653 /* ioport-hook-using drivers are expected to be well-behaved */ 3654 VERIFY0(err); 3655 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3656 3657 *cookie = NULL; 3658 } 3659 3660 int 3661 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3662 { 3663 struct vm *vm = ksp->ks_private; 3664 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3665 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3666 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3667 3668 ASSERT3U(vcpuid, <, VM_MAXCPU); 3669 3670 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3671 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3672 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3673 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3674 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3675 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3676 3677 return (0); 3678 } 3679