1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * This file and its contents are supplied under the terms of the 32 * Common Development and Distribution License ("CDDL"), version 1.0. 33 * You may only use this file in accordance with the terms of version 34 * 1.0 of the CDDL. 35 * 36 * A full copy of the text of the CDDL should have accompanied this 37 * source. A copy of the CDDL is also available via the Internet at 38 * http://www.illumos.org/license/CDDL. 39 * 40 * Copyright 2015 Pluribus Networks Inc. 41 * Copyright 2018 Joyent, Inc. 
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>	/* NOTE(review): duplicate of <sys/systm.h> above */
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

/*
 * Per-vCPU thread context handed to the ctxop save/restore hooks so they can
 * find their VM/vCPU and track FPU-context status across context switches.
 */
typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;	/* owning VM */
	int		vtc_vcpuid;	/* vcpu index within vtc_vm */
	uint_t		vtc_status;	/* VTCS_* flags */
	enum vcpu_ustate vtc_ustate;	/* microstate at time of ctx switch */
} vm_thread_ctx_t;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int		exception_pending; /* (i) exception pending */
	int		exc_vector;	/* (x) exception collateral */
	int		exc_errcode_valid;
	uint32_t	exc_errcode;
	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

/* One guest memory segment: backing object plus its length and type. */
struct mem_seg {
	size_t	len;
	bool	sysmem;		/* system memory (vs. device memory) */
	vm_object_t *object;	/* backing memory object */
};
#define	VM_MAX_MEMSEGS	4

/* One guest-physical mapping of a portion of a mem_seg. */
struct mem_map {
	vm_paddr_t	gpa;	/* guest-physical base of the mapping */
	size_t		len;	/* length in bytes; 0 means slot unused */
	vm_ooffset_t	segoff;	/* offset into the backing segment */
	int		segid;	/* index into vm->mem_segs[] */
	int		prot;	/* PROT_* protection bits */
	int		flags;	/* VM_MEMMAP_F_* flags */
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;	/* (i) cpu-specific data */
	void		*iommu;		/* (x) iommu-specific data */
	struct vhpet	*vhpet;		/* (i) virtual HPET */
	struct vioapic	*vioapic;	/* (i) virtual ioapic */
	struct vatpic	*vatpic;	/* (i) virtual atpic */
	struct vatpit	*vatpit;	/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;	/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;		/* (o) virtual RTC */
	volatile cpuset_t active_cpus;	/* (i) active vcpus */
	volatile cpuset_t debug_cpus;	/* (i) vcpus stopped for dbg */
	int		suspend;	/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;	/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;	/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
	struct vcpu	vcpu[VM_MAXCPU]; /* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;	/* (o) num of sockets */
	uint16_t	cores;		/* (o) num of cores/socket */
	uint16_t	threads;	/* (o) num of threads/core */
	uint16_t	maxcpus;	/* (o) max pluggable cpus */
	uint64_t	boot_tsc_offset; /* (i) TSC offset at VM boot */

	struct ioport_config ioports;	/* (o) ioport handling */

	bool		mem_transient;	/* (o) alloc transient memory */
};

/* Non-zero once vmm_mod_load() has successfully run vmm_init(). */
static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init		= (vmm_init_func_t)nullop_panic,
	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
	.resume		= (vmm_resume_func_t)nullop_panic,
	.vminit		= (vmi_init_func_t)nullop_panic,
	.vmrun		= (vmi_run_func_t)nullop_panic,
	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg	= (vmi_get_register_t)nullop_panic,
	.vmsetreg	= (vmi_set_register_t)nullop_panic,
	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx	= (vmi_savectx)nullop_panic,
	.vmrestorectx	= (vmi_restorectx)nullop_panic,
};

/* Backend dispatch table; switched to VMX or SVM ops in vmm_init(). */
static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()	((*ops->init)())
#define	VMM_CLEANUP()	((*ops->cleanup)())
#define	VMM_RESUME()	((*ops->resume)())

#define	VMINIT(vm)		((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)		((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

/* Setting CR0.TS makes host FPU use trap; clts() clears it again. */
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

SDT_PROVIDER_DEFINE(vmm);

static MALLOC_DEFINE(M_VM, "vm", "vm");
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
/* ctxop hooks used to save/restore guest FPU state on context switch. */
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= vmm_savectx,
	.ct_restore	= vmm_restorectx,
};

#ifdef KTR
/* Human-readable vcpu_state name, used only for KTR tracing. */
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

/*
 * Tear down per-vcpu state.  The vlapic is cleaned up on every pass (both
 * reinit and destroy); the (a)-tagged resources are freed only on destroy.
 */
static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

/*
 * Initialize (create == true) or reinitialize (create == false) a vcpu,
 * per the (a)/(i)/(o)/(x) initialization notes on struct vcpu.
 */
static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
	}

	/* (i)-tagged state, reset on both create and reinit */
	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

/*
 * Select and initialize the hardware backend: Intel VMX (EPT) or AMD SVM
 * (RVI).  Returns ENXIO if neither is available on this host.
 */
static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	iommu_cleanup();
	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Initialize (create == true) or reinitialize (create == false) VM-wide
 * state: backend cookie, virtual devices, cpusets, vcpus, and TSC offset.
 * The vRTC is created only once and survives reinit.
 */
static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for
	 * negative offsets.  A reading of the TSC is negated to form the boot
	 * offset.
	 */
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
}

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

/*
 * Allocate and initialize a new VM instance named 'name', returning it via
 * 'retvm'.  Fails with ENXIO if module init failed, ENOMEM if the vmspace
 * cannot be allocated.
 */
int
vm_create(const char *name, uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	/* Name validation has already occurred */
	VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
	/* NOTE(review): strcpy is bounded only by the VERIFY3U above. */
	strcpy(vm->name, name);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

/*
 * Tear down VM state.  With destroy == false this is a reset: virtual
 * devices are cleaned and re-creatable state is dropped, but sysmem
 * mappings, memsegs, the vRTC, and the vmspace are preserved.
 */
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted. Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VM);
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);	/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);		/* 'gpa' is pci passthru mmio */

	return (false);
}

/*
 * Allocate memory segment 'ident' of 'len' bytes.  Returns EEXIST if the
 * identical segment already exists, EINVAL on a conflicting definition.
 */
int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

/*
 * Map ['first', 'first' + 'len') of segment 'segid' at guest-physical 'gpa'
 * into a free mem_map slot.  VM_MEMMAP_F_WIRED requests immediate population.
 */
int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	/* Find a free mem_map slot (len == 0 marks unused). */
	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			vmspace_unmap(vm->vmspace, gpa, gpa + len);
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}

int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	/* Find the lowest-addressed mapping at or above *gpa. */
	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

/* Highest guest-physical address covered by any sysmem mapping. */
vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

/*
 * Create (map == true) or remove (map == false) IOMMU gpa->hpa translations
 * for every wired sysmem mapping, page by page.  VM_MEMMAP_F_IOMMU tracks
 * which mappings currently have IOMMU translations.
 */
static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
#ifdef __FreeBSD__
	void *vp, *cookie, *host_domain;
#endif
	vm_client_t *vmc;

	sz = PAGE_SIZE;
#ifdef __FreeBSD__
	host_domain = iommu_host_domain();
#endif
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			/* Hold the page long enough to learn its pfn. */
			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			vmp_release(vmp);

			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
#ifdef __FreeBSD__
				iommu_remove_mapping(host_domain, hpa, sz);
#endif
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
#ifdef __FreeBSD__
				iommu_create_mapping(host_domain, hpa, hpa,
				    sz);
#endif
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
#ifdef __FreeBSD__
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
#else
	iommu_invalidate_tlb(vm->iommu);
#endif
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	/* Tear down IOMMU translations once no devices remain assigned. */
	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	struct vcpu *vcpu;
	int error;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	error = VMSETREG(vm->cookie, vcpuid, reg, val);
	if (error || reg != VM_REG_GUEST_RIP)
		return (error);

	/* Set 'nextrip' to match the value of %rip */
	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
	vcpu = &vm->vcpu[vcpuid];
	vcpu->nextrip = val;
	return (0);
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case
VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

/* Map an hma_fpu_xsave_result_t to the errno convention used by callers. */
static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

/* Copy the vcpu's guest FPU xsave state out into 'buf' (up to 'len'). */
int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

/* Load guest FPU xsave state for the vcpu from 'buf' (of 'len' bytes). */
int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	/* run_state and sipi_vector are protected by the vcpu lock. */
	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	/* Kick the vcpu so it notices the run-state change. */
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
1282 */ 1283 fpu_start_emulating(); 1284 } 1285 1286 static void 1287 save_guest_fpustate(struct vcpu *vcpu) 1288 { 1289 1290 if ((rcr0() & CR0_TS) == 0) 1291 panic("fpu emulation not enabled in host!"); 1292 1293 /* save guest XCR0 and restore host XCR0 */ 1294 if (rcr4() & CR4_XSAVE) { 1295 vcpu->guest_xcr0 = rxcr(0); 1296 load_xcr(0, vmm_get_host_xcr0()); 1297 } 1298 1299 /* save guest FPU and restore host FPU */ 1300 fpu_stop_emulating(); 1301 hma_fpu_stop_guest(vcpu->guestfpu); 1302 /* 1303 * When the host state has been restored, we should not re-enable 1304 * CR0.TS on illumos for eager FPU. 1305 */ 1306 } 1307 1308 static int 1309 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1310 bool from_idle) 1311 { 1312 struct vcpu *vcpu; 1313 int error; 1314 1315 vcpu = &vm->vcpu[vcpuid]; 1316 vcpu_assert_locked(vcpu); 1317 1318 /* 1319 * State transitions from the vmmdev_ioctl() must always begin from 1320 * the VCPU_IDLE state. This guarantees that there is only a single 1321 * ioctl() operating on a vcpu at any point. 
1322 */ 1323 if (from_idle) { 1324 while (vcpu->state != VCPU_IDLE) { 1325 vcpu->reqidle = 1; 1326 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1327 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " 1328 "idle requested", vcpu_state2str(vcpu->state)); 1329 cv_wait(&vcpu->state_cv, &vcpu->lock); 1330 } 1331 } else { 1332 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1333 "vcpu idle state")); 1334 } 1335 1336 if (vcpu->state == VCPU_RUNNING) { 1337 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1338 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1339 } else { 1340 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1341 "vcpu that is not running", vcpu->hostcpu)); 1342 } 1343 1344 /* 1345 * The following state transitions are allowed: 1346 * IDLE -> FROZEN -> IDLE 1347 * FROZEN -> RUNNING -> FROZEN 1348 * FROZEN -> SLEEPING -> FROZEN 1349 */ 1350 switch (vcpu->state) { 1351 case VCPU_IDLE: 1352 case VCPU_RUNNING: 1353 case VCPU_SLEEPING: 1354 error = (newstate != VCPU_FROZEN); 1355 break; 1356 case VCPU_FROZEN: 1357 error = (newstate == VCPU_FROZEN); 1358 break; 1359 default: 1360 error = 1; 1361 break; 1362 } 1363 1364 if (error) 1365 return (EBUSY); 1366 1367 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", 1368 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1369 1370 vcpu->state = newstate; 1371 if (newstate == VCPU_RUNNING) 1372 vcpu->hostcpu = curcpu; 1373 else 1374 vcpu->hostcpu = NOCPU; 1375 1376 if (newstate == VCPU_IDLE) { 1377 cv_broadcast(&vcpu->state_cv); 1378 } 1379 1380 return (0); 1381 } 1382 1383 static void 1384 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1385 { 1386 int error; 1387 1388 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1389 panic("Error %d setting state to %d\n", error, newstate); 1390 } 1391 1392 static void 1393 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1394 { 1395 int error; 1396 
1397 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1398 panic("Error %d setting state to %d", error, newstate); 1399 } 1400 1401 /* 1402 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1403 */ 1404 static int 1405 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) 1406 { 1407 struct vcpu *vcpu; 1408 int vcpu_halted, vm_halted; 1409 bool userspace_exit = false; 1410 1411 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1412 1413 vcpu = &vm->vcpu[vcpuid]; 1414 vcpu_halted = 0; 1415 vm_halted = 0; 1416 1417 vcpu_lock(vcpu); 1418 while (1) { 1419 /* 1420 * Do a final check for pending interrupts (including NMI and 1421 * INIT) before putting this thread to sleep. 1422 */ 1423 if (vm_nmi_pending(vm, vcpuid)) 1424 break; 1425 if (vcpu_run_state_pending(vm, vcpuid)) 1426 break; 1427 if (!intr_disabled) { 1428 if (vm_extint_pending(vm, vcpuid) || 1429 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1430 break; 1431 } 1432 } 1433 1434 /* 1435 * Also check for software events which would cause a wake-up. 1436 * This will set the appropriate exitcode directly, rather than 1437 * requiring a trip through VM_RUN(). 1438 */ 1439 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1440 userspace_exit = true; 1441 break; 1442 } 1443 1444 /* 1445 * Some Linux guests implement "halt" by having all vcpus 1446 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1447 * track of the vcpus that have entered this state. When all 1448 * vcpus enter the halted state the virtual machine is halted. 
1449 */ 1450 if (intr_disabled) { 1451 if (!vcpu_halted && halt_detection_enabled) { 1452 vcpu_halted = 1; 1453 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1454 } 1455 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1456 vm_halted = 1; 1457 break; 1458 } 1459 } 1460 1461 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1462 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1463 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1464 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1465 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1466 } 1467 1468 if (vcpu_halted) 1469 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1470 1471 vcpu_unlock(vcpu); 1472 1473 if (vm_halted) 1474 vm_suspend(vm, VM_SUSPEND_HALT); 1475 1476 return (userspace_exit ? -1 : 0); 1477 } 1478 1479 static int 1480 vm_handle_paging(struct vm *vm, int vcpuid) 1481 { 1482 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1483 vm_client_t *vmc = vcpu->vmclient; 1484 struct vm_exit *vme = &vcpu->exitinfo; 1485 int rv, ftype; 1486 1487 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1488 __func__, vme->inst_length)); 1489 1490 ftype = vme->u.paging.fault_type; 1491 KASSERT(ftype == PROT_READ || 1492 ftype == PROT_WRITE || ftype == PROT_EXEC, 1493 ("vm_handle_paging: invalid fault_type %d", ftype)); 1494 1495 rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); 1496 1497 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, " 1498 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1499 1500 if (rv != 0) 1501 return (EFAULT); 1502 return (0); 1503 } 1504 1505 int 1506 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, 1507 int rsize) 1508 { 1509 int err = ESRCH; 1510 1511 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1512 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1513 1514 err = vlapic_mmio_read(vlapic, gpa, rval, rsize); 1515 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1516 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); 1517 
} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1518 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); 1519 } 1520 1521 return (err); 1522 } 1523 1524 int 1525 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, 1526 int wsize) 1527 { 1528 int err = ESRCH; 1529 1530 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1531 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1532 1533 err = vlapic_mmio_write(vlapic, gpa, wval, wsize); 1534 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1535 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); 1536 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1537 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); 1538 } 1539 1540 return (err); 1541 } 1542 1543 static int 1544 vm_handle_mmio_emul(struct vm *vm, int vcpuid) 1545 { 1546 struct vie *vie; 1547 struct vcpu *vcpu; 1548 struct vm_exit *vme; 1549 uint64_t inst_addr; 1550 int error, fault, cs_d; 1551 1552 vcpu = &vm->vcpu[vcpuid]; 1553 vme = &vcpu->exitinfo; 1554 vie = vcpu->vie_ctx; 1555 1556 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1557 __func__, vme->inst_length)); 1558 1559 inst_addr = vme->rip + vme->u.mmio_emul.cs_base; 1560 cs_d = vme->u.mmio_emul.cs_d; 1561 1562 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx", 1563 vme->u.mmio_emul.gpa); 1564 1565 /* Fetch the faulting instruction */ 1566 if (vie_needs_fetch(vie)) { 1567 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, 1568 &fault); 1569 if (error != 0) { 1570 return (error); 1571 } else if (fault) { 1572 /* 1573 * If a fault during instruction fetch was encountered, 1574 * it will have asserted that the appropriate exception 1575 * be injected at next entry. 1576 * No further work is required. 
1577 */ 1578 return (0); 1579 } 1580 } 1581 1582 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1583 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx", 1584 inst_addr); 1585 /* Dump (unrecognized) instruction bytes in userspace */ 1586 vie_fallback_exitinfo(vie, vme); 1587 return (-1); 1588 } 1589 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && 1590 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { 1591 /* Decoded GLA does not match GLA from VM exit state */ 1592 vie_fallback_exitinfo(vie, vme); 1593 return (-1); 1594 } 1595 1596 repeat: 1597 error = vie_emulate_mmio(vie, vm, vcpuid); 1598 if (error < 0) { 1599 /* 1600 * MMIO not handled by any of the in-kernel-emulated devices, so 1601 * make a trip out to userspace for it. 1602 */ 1603 vie_exitinfo(vie, vme); 1604 } else if (error == EAGAIN) { 1605 /* 1606 * Continue emulating the rep-prefixed instruction, which has 1607 * not completed its iterations. 1608 * 1609 * In case this can be emulated in-kernel and has a high 1610 * repetition count (causing a tight spin), it should be 1611 * deferential to yield conditions. 1612 */ 1613 if (!vcpu_should_yield(vm, vcpuid)) { 1614 goto repeat; 1615 } else { 1616 /* 1617 * Defer to the contending load by making a trip to 1618 * userspace with a no-op (BOGUS) exit reason. 
1619 */ 1620 vie_reset(vie); 1621 vme->exitcode = VM_EXITCODE_BOGUS; 1622 return (-1); 1623 } 1624 } else if (error == 0) { 1625 /* Update %rip now that instruction has been emulated */ 1626 vie_advance_pc(vie, &vcpu->nextrip); 1627 } 1628 return (error); 1629 } 1630 1631 static int 1632 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) 1633 { 1634 struct vcpu *vcpu; 1635 struct vie *vie; 1636 int err; 1637 1638 vcpu = &vm->vcpu[vcpuid]; 1639 vie = vcpu->vie_ctx; 1640 1641 repeat: 1642 err = vie_emulate_inout(vie, vm, vcpuid); 1643 1644 if (err < 0) { 1645 /* 1646 * In/out not handled by any of the in-kernel-emulated devices, 1647 * so make a trip out to userspace for it. 1648 */ 1649 vie_exitinfo(vie, vme); 1650 return (err); 1651 } else if (err == EAGAIN) { 1652 /* 1653 * Continue emulating the rep-prefixed ins/outs, which has not 1654 * completed its iterations. 1655 * 1656 * In case this can be emulated in-kernel and has a high 1657 * repetition count (causing a tight spin), it should be 1658 * deferential to yield conditions. 1659 */ 1660 if (!vcpu_should_yield(vm, vcpuid)) { 1661 goto repeat; 1662 } else { 1663 /* 1664 * Defer to the contending load by making a trip to 1665 * userspace with a no-op (BOGUS) exit reason. 1666 */ 1667 vie_reset(vie); 1668 vme->exitcode = VM_EXITCODE_BOGUS; 1669 return (-1); 1670 } 1671 } else if (err != 0) { 1672 /* Emulation failure. Bail all the way out to userspace. 
*/ 1673 vme->exitcode = VM_EXITCODE_INST_EMUL; 1674 bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); 1675 return (-1); 1676 } 1677 1678 vie_advance_pc(vie, &vcpu->nextrip); 1679 return (0); 1680 } 1681 1682 static int 1683 vm_handle_inst_emul(struct vm *vm, int vcpuid) 1684 { 1685 struct vie *vie; 1686 struct vcpu *vcpu; 1687 struct vm_exit *vme; 1688 uint64_t cs_base; 1689 int error, fault, cs_d; 1690 1691 vcpu = &vm->vcpu[vcpuid]; 1692 vme = &vcpu->exitinfo; 1693 vie = vcpu->vie_ctx; 1694 1695 vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d); 1696 1697 /* Fetch the faulting instruction */ 1698 ASSERT(vie_needs_fetch(vie)); 1699 error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base, 1700 &fault); 1701 if (error != 0) { 1702 return (error); 1703 } else if (fault) { 1704 /* 1705 * If a fault during instruction fetch was encounted, it will 1706 * have asserted that the appropriate exception be injected at 1707 * next entry. No further work is required. 1708 */ 1709 return (0); 1710 } 1711 1712 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1713 /* Dump (unrecognized) instruction bytes in userspace */ 1714 vie_fallback_exitinfo(vie, vme); 1715 return (-1); 1716 } 1717 1718 error = vie_emulate_other(vie, vm, vcpuid); 1719 if (error != 0) { 1720 /* 1721 * Instruction emulation was unable to complete successfully, so 1722 * kick it out to userspace for handling. 1723 */ 1724 vie_fallback_exitinfo(vie, vme); 1725 } else { 1726 /* Update %rip now that instruction has been emulated */ 1727 vie_advance_pc(vie, &vcpu->nextrip); 1728 } 1729 return (error); 1730 } 1731 1732 static int 1733 vm_handle_suspend(struct vm *vm, int vcpuid) 1734 { 1735 int i; 1736 struct vcpu *vcpu; 1737 1738 vcpu = &vm->vcpu[vcpuid]; 1739 1740 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 1741 1742 /* 1743 * Wait until all 'active_cpus' have suspended themselves. 
1744 */ 1745 vcpu_lock(vcpu); 1746 vcpu_ustate_change(vm, vcpuid, VU_INIT); 1747 while (1) { 1748 int rc; 1749 1750 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1751 VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); 1752 break; 1753 } 1754 1755 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1756 rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz, 1757 TR_CLOCK_TICK); 1758 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1759 1760 /* 1761 * If the userspace process driving the instance is killed, any 1762 * vCPUs yet to be marked suspended (because they are not 1763 * VM_RUN-ing in the kernel presently) will never reach that 1764 * state. 1765 * 1766 * To avoid vm_handle_suspend() getting stuck in the kernel 1767 * waiting for those vCPUs, offer a bail-out even though it 1768 * means returning without all vCPUs in a suspended state. 1769 */ 1770 if (rc <= 0) { 1771 if ((curproc->p_flag & SEXITING) != 0) { 1772 break; 1773 } 1774 } 1775 } 1776 vcpu_unlock(vcpu); 1777 1778 /* 1779 * Wakeup the other sleeping vcpus and return to userspace. 
1780 */ 1781 for (i = 0; i < vm->maxcpus; i++) { 1782 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1783 vcpu_notify_event(vm, i); 1784 } 1785 } 1786 1787 return (-1); 1788 } 1789 1790 static int 1791 vm_handle_reqidle(struct vm *vm, int vcpuid) 1792 { 1793 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1794 1795 vcpu_lock(vcpu); 1796 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1797 vcpu->reqidle = 0; 1798 vcpu_unlock(vcpu); 1799 return (-1); 1800 } 1801 1802 static int 1803 vm_handle_run_state(struct vm *vm, int vcpuid) 1804 { 1805 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1806 bool handled = false; 1807 1808 vcpu_lock(vcpu); 1809 while (1) { 1810 if ((vcpu->run_state & VRS_PEND_INIT) != 0) { 1811 vcpu_unlock(vcpu); 1812 VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); 1813 vcpu_lock(vcpu); 1814 1815 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); 1816 vcpu->run_state |= VRS_INIT; 1817 } 1818 1819 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == 1820 (VRS_INIT | VRS_PEND_SIPI)) { 1821 const uint8_t vector = vcpu->sipi_vector; 1822 1823 vcpu_unlock(vcpu); 1824 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); 1825 vcpu_lock(vcpu); 1826 1827 vcpu->run_state &= ~VRS_PEND_SIPI; 1828 vcpu->run_state |= VRS_RUN; 1829 } 1830 1831 /* 1832 * If the vCPU is now in the running state, there is no need to 1833 * wait for anything prior to re-entry. 1834 */ 1835 if ((vcpu->run_state & VRS_RUN) != 0) { 1836 handled = true; 1837 break; 1838 } 1839 1840 /* 1841 * Also check for software events which would cause a wake-up. 1842 * This will set the appropriate exitcode directly, rather than 1843 * requiring a trip through VM_RUN(). 
1844 */ 1845 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1846 break; 1847 } 1848 1849 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1850 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1851 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1852 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1853 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1854 } 1855 vcpu_unlock(vcpu); 1856 1857 return (handled ? 0 : -1); 1858 } 1859 1860 static int 1861 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1862 { 1863 const uint32_t code = vme->u.msr.code; 1864 uint64_t val = 0; 1865 1866 switch (code) { 1867 case MSR_MCG_CAP: 1868 case MSR_MCG_STATUS: 1869 val = 0; 1870 break; 1871 1872 case MSR_MTRRcap: 1873 case MSR_MTRRdefType: 1874 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: 1875 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1876 case MSR_MTRR64kBase: 1877 val = 0; 1878 break; 1879 1880 case MSR_TSC: 1881 /* 1882 * In all likelihood, this should always be handled in guest 1883 * context by VMX/SVM rather than taking an exit. (Both VMX and 1884 * SVM pass through read-only access to MSR_TSC to the guest.) 1885 * 1886 * No physical offset is requested of vcpu_tsc_offset() since 1887 * rdtsc_offset() takes care of that instead. 1888 */ 1889 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); 1890 break; 1891 1892 default: 1893 /* 1894 * Anything not handled at this point will be kicked out to 1895 * userspace for attempted processing there. 
1896 */ 1897 return (-1); 1898 } 1899 1900 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 1901 val & 0xffffffff)); 1902 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 1903 val >> 32)); 1904 return (0); 1905 } 1906 1907 static int 1908 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1909 { 1910 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1911 const uint32_t code = vme->u.msr.code; 1912 const uint64_t val = vme->u.msr.wval; 1913 1914 switch (code) { 1915 case MSR_MCG_CAP: 1916 case MSR_MCG_STATUS: 1917 /* Ignore writes */ 1918 break; 1919 1920 case MSR_MTRRcap: 1921 vm_inject_gp(vm, vcpuid); 1922 break; 1923 case MSR_MTRRdefType: 1924 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: 1925 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1926 case MSR_MTRR64kBase: 1927 /* Ignore writes */ 1928 break; 1929 1930 case MSR_TSC: 1931 /* 1932 * The effect of writing the TSC MSR is that a subsequent read 1933 * of the TSC would report that value written (plus any time 1934 * elapsed between the write and the read). The guest TSC value 1935 * is calculated from a global offset for the guest (which 1936 * effectively makes its TSC read 0 at guest boot) and a 1937 * per-vCPU offset to handle these writes to the MSR. 1938 * 1939 * To calculate that per-vCPU offset, we can work backwards from 1940 * the guest value at the time of write: 1941 * 1942 * value = host TSC + VM boot offset + vCPU offset 1943 * 1944 * so therefore: 1945 * 1946 * value - host TSC - VM boot offset = vCPU offset 1947 */ 1948 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 1949 break; 1950 1951 default: 1952 /* 1953 * Anything not handled at this point will be kicked out to 1954 * userspace for attempted processing there. 
1955 */ 1956 return (-1); 1957 } 1958 1959 return (0); 1960 } 1961 1962 int 1963 vm_suspend(struct vm *vm, enum vm_suspend_how how) 1964 { 1965 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1966 return (EINVAL); 1967 1968 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 1969 return (EALREADY); 1970 } 1971 1972 /* 1973 * Notify all active vcpus that they are now suspended. 1974 */ 1975 for (uint_t i = 0; i < vm->maxcpus; i++) { 1976 struct vcpu *vcpu = &vm->vcpu[i]; 1977 1978 vcpu_lock(vcpu); 1979 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 1980 /* 1981 * Any vCPUs not actively running or in HLT can be 1982 * marked as suspended immediately. 1983 */ 1984 if (CPU_ISSET(i, &vm->active_cpus)) { 1985 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 1986 } 1987 } else { 1988 /* 1989 * Those which are running or in HLT will pick up the 1990 * suspended state after notification. 1991 */ 1992 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1993 } 1994 vcpu_unlock(vcpu); 1995 } 1996 return (0); 1997 } 1998 1999 void 2000 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2001 { 2002 struct vm_exit *vmexit; 2003 2004 vmexit = vm_exitinfo(vm, vcpuid); 2005 vmexit->rip = rip; 2006 vmexit->inst_length = 0; 2007 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2008 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2009 } 2010 2011 /* 2012 * Some vmm resources, such as the lapic, may have CPU-specific resources 2013 * allocated to them which would benefit from migration onto the host CPU which 2014 * is processing the vcpu state. 2015 */ 2016 static void 2017 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2018 { 2019 /* 2020 * Localizing cyclic resources requires acquisition of cpu_lock, and 2021 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2022 */ 2023 VERIFY(curthread->t_preempt == 0); 2024 2025 /* 2026 * Do not bother with localization if this vCPU is about to return to 2027 * the host CPU it was last localized to. 
2028 */ 2029 if (vcpu->lastloccpu == curcpu) 2030 return; 2031 2032 /* 2033 * Localize system-wide resources to the primary boot vCPU. While any 2034 * of the other vCPUs may access them, it keeps the potential interrupt 2035 * footprint constrained to CPUs involved with this instance. 2036 */ 2037 if (vcpu == &vm->vcpu[0]) { 2038 vhpet_localize_resources(vm->vhpet); 2039 vrtc_localize_resources(vm->vrtc); 2040 vatpit_localize_resources(vm->vatpit); 2041 } 2042 2043 vlapic_localize_resources(vcpu->vlapic); 2044 2045 vcpu->lastloccpu = curcpu; 2046 } 2047 2048 static void 2049 vmm_savectx(void *arg) 2050 { 2051 vm_thread_ctx_t *vtc = arg; 2052 struct vm *vm = vtc->vtc_vm; 2053 const int vcpuid = vtc->vtc_vcpuid; 2054 2055 if (ops->vmsavectx != NULL) { 2056 ops->vmsavectx(vm->cookie, vcpuid); 2057 } 2058 2059 /* 2060 * Account for going off-cpu, unless the vCPU is idled, where being 2061 * off-cpu is the explicit point. 2062 */ 2063 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2064 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2065 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2066 } 2067 2068 /* 2069 * If the CPU holds the restored guest FPU state, save it and restore 2070 * the host FPU state before this thread goes off-cpu. 2071 */ 2072 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2073 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2074 2075 save_guest_fpustate(vcpu); 2076 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2077 } 2078 } 2079 2080 static void 2081 vmm_restorectx(void *arg) 2082 { 2083 vm_thread_ctx_t *vtc = arg; 2084 struct vm *vm = vtc->vtc_vm; 2085 const int vcpuid = vtc->vtc_vcpuid; 2086 2087 /* Complete microstate accounting for vCPU being off-cpu */ 2088 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2089 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2090 } 2091 2092 /* 2093 * When coming back on-cpu, only restore the guest FPU status if the 2094 * thread is in a context marked as requiring it. 
This should be rare, 2095 * occurring only when a future logic error results in a voluntary 2096 * sleep during the VMRUN critical section. 2097 * 2098 * The common case will result in elision of the guest FPU state 2099 * restoration, deferring that action until it is clearly necessary 2100 * during vm_run. 2101 */ 2102 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2103 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2104 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2105 2106 restore_guest_fpustate(vcpu); 2107 vtc->vtc_status |= VTCS_FPU_RESTORED; 2108 } 2109 2110 if (ops->vmrestorectx != NULL) { 2111 ops->vmrestorectx(vm->cookie, vcpuid); 2112 } 2113 2114 } 2115 2116 static int 2117 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2118 struct vm_exit *vme) 2119 { 2120 struct vcpu *vcpu; 2121 struct vie *vie; 2122 int err; 2123 2124 vcpu = &vm->vcpu[vcpuid]; 2125 vie = vcpu->vie_ctx; 2126 err = 0; 2127 2128 switch (entry->cmd) { 2129 case VEC_DEFAULT: 2130 return (0); 2131 case VEC_DISCARD_INSTR: 2132 vie_reset(vie); 2133 return (0); 2134 case VEC_FULFILL_MMIO: 2135 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2136 if (err == 0) { 2137 err = vie_emulate_mmio(vie, vm, vcpuid); 2138 if (err == 0) { 2139 vie_advance_pc(vie, &vcpu->nextrip); 2140 } else if (err < 0) { 2141 vie_exitinfo(vie, vme); 2142 } else if (err == EAGAIN) { 2143 /* 2144 * Clear the instruction emulation state in 2145 * order to re-enter VM context and continue 2146 * this 'rep <instruction>' 2147 */ 2148 vie_reset(vie); 2149 err = 0; 2150 } 2151 } 2152 break; 2153 case VEC_FULFILL_INOUT: 2154 err = vie_fulfill_inout(vie, &entry->u.inout); 2155 if (err == 0) { 2156 err = vie_emulate_inout(vie, vm, vcpuid); 2157 if (err == 0) { 2158 vie_advance_pc(vie, &vcpu->nextrip); 2159 } else if (err < 0) { 2160 vie_exitinfo(vie, vme); 2161 } else if (err == EAGAIN) { 2162 /* 2163 * Clear the instruction emulation state in 2164 * order to re-enter VM context and continue 2165 * 
this 'rep ins/outs' 2166 */ 2167 vie_reset(vie); 2168 err = 0; 2169 } 2170 } 2171 break; 2172 default: 2173 return (EINVAL); 2174 } 2175 return (err); 2176 } 2177 2178 static int 2179 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2180 { 2181 struct vie *vie; 2182 2183 vie = vm->vcpu[vcpuid].vie_ctx; 2184 2185 if (vie_pending(vie)) { 2186 /* 2187 * Userspace has not fulfilled the pending needs of the 2188 * instruction emulation, so bail back out. 2189 */ 2190 vie_exitinfo(vie, vme); 2191 return (-1); 2192 } 2193 2194 return (0); 2195 } 2196 2197 int 2198 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2199 { 2200 int error; 2201 struct vcpu *vcpu; 2202 struct vm_exit *vme; 2203 bool intr_disabled; 2204 int affinity_type = CPU_CURRENT; 2205 2206 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2207 return (EINVAL); 2208 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2209 return (EINVAL); 2210 2211 vcpu = &vm->vcpu[vcpuid]; 2212 vme = &vcpu->exitinfo; 2213 2214 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2215 2216 vcpu->vtc.vtc_status = 0; 2217 ctxop_attach(curthread, vcpu->ctxop); 2218 2219 error = vm_entry_actions(vm, vcpuid, entry, vme); 2220 if (error != 0) { 2221 goto exit; 2222 } 2223 2224 restart: 2225 error = vm_loop_checks(vm, vcpuid, vme); 2226 if (error != 0) { 2227 goto exit; 2228 } 2229 2230 thread_affinity_set(curthread, affinity_type); 2231 /* 2232 * Resource localization should happen after the CPU affinity for the 2233 * thread has been set to ensure that access from restricted contexts, 2234 * such as VMX-accelerated APIC operations, can occur without inducing 2235 * cyclic cross-calls. 2236 * 2237 * This must be done prior to disabling kpreempt via critical_enter(). 
	 */
	vm_localize_resources(vm, vcpu);
	affinity_type = CPU_CURRENT;
	critical_enter();

	/* Force a trip through update_sregs to reload %fs/%gs and friends */
	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);

	/* Lazily restore guest FPU state only on the first trip through */
	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
		restore_guest_fpustate(vcpu);
		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
	}
	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	/*
	 * Once clear of the delicate contexts comprising the VM_RUN handler,
	 * thread CPU affinity can be loosened while other processing occurs.
	 */
	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
	thread_affinity_clear(curthread);
	critical_exit();

	if (error != 0) {
		/* Communicate out any error from VMRUN() above */
		goto exit;
	}

	/* Default resume point is the instruction after the one that exited */
	vcpu->nextrip = vme->rip + vme->inst_length;
	switch (vme->exitcode) {
	case VM_EXITCODE_REQIDLE:
		error = vm_handle_reqidle(vm, vcpuid);
		break;
	case VM_EXITCODE_RUN_STATE:
		error = vm_handle_run_state(vm, vcpuid);
		break;
	case VM_EXITCODE_SUSPENDED:
		error = vm_handle_suspend(vm, vcpuid);
		break;
	case VM_EXITCODE_IOAPIC_EOI:
		vioapic_process_eoi(vm, vcpuid,
		    vme->u.ioapic_eoi.vector);
		break;
	case VM_EXITCODE_HLT:
		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
		break;
	case VM_EXITCODE_PAGING:
		error = vm_handle_paging(vm, vcpuid);
		break;
	case VM_EXITCODE_MMIO_EMUL:
		error = vm_handle_mmio_emul(vm, vcpuid);
		break;
	case VM_EXITCODE_INOUT:
		error = vm_handle_inout(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_INST_EMUL:
		error = vm_handle_inst_emul(vm, vcpuid);
		break;
	case VM_EXITCODE_MONITOR:
	case VM_EXITCODE_MWAIT:
	case VM_EXITCODE_VMINSN:
		/* Unsupported instructions result in #UD in the guest */
		vm_inject_ud(vm, vcpuid);
		break;
	case VM_EXITCODE_RDMSR:
		error = vm_handle_rdmsr(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_WRMSR:
		error = vm_handle_wrmsr(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_HT:
		affinity_type = CPU_BEST;
		break;
	case VM_EXITCODE_MTRAP:
		vm_suspend_cpu(vm, vcpuid);
		error = -1;
		break;
	default:
		/* handled in userland */
		error = -1;
		break;
	}

	if (error == 0) {
		/* VM exit conditions handled in-kernel, continue running */
		goto restart;
	}

exit:
	kpreempt_disable();
	ctxop_detach(curthread, vcpu->ctxop);
	/* Make sure all of the needed vCPU context state is saved */
	vmm_savectx(&vcpu->vtc);
	kpreempt_enable();

	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);

	vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
	return (error);
}

/*
 * Arrange for the current (or next-pending) instruction to be re-executed on
 * the next entry of vcpu 'vcpuid'.  Returns 0 on success, EINVAL for a bad
 * vcpuid; panics if the vcpu is in neither RUNNING nor FROZEN state.
 */
int
vm_restart_instruction(void *arg, int vcpuid)
{
	struct vm *vm;
	struct vcpu *vcpu;
	enum vcpu_state state;
	uint64_t rip;
	int error;

	vm = arg;
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	state = vcpu_get_state(vm, vcpuid, NULL);
	if (state == VCPU_RUNNING) {
		/*
		 * When a vcpu is "running" the next instruction is determined
		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
		 * Thus setting 'inst_length' to zero will cause the current
		 * instruction to be restarted.
		 */
		vcpu->exitinfo.inst_length = 0;
		VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
		    "setting inst_length to zero", vcpu->exitinfo.rip);
	} else if (state == VCPU_FROZEN) {
		/*
		 * When a vcpu is "frozen" it is outside the critical section
		 * around VMRUN() and 'nextrip' points to the next instruction.
		 * Thus instruction restart is achieved by setting 'nextrip'
		 * to the vcpu's %rip.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
		    "nextrip from %lx to %lx", vcpu->nextrip, rip);
		vcpu->nextrip = rip;
	} else {
		panic("%s: invalid state %d", __func__, state);
	}
	return (0);
}

/*
 * Record exit-time interruption info ('exitintinfo') for vcpu 'vcpuid',
 * validating the type/vector fields first.  Returns 0 or EINVAL.
 */
int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;
	int type, vector;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (info & VM_INTINFO_VALID) {
		type = info & VM_INTINFO_TYPE;
		vector = info & 0xff;
		/* An NMI must carry the NMI vector */
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		/* Hardware exception vectors are limited to 0-31 */
		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_RSVD)
			return (EINVAL);
	} else {
		info = 0;
	}
	VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info);
	vcpu->exitintinfo = info;
	return (0);
}

/* Event classes used for double-fault determination (Intel SDM Vol 3) */
enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */

/* Classify interruption info into benign/contributory/page-fault classes. */
static enum exc_class
exception_class(uint64_t info)
{
	int type, vector;

	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
	type = info & VM_INTINFO_TYPE;
	vector = info & 0xff;

	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
	switch (type) {
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
	case VM_INTINFO_NMI:
		return (EXC_BENIGN);
	default:
		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF. #BP and #OF use a type
		 * value of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		break;
	}

	switch (vector) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}

/*
 * Resolve the interaction of a pending event ('info1') with a newly raised
 * event ('info2'):  may escalate to a double fault, or suspend the VM on a
 * triple fault.  Returns 1 with '*retinfo' holding the event to deliver, or
 * 0 when nothing should be injected (triple-fault shutdown).
 */
static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
    uint64_t *retinfo)
{
	enum exc_class exc1, exc2;
	int type1, vector1;

	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));

	/*
	 * If an exception occurs while attempting to call the double-fault
	 * handler the processor enters shutdown mode (aka triple fault).
	 */
	type1 = info1 & VM_INTINFO_TYPE;
	vector1 = info1 & 0xff;
	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)",
		    info1, info2);
		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
		*retinfo = 0;
		return (0);
	}

	/*
	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
	 */
	exc1 = exception_class(info1);
	exc2 = exception_class(info2);
	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
		/* Convert nested fault into a double fault. */
		*retinfo = IDT_DF;
		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		*retinfo |= VM_INTINFO_DEL_ERRCODE;
	} else {
		/* Handle exceptions serially */
		*retinfo = info2;
	}
	return (1);
}

/*
 * Compose interruption info for the vcpu's pending exception, or 0 if none
 * is pending.  Does not clear the pending state.
 */
static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
	uint64_t info = 0;

	if (vcpu->exception_pending) {
		info = vcpu->exc_vector & 0xff;
		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		if (vcpu->exc_errcode_valid) {
			info |= VM_INTINFO_DEL_ERRCODE;
			/* Error code is carried in the upper 32 bits */
			info |= (uint64_t)vcpu->exc_errcode << 32;
		}
	}
	return (info);
}

/*
 * Gather (and consume) the event, if any, to be injected on the next VM
 * entry, combining leftover exit info with any pending exception.  Returns
 * non-zero with the event in '*retinfo' when one should be delivered.
 */
int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu;
	uint64_t info1, info2;
	int valid;

	KASSERT(vcpuid >= 0 &&
	    vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];

	/* Consume the recorded exit-time event */
	info1 = vcpu->exitintinfo;
	vcpu->exitintinfo = 0;

	info2 = 0;
	if (vcpu->exception_pending) {
		info2 = vcpu_exception_intinfo(vcpu);
		vcpu->exception_pending = 0;
		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx",
		    vcpu->exc_vector, info2);
	}

	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
	} else if (info1 & VM_INTINFO_VALID) {
		*retinfo = info1;
		valid = 1;
	} else if (info2 & VM_INTINFO_VALID) {
		*retinfo = info2;
		valid = 1;
	} else {
		valid = 0;
	}

	if (valid) {
		VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), "
		    "retinfo(%lx)", __func__, info1, info2, *retinfo);
	}

	return (valid);
}

/*
 * Non-destructive query of pending exit info and exception info for the
 * given vcpu.  Returns 0 or EINVAL.
 */
int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	*info1 = vcpu->exitintinfo;
	*info2 = vcpu_exception_intinfo(vcpu);
	return (0);
}

/*
 * Queue hardware exception 'vector' (with optional error code) for delivery
 * to vcpu 'vcpuid'.  Returns 0, EINVAL for invalid parameters, or EBUSY if
 * another exception is already pending.
 */
int
vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
    uint32_t errcode, int restart_instruction)
{
	struct vcpu *vcpu;
	uint64_t regval;
	int error;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vector < 0 || vector >= 32)
		return (EINVAL);

	/*
	 * NMIs (which bear an exception vector of 2) are to be injected via
	 * their own specialized path using vm_inject_nmi().
	 */
	if (vector == 2) {
		return (EINVAL);
	}

	/*
	 * A double fault exception should never be injected directly into
	 * the guest. It is a derived exception that results from specific
	 * combinations of nested faults.
	 */
	if (vector == IDT_DF)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", vector, vcpu->exc_vector);
		return (EBUSY);
	}

	if (errcode_valid) {
		/*
		 * Exceptions don't deliver an error code in real mode.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
		if (!(regval & CR0_PE))
			errcode_valid = 0;
	}

	/*
	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
	 *
	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
	 * one instruction or incurs an exception.
	 */
	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
	    __func__, error));

	if (restart_instruction)
		vm_restart_instruction(vm, vcpuid);

	vcpu->exception_pending = 1;
	vcpu->exc_vector = vector;
	vcpu->exc_errcode = errcode;
	vcpu->exc_errcode_valid = errcode_valid;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
	return (0);
}

/*
 * Inject an exception that must succeed; panics (via KASSERT) if it cannot.
 * Always requests an instruction restart.
 */
void
vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	int error;

	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
	    errcode, 1);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));
}

/* Inject #UD (invalid opcode); no error code */
void
vm_inject_ud(struct vm *vm, int vcpuid)
{
	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
}

/* Inject #GP with error code 0 */
void
vm_inject_gp(struct vm *vm, int vcpuid)
{
	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
}

/* Inject #AC (alignment check) with the given error code */
void
vm_inject_ac(struct vm *vm, int vcpuid, int errcode)
{
	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
}

/* Inject #SS (stack fault) with the given error code */
void
vm_inject_ss(struct vm *vm, int vcpuid, int errcode)
{
	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
}

/* Inject #PF: stores the faulting address in %cr2 before raising the fault */
void
vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
{
	int error;

	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

/* Mark an NMI pending on the vcpu and kick it.  Returns 0 or EINVAL. */
int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu =
&vm->vcpu[vcpuid]; 2711 2712 vcpu->nmi_pending = 1; 2713 vcpu_notify_event(vm, vcpuid); 2714 return (0); 2715 } 2716 2717 int 2718 vm_nmi_pending(struct vm *vm, int vcpuid) 2719 { 2720 struct vcpu *vcpu; 2721 2722 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2723 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2724 2725 vcpu = &vm->vcpu[vcpuid]; 2726 2727 return (vcpu->nmi_pending); 2728 } 2729 2730 void 2731 vm_nmi_clear(struct vm *vm, int vcpuid) 2732 { 2733 struct vcpu *vcpu; 2734 2735 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2736 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2737 2738 vcpu = &vm->vcpu[vcpuid]; 2739 2740 if (vcpu->nmi_pending == 0) 2741 panic("vm_nmi_clear: inconsistent nmi_pending state"); 2742 2743 vcpu->nmi_pending = 0; 2744 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2745 } 2746 2747 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2748 2749 int 2750 vm_inject_extint(struct vm *vm, int vcpuid) 2751 { 2752 struct vcpu *vcpu; 2753 2754 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2755 return (EINVAL); 2756 2757 vcpu = &vm->vcpu[vcpuid]; 2758 2759 vcpu->extint_pending = 1; 2760 vcpu_notify_event(vm, vcpuid); 2761 return (0); 2762 } 2763 2764 int 2765 vm_extint_pending(struct vm *vm, int vcpuid) 2766 { 2767 struct vcpu *vcpu; 2768 2769 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2770 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2771 2772 vcpu = &vm->vcpu[vcpuid]; 2773 2774 return (vcpu->extint_pending); 2775 } 2776 2777 void 2778 vm_extint_clear(struct vm *vm, int vcpuid) 2779 { 2780 struct vcpu *vcpu; 2781 2782 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2783 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2784 2785 vcpu = &vm->vcpu[vcpuid]; 2786 2787 if (vcpu->extint_pending == 0) 2788 panic("vm_extint_clear: inconsistent extint_pending state"); 2789 2790 vcpu->extint_pending = 0; 2791 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2792 } 2793 2794 int 2795 vm_inject_init(struct vm *vm, int vcpuid) 
2796 { 2797 struct vcpu *vcpu; 2798 2799 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2800 return (EINVAL); 2801 2802 vcpu = &vm->vcpu[vcpuid]; 2803 vcpu_lock(vcpu); 2804 vcpu->run_state |= VRS_PEND_INIT; 2805 /* 2806 * As part of queuing the INIT request, clear any pending SIPI. It 2807 * would not otherwise survive across the reset of the vCPU when it 2808 * undergoes the requested INIT. We would not want it to linger when it 2809 * could be mistaken as a subsequent (after the INIT) SIPI request. 2810 */ 2811 vcpu->run_state &= ~VRS_PEND_SIPI; 2812 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2813 2814 vcpu_unlock(vcpu); 2815 return (0); 2816 } 2817 2818 int 2819 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2820 { 2821 struct vcpu *vcpu; 2822 2823 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2824 return (EINVAL); 2825 2826 vcpu = &vm->vcpu[vcpuid]; 2827 vcpu_lock(vcpu); 2828 vcpu->run_state |= VRS_PEND_SIPI; 2829 vcpu->sipi_vector = vector; 2830 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2831 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2832 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2833 } 2834 vcpu_unlock(vcpu); 2835 return (0); 2836 } 2837 2838 bool 2839 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2840 { 2841 struct vcpu *vcpu; 2842 2843 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2844 vcpu = &vm->vcpu[vcpuid]; 2845 2846 /* Of interest: vCPU not in running state or with pending INIT */ 2847 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2848 } 2849 2850 int 2851 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2852 { 2853 struct seg_desc desc; 2854 const enum vm_reg_name clear_regs[] = { 2855 VM_REG_GUEST_CR2, 2856 VM_REG_GUEST_CR3, 2857 VM_REG_GUEST_CR4, 2858 VM_REG_GUEST_RAX, 2859 VM_REG_GUEST_RBX, 2860 VM_REG_GUEST_RCX, 2861 VM_REG_GUEST_RSI, 2862 VM_REG_GUEST_RDI, 2863 VM_REG_GUEST_RBP, 2864 VM_REG_GUEST_RSP, 2865 VM_REG_GUEST_R8, 2866 VM_REG_GUEST_R9, 2867 
VM_REG_GUEST_R10, 2868 VM_REG_GUEST_R11, 2869 VM_REG_GUEST_R12, 2870 VM_REG_GUEST_R13, 2871 VM_REG_GUEST_R14, 2872 VM_REG_GUEST_R15, 2873 VM_REG_GUEST_DR0, 2874 VM_REG_GUEST_DR1, 2875 VM_REG_GUEST_DR2, 2876 VM_REG_GUEST_DR3, 2877 VM_REG_GUEST_EFER, 2878 }; 2879 const enum vm_reg_name data_segs[] = { 2880 VM_REG_GUEST_SS, 2881 VM_REG_GUEST_DS, 2882 VM_REG_GUEST_ES, 2883 VM_REG_GUEST_FS, 2884 VM_REG_GUEST_GS, 2885 }; 2886 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2887 2888 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2889 return (EINVAL); 2890 2891 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2892 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2893 } 2894 2895 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2896 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2897 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2898 2899 /* 2900 * The prescribed contents of %rdx differ slightly between the Intel and 2901 * AMD architectural definitions. The former expects the Extended Model 2902 * in bits 16-19 where the latter expects all the Family, Model, and 2903 * Stepping be there. Common boot ROMs appear to disregard this 2904 * anyways, so we stick with a compromise value similar to what is 2905 * spelled out in the Intel SDM. 
2906 */ 2907 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2908 2909 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2910 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2911 2912 /* CS: Present, R/W, Accessed */ 2913 desc.access = 0x0093; 2914 desc.base = 0xffff0000; 2915 desc.limit = 0xffff; 2916 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2917 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2918 2919 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2920 desc.access = 0x0093; 2921 desc.base = 0; 2922 desc.limit = 0xffff; 2923 for (uint_t i = 0; i < nitems(data_segs); i++) { 2924 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2925 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2926 } 2927 2928 /* GDTR, IDTR */ 2929 desc.base = 0; 2930 desc.limit = 0xffff; 2931 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2932 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2933 2934 /* LDTR: Present, LDT */ 2935 desc.access = 0x0082; 2936 desc.base = 0; 2937 desc.limit = 0xffff; 2938 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2939 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2940 2941 /* TR: Present, 32-bit TSS */ 2942 desc.access = 0x008b; 2943 desc.base = 0; 2944 desc.limit = 0xffff; 2945 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2946 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2947 2948 vlapic_reset(vm_lapic(vm, vcpuid)); 2949 2950 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2951 2952 vcpu->exitintinfo = 0; 2953 vcpu->exception_pending = 0; 2954 vcpu->nmi_pending = 0; 2955 vcpu->extint_pending = 0; 2956 2957 /* 2958 * A CPU reset caused by power-on or system reset clears more state than 2959 * one which is trigged from an INIT IPI. 
2960 */ 2961 if (!init_only) { 2962 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2963 hma_fpu_init(vcpu->guestfpu); 2964 2965 /* XXX: clear MSRs and other pieces */ 2966 } 2967 2968 return (0); 2969 } 2970 2971 static int 2972 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2973 { 2974 struct seg_desc desc; 2975 2976 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2977 return (EINVAL); 2978 2979 /* CS: Present, R/W, Accessed */ 2980 desc.access = 0x0093; 2981 desc.base = (uint64_t)vector << 12; 2982 desc.limit = 0xffff; 2983 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2984 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 2985 (uint64_t)vector << 8)); 2986 2987 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 2988 2989 return (0); 2990 } 2991 2992 int 2993 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 2994 { 2995 if (vcpu < 0 || vcpu >= vm->maxcpus) 2996 return (EINVAL); 2997 2998 if (type < 0 || type >= VM_CAP_MAX) 2999 return (EINVAL); 3000 3001 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3002 } 3003 3004 int 3005 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3006 { 3007 if (vcpu < 0 || vcpu >= vm->maxcpus) 3008 return (EINVAL); 3009 3010 if (type < 0 || type >= VM_CAP_MAX) 3011 return (EINVAL); 3012 3013 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3014 } 3015 3016 struct vlapic * 3017 vm_lapic(struct vm *vm, int cpu) 3018 { 3019 return (vm->vcpu[cpu].vlapic); 3020 } 3021 3022 struct vioapic * 3023 vm_ioapic(struct vm *vm) 3024 { 3025 3026 return (vm->vioapic); 3027 } 3028 3029 struct vhpet * 3030 vm_hpet(struct vm *vm) 3031 { 3032 3033 return (vm->vhpet); 3034 } 3035 3036 void * 3037 vm_iommu_domain(struct vm *vm) 3038 { 3039 3040 return (vm->iommu); 3041 } 3042 3043 int 3044 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3045 bool from_idle) 3046 { 3047 int error; 3048 struct vcpu *vcpu; 3049 3050 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3051 
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

/*
 * Sample the vcpu's state (and, optionally, the host CPU it last ran on)
 * under its lock.  Panics on an invalid vcpuid.
 */
enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

/*
 * Compute the TSC offset for the vcpu: boot-time offset plus any per-vcpu
 * adjustment, optionally including the current physical CPU's delta.
 */
uint64_t
vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
{
	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);

	uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;

	if (phys_adj) {
		/* Include any offset for the current physical CPU too */
		extern hrtime_t tsc_gethrtime_tick_delta(void);
		vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
	}

	return (vcpu_off);
}

/*
 * Mark the vcpu active.  Returns EBUSY when it is already active or the VM
 * is suspending, EINVAL for a bad vcpuid.
 */
int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	if (vm->suspend != 0) {
		return (EBUSY);
	}

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);

	/*
	 * It is possible that this vCPU was undergoing activation at the same
	 * time that the VM was being suspended.  If that happens to be the
	 * case, it should reflect the suspended state immediately.
	 */
	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
	}

	return (0);
}

/*
 * Place one vcpu (or, with vcpuid == -1, all active vcpus) into the debug
 * set and kick it so it exits to the debugger.
 */
int
vm_suspend_cpu(struct vm *vm, int vcpuid)
{
	int i;

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		vm->debug_cpus = vm->active_cpus;
		for (i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm, i);
		}
	} else {
		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vm, vcpuid);
	}
	return (0);
}

/* Remove one vcpu (or all, with vcpuid == -1) from the debug set. */
int
vm_resume_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
	}
	return (0);
}

/*
 * Check all conditions (suspend, reqidle, yield, debug) that require the
 * vcpu to bail out rather than (re)enter the guest.  Fills in the vcpu's
 * exitinfo and returns true when a bailout is required.
 */
static bool
vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
    uint64_t entry_rip)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	struct vm_exit *vme = &vcpu->exitinfo;
	bool bail = false;

	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);

	if (vm->suspend) {
		if (on_entry) {
			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
			    vm->suspend < VM_SUSPEND_LAST);

			vme->exitcode = VM_EXITCODE_SUSPENDED;
			vme->u.suspended.how = vm->suspend;
		} else {
			/*
			 * Handling VM suspend is complicated, so if that
			 * condition is detected outside of VM-entry itself,
			 * just emit a BOGUS exitcode so we take a lap to pick
			 * up the event during an entry and are directed into
			 * the vm_handle_suspend() logic.
			 */
			vme->exitcode = VM_EXITCODE_BOGUS;
		}
		bail = true;
	}
	if (vcpu->reqidle) {
		vme->exitcode = VM_EXITCODE_REQIDLE;
		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);

		if (!on_entry) {
			/*
			 * A reqidle request detected outside of VM-entry can be
			 * handled directly by clearing the request (and taking
			 * a lap to userspace).
			 */
			vcpu_assert_locked(vcpu);
			vcpu->reqidle = 0;
		}
		bail = true;
	}
	if (vcpu_should_yield(vm, vcpuid)) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
		bail = true;
	}
	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
		vme->exitcode = VM_EXITCODE_DEBUG;
		bail = true;
	}

	if (bail) {
		if (on_entry) {
			/*
			 * If bailing out during VM-entry, the current %rip must
			 * be recorded in the exitinfo.
			 */
			vme->rip = entry_rip;
		}
		vme->inst_length = 0;
	}
	return (bail);
}

/* Bailout check used just before a vcpu sleeps (HLT / wait-for-SIPI). */
static bool
vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
{
	/*
	 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or
	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
	 * structure, and we would only modify the exitcode.
	 */
	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
}

/* Bailout check used on VM entry; records 'rip' if a bailout occurs. */
bool
vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
{
	/*
	 * Bail-out checks done as part of VM entry require an updated %rip to
	 * populate the vm_exit struct if any of the conditions of interest are
	 * matched in the check.
	 */
	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
}

/* Accessor: set of activated vcpus */
cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

/* Accessor: set of vcpus held for debugging */
cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

/* Accessor: set of vcpus which have observed the VM suspend */
cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

/* Accessor: per-vcpu statistics block (no bounds check) */
void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

/* Read the vcpu's x2APIC mode state.  Returns 0 or EINVAL. */
int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

/* Set the vcpu's x2APIC mode state and propagate it to the vlapic. */
int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3319 */ 3320 static void 3321 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3322 { 3323 int hostcpu; 3324 3325 ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); 3326 3327 hostcpu = vcpu->hostcpu; 3328 if (vcpu->state == VCPU_RUNNING) { 3329 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3330 if (hostcpu != curcpu) { 3331 if (ntype == VCPU_NOTIFY_APIC) { 3332 vlapic_post_intr(vcpu->vlapic, hostcpu); 3333 } else { 3334 poke_cpu(hostcpu); 3335 } 3336 } else { 3337 /* 3338 * If the 'vcpu' is running on 'curcpu' then it must 3339 * be sending a notification to itself (e.g. SELF_IPI). 3340 * The pending event will be picked up when the vcpu 3341 * transitions back to guest context. 3342 */ 3343 } 3344 } else { 3345 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3346 "with hostcpu %d", vcpu->state, hostcpu)); 3347 if (vcpu->state == VCPU_SLEEPING) { 3348 cv_signal(&vcpu->vcpu_cv); 3349 } 3350 } 3351 } 3352 3353 void 3354 vcpu_notify_event(struct vm *vm, int vcpuid) 3355 { 3356 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3357 3358 vcpu_lock(vcpu); 3359 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3360 vcpu_unlock(vcpu); 3361 } 3362 3363 void 3364 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3365 { 3366 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3367 3368 if (ntype == VCPU_NOTIFY_NONE) { 3369 return; 3370 } 3371 3372 vcpu_lock(vcpu); 3373 vcpu_notify_event_locked(vcpu, ntype); 3374 vcpu_unlock(vcpu); 3375 } 3376 3377 void 3378 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3379 { 3380 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3381 hrtime_t now = gethrtime(); 3382 3383 ASSERT3U(ustate, !=, vcpu->ustate); 3384 ASSERT3S(ustate, <, VU_MAX); 3385 ASSERT3S(ustate, >=, VU_INIT); 3386 3387 hrtime_t delta = now - vcpu->ustate_when; 3388 vcpu->ustate_total[vcpu->ustate] += delta; 3389 3390 membar_producer(); 3391 3392 vcpu->ustate_when = now; 3393 vcpu->ustate = ustate; 3394 } 3395 3396 
/* Return the vmspace backing this VM's guest-physical memory. */
struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

/* Return the per-vCPU vm_client handle used for guest memory access. */
struct vm_client *
vm_get_vmclient(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid].vmclient);
}

/* Translate an APIC ID to a vCPU ID. */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

/* Accessor: virtual 8259 PIC instance for this VM. */
struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

/* Accessor: virtual 8254 PIT instance for this VM. */
struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

/* Accessor: virtual power-management timer instance for this VM. */
struct vpmtmr *
vm_pmtmr(struct vm *vm)
{

	return (vm->vpmtmr);
}

/* Accessor: virtual RTC instance for this VM. */
struct vrtc *
vm_rtc(struct vm *vm)
{

	return (vm->vrtc);
}

/*
 * Map a segment encoding (0..5) to the corresponding guest segment register
 * name.  Asserts on out-of-range input.
 */
enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

/*
 * Release any page holds taken by vm_copy_setup() and clear the copyinfo
 * array.  Safe to call on a partially-populated array: entries with a NULL
 * cookie are skipped.
 */
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    uint_t num_copyinfo)
{
	for (uint_t idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL) {
			vmp_release((vm_page_t *)copyinfo[idx].cookie);
		}
	}
	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
}

/*
 * Prepare a guest-linear-address range for copying by translating it to
 * guest-physical pages and taking holds on each page.
 *
 * Phase 1 walks the GLA range page by page via vm_gla2gpa(), recording the
 * gpa/len of each piece.  A translation error or guest-visible fault returns
 * early with 'error'/'*fault' set by vm_gla2gpa().
 *
 * Phase 2 holds each page (vmc_hold) and records a host virtual address with
 * the appropriate writability.  If any hold fails, everything acquired so
 * far is torn down and EFAULT is returned.
 *
 * On success returns 0 with *fault cleared; the caller must eventually call
 * vm_copy_teardown() to drop the page holds.
 */
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    uint_t num_copyinfo, int *fault)
{
	uint_t idx, nused;
	size_t n, off, remaining;
	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);

	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		uint64_t gpa;
		int error;

		/* Caller must size copyinfo for the worst-case page count. */
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		off = gpa & PAGEOFFSET;
		/* Clamp each piece to the end of its page. */
		n = min(remaining, PAGESIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		vm_page_t *vmp;
		caddr_t hva;

		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
		if (vmp == NULL) {
			break;
		}
		if ((prot & PROT_WRITE) != 0) {
			hva = (caddr_t)vmp_get_writable(vmp);
		} else {
			hva = (caddr_t)vmp_get_readable(vmp);
		}
		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
		copyinfo[idx].cookie = vmp;
		copyinfo[idx].prot = prot;
	}

	if (idx != nused) {
		/* A hold failed part-way through: release what we took. */
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

/*
 * Copy 'len' bytes from guest memory (described by a copyinfo array built by
 * vm_copy_setup()) into the kernel buffer 'kaddr'.  The copyinfo array must
 * cover at least 'len' bytes with PROT_READ access.
 */
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		ASSERT(copyinfo[idx].prot & PROT_READ);

		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Copy 'len' bytes from the kernel buffer 'kaddr' into guest memory
 * (described by a copyinfo array built by vm_copy_setup()).  The copyinfo
 * array must cover at least 'len' bytes with PROT_WRITE access.
 */
void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		ASSERT(copyinfo[idx].prot & PROT_WRITE);

		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Return the amount of in-use and wired memory for the VM.
 * Since these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);

/*
 * Stat callback: record the VM's resident page count (in bytes) against
 * vCPU 0 only, since the value is VM-wide rather than per-vCPU.
 */
static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);

/* Dispatch an in/out port access to the VM's registered handlers. */
int
vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
    uint8_t bytes, uint32_t *val)
{
	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
}

/*
 * bhyve-internal interfaces to attach or detach IO port handlers.
 * Must be called with VM write lock held for safety.
 */
int
vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
    void **cookie)
{
	int err;
	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
	if (err == 0) {
		/* The cookie encodes func/arg/port for later validation. */
		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
	}
	return (err);
}
int
vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
    void **old_arg)
{
	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
	int err;

	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
	if (err == 0) {
		*cookie = NULL;
	}
	return (err);
}

/*
 * External driver interfaces to attach or detach IO port handlers.
 * Must be called with VM write lock held for safety.
 */
int
vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
    void *arg, void **cookie)
{
	int err;

	/* Port 0 is not a valid hook target. */
	if (port == 0) {
		return (EINVAL);
	}

	err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
	if (err == 0) {
		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
	}
	return (err);
}
void
vm_ioport_unhook(struct vm *vm, void **cookie)
{
	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
	ioport_handler_t old_func;
	void *old_arg;
	int err;

	err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);

	/* ioport-hook-using drivers are expected to be well-behaved */
	VERIFY0(err);
	/* Confirm the detached handler matches the cookie handed out. */
	VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);

	*cookie = NULL;
}

/*
 * kstat update callback: publish per-vCPU microstate accounting totals.
 * The target vCPU is identified by the vvk_vcpu field in the kstat data.
 * Note: the totals are read without taking the vcpu lock here.
 */
int
vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
{
	struct vm *vm = ksp->ks_private;
	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
	const int vcpuid = vvk->vvk_vcpu.value.ui32;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	ASSERT3U(vcpuid, <, VM_MAXCPU);

	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];

	return (0);
}