1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * This file and its contents are supplied under the terms of the 32 * Common Development and Distribution License ("CDDL"), version 1.0. 33 * You may only use this file in accordance with the terms of version 34 * 1.0 of the CDDL. 35 * 36 * A full copy of the text of the CDDL should have accompanied this 37 * source. A copy of the CDDL is also available via the Internet at 38 * http://www.illumos.org/license/CDDL. 39 * 40 * Copyright 2015 Pluribus Networks Inc. 41 * Copyright 2018 Joyent, Inc. 42 * Copyright 2022 Oxide Computer Company 43 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/kernel.h> 52 #include <sys/module.h> 53 #include <sys/sysctl.h> 54 #include <sys/malloc.h> 55 #include <sys/pcpu.h> 56 #include <sys/mutex.h> 57 #include <sys/proc.h> 58 #include <sys/rwlock.h> 59 #include <sys/sched.h> 60 #include <sys/systm.h> 61 #include <sys/sunddi.h> 62 #include <sys/hma.h> 63 64 #include <machine/md_var.h> 65 #include <x86/psl.h> 66 #include <x86/apicreg.h> 67 68 #include <machine/specialreg.h> 69 #include <machine/vmm.h> 70 #include <machine/vmm_dev.h> 71 #include <machine/vmparam.h> 72 #include <sys/vmm_instruction_emul.h> 73 #include <sys/vmm_vm.h> 74 #include <sys/vmm_gpt.h> 75 76 #include "vmm_ioport.h" 77 #include "vmm_ktr.h" 78 #include "vmm_host.h" 79 #include "vmm_util.h" 80 #include "vatpic.h" 81 #include "vatpit.h" 82 #include "vhpet.h" 83 #include "vioapic.h" 84 #include "vlapic.h" 85 #include "vpmtmr.h" 86 #include "vrtc.h" 87 #include "vmm_stat.h" 88 #include "vmm_lapic.h" 89 90 #include "io/ppt.h" 91 #include "io/iommu.h" 92 93 struct vlapic; 94 95 /* Flags for vtc_status */ 96 #define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ 97 #define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ 98 99 typedef struct vm_thread_ctx { 100 struct vm *vtc_vm; 101 int vtc_vcpuid; 102 uint_t vtc_status; 103 enum vcpu_ustate vtc_ustate; 104 } vm_thread_ctx_t; 105 106 #define VMM_MTRR_VAR_MAX 10 107 #define VMM_MTRR_DEF_MASK \ 108 (MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE) 109 #define VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE) 110 #define VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID) 111 struct vm_mtrr { 112 uint64_t def_type; 113 uint64_t fixed4k[8]; 114 uint64_t fixed16k[2]; 115 uint64_t fixed64k; 116 struct { 117 uint64_t base; 118 uint64_t mask; 119 } var[VMM_MTRR_VAR_MAX]; 120 }; 121 122 /* 123 * Initialization: 124 * (a) allocated when vcpu is created 125 * (i) initialized when vcpu is created and when it is reinitialized 126 * (o) initialized the first time the vcpu is created 127 * (x) initialized before use 128 */ 129 struct vcpu { 130 /* (o) protects state, run_state, hostcpu, sipi_vector */ 131 kmutex_t lock; 132 133 enum vcpu_state state; /* (o) vcpu state */ 134 enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ 135 kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ 136 kcondvar_t state_cv; /* (o) IDLE-transition cv */ 137 int hostcpu; /* (o) vcpu's current host cpu */ 138 int lastloccpu; /* (o) last host cpu localized to */ 139 int reqidle; /* (i) request vcpu to idle */ 140 struct vlapic *vlapic; /* (i) APIC device model */ 141 enum x2apic_state x2apic_state; /* (i) APIC mode */ 142 uint64_t exit_intinfo; /* (i) events pending at VM exit */ 143 uint64_t exc_pending; /* (i) exception pending */ 144 bool nmi_pending; /* (i) NMI pending */ 145 bool extint_pending; /* (i) INTR pending */ 146 147 uint8_t sipi_vector; /* (i) SIPI vector */ 148 hma_fpu_t *guestfpu; /* (a,i) guest fpu state */ 149 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 150 void *stats; /* (a,i) statistics */ 151 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 152 uint64_t nextrip; /* (x) next instruction to execute */ 153 struct vie *vie_ctx; /* (x) instruction emulation context */ 154 vm_client_t *vmclient; /* (a) VM-system client */ 155 uint64_t tsc_offset; /* (x) offset from host TSC */ 156 struct vm_mtrr mtrr; /* (i) vcpu's 
MTRR */ 157 158 enum vcpu_ustate ustate; /* (i) microstate for the vcpu */ 159 hrtime_t ustate_when; /* (i) time of last ustate change */ 160 uint64_t ustate_total[VU_MAX]; /* (o) total time spent in ustates */ 161 vm_thread_ctx_t vtc; /* (o) thread state for ctxops */ 162 struct ctxop *ctxop; /* (o) ctxop storage for vcpu */ 163 }; 164 165 #define vcpu_lock(v) mutex_enter(&((v)->lock)) 166 #define vcpu_unlock(v) mutex_exit(&((v)->lock)) 167 #define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock))) 168 169 struct mem_seg { 170 size_t len; 171 bool sysmem; 172 vm_object_t *object; 173 }; 174 #define VM_MAX_MEMSEGS 5 175 176 struct mem_map { 177 vm_paddr_t gpa; 178 size_t len; 179 vm_ooffset_t segoff; 180 int segid; 181 int prot; 182 int flags; 183 }; 184 #define VM_MAX_MEMMAPS 8 185 186 /* 187 * Initialization: 188 * (o) initialized the first time the VM is created 189 * (i) initialized when VM is created and when it is reinitialized 190 * (x) initialized before use 191 */ 192 struct vm { 193 void *cookie; /* (i) cpu-specific data */ 194 void *iommu; /* (x) iommu-specific data */ 195 struct vhpet *vhpet; /* (i) virtual HPET */ 196 struct vioapic *vioapic; /* (i) virtual ioapic */ 197 struct vatpic *vatpic; /* (i) virtual atpic */ 198 struct vatpit *vatpit; /* (i) virtual atpit */ 199 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 200 struct vrtc *vrtc; /* (o) virtual RTC */ 201 volatile cpuset_t active_cpus; /* (i) active vcpus */ 202 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for dbg */ 203 int suspend; /* (i) stop VM execution */ 204 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 205 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 206 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 207 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 208 struct vmspace *vmspace; /* (o) guest's address space */ 209 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 210 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 211 /* The following describe the vm cpu topology */ 212 uint16_t sockets; /* (o) num of sockets */ 213 uint16_t cores; /* (o) num of cores/socket */ 214 uint16_t threads; /* (o) num of threads/core */ 215 uint16_t maxcpus; /* (o) max pluggable cpus */ 216 uint64_t boot_tsc_offset; /* (i) TSC offset at VM boot */ 217 218 struct ioport_config ioports; /* (o) ioport handling */ 219 220 bool mem_transient; /* (o) alloc transient memory */ 221 }; 222 223 static int vmm_initialized; 224 225 226 static void 227 nullop_panic(void) 228 { 229 panic("null vmm operation call"); 230 } 231 232 /* Do not allow use of an un-set `ops` to do anything but panic */ 233 static struct vmm_ops vmm_ops_null = { 234 .init = (vmm_init_func_t)nullop_panic, 235 .cleanup = (vmm_cleanup_func_t)nullop_panic, 236 .resume = (vmm_resume_func_t)nullop_panic, 237 .vminit = (vmi_init_func_t)nullop_panic, 238 .vmrun = (vmi_run_func_t)nullop_panic, 239 .vmcleanup = (vmi_cleanup_func_t)nullop_panic, 240 .vmgetreg = (vmi_get_register_t)nullop_panic, 241 .vmsetreg = (vmi_set_register_t)nullop_panic, 242 .vmgetdesc = (vmi_get_desc_t)nullop_panic, 243 .vmsetdesc = (vmi_set_desc_t)nullop_panic, 244 .vmgetcap = (vmi_get_cap_t)nullop_panic, 245 .vmsetcap = (vmi_set_cap_t)nullop_panic, 246 .vlapic_init = (vmi_vlapic_init)nullop_panic, 247 .vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic, 248 .vmsavectx = (vmi_savectx)nullop_panic, 249 .vmrestorectx = (vmi_restorectx)nullop_panic, 250 }; 251 252 static struct vmm_ops *ops = &vmm_ops_null; 
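/*
 * The real backend is installed by vmm_init() below: vmm_ops_intel (with
 * ept_pte_ops) on Intel/VMX hosts, or vmm_ops_amd (with rvi_pte_ops) on
 * AMD/SVM hosts.  Until that happens, any call made through the VMM_*() and
 * VM*() dispatch macros that follow lands in nullop_panic().
 */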
253 static vmm_pte_ops_t *pte_ops = NULL; 254 255 #define VMM_INIT() ((*ops->init)()) 256 #define VMM_CLEANUP() ((*ops->cleanup)()) 257 #define VMM_RESUME() ((*ops->resume)()) 258 259 #define VMINIT(vm) ((*ops->vminit)(vm)) 260 #define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip)) 261 #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) 262 263 #define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv)) 264 #define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val)) 265 #define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc)) 266 #define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc)) 267 #define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv)) 268 #define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val)) 269 #define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu)) 270 #define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic)) 271 272 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 273 #define fpu_stop_emulating() clts() 274 275 SDT_PROVIDER_DEFINE(vmm); 276 277 static MALLOC_DEFINE(M_VM, "vm", "vm"); 278 279 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 280 NULL); 281 282 /* 283 * Halt the guest if all vcpus are executing a HLT instruction with 284 * interrupts disabled. 285 */ 286 static int halt_detection_enabled = 1; 287 288 /* Trap into hypervisor on all guest exceptions and reflect them back */ 289 static int trace_guest_exceptions; 290 291 static void vm_free_memmap(struct vm *vm, int ident); 292 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 293 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); 294 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); 295 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); 296 297 static void vmm_savectx(void *); 298 static void vmm_restorectx(void *); 299 static const struct ctxop_template vmm_ctxop_tpl = { 300 .ct_rev = CTXOP_TPL_REV, 301 .ct_save = vmm_savectx, 302 .ct_restore = vmm_restorectx, 303 }; 304 305 #ifdef KTR 306 static const char * 307 vcpu_state2str(enum vcpu_state state) 308 { 309 310 switch (state) { 311 case VCPU_IDLE: 312 return ("idle"); 313 case VCPU_FROZEN: 314 return ("frozen"); 315 case VCPU_RUNNING: 316 return ("running"); 317 case VCPU_SLEEPING: 318 return ("sleeping"); 319 default: 320 return ("unknown"); 321 } 322 } 323 #endif 324 325 static void 326 vcpu_cleanup(struct vm *vm, int i, bool destroy) 327 { 328 struct vcpu *vcpu = &vm->vcpu[i]; 329 330 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 331 if (destroy) { 332 vmm_stat_free(vcpu->stats); 333 334 hma_fpu_free(vcpu->guestfpu); 335 vcpu->guestfpu = NULL; 336 337 vie_free(vcpu->vie_ctx); 338 vcpu->vie_ctx = NULL; 339 340 vmc_destroy(vcpu->vmclient); 341 vcpu->vmclient = NULL; 342 343 ctxop_free(vcpu->ctxop); 344 mutex_destroy(&vcpu->lock); 345 } 346 } 347 348 static void 349 vcpu_init(struct vm *vm, int vcpu_id, bool create) 350 { 351 struct vcpu *vcpu; 352 353 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, 354 ("vcpu_init: invalid vcpu %d", vcpu_id)); 355 356 vcpu = &vm->vcpu[vcpu_id]; 357 358 if (create) { 359 mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL); 360 361 vcpu->state = VCPU_IDLE; 362 vcpu->hostcpu = NOCPU; 363 vcpu->lastloccpu = NOCPU; 364 vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); 365 vcpu->stats = vmm_stat_alloc(); 366 vcpu->vie_ctx = vie_alloc(); 367 368 vcpu->ustate = VU_INIT; 369 vcpu->ustate_when = 
	    gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	iommu_cleanup();
	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0). Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets. A reading of the TSC is negated to form the boot offset.
	 */
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
}

/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
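 * When enabled, the flag is handed to vmspace_alloc() in vm_create() so the
 * guest page tables are built with dirty tracking, which
 * vm_track_dirty_pages() relies on when harvesting the dirty bitmap via
 * vmspace_track_dirty().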
520 * (Remains off by default for now) 521 */ 522 bool gpt_track_dirty = false; 523 524 int 525 vm_create(const char *name, uint64_t flags, struct vm **retvm) 526 { 527 struct vm *vm; 528 struct vmspace *vmspace; 529 530 /* 531 * If vmm.ko could not be successfully initialized then don't attempt 532 * to create the virtual machine. 533 */ 534 if (!vmm_initialized) 535 return (ENXIO); 536 537 /* Name validation has already occurred */ 538 VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN); 539 540 vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty); 541 if (vmspace == NULL) 542 return (ENOMEM); 543 544 vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO); 545 (void) strlcpy(vm->name, name, sizeof (vm->name)); 546 547 vm->vmspace = vmspace; 548 vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0; 549 for (uint_t i = 0; i < VM_MAXCPU; i++) { 550 vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace); 551 } 552 553 vm->sockets = 1; 554 vm->cores = cores_per_package; /* XXX backwards compatibility */ 555 vm->threads = threads_per_core; /* XXX backwards compatibility */ 556 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 557 558 vm_init(vm, true); 559 560 *retvm = vm; 561 return (0); 562 } 563 564 void 565 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 566 uint16_t *threads, uint16_t *maxcpus) 567 { 568 *sockets = vm->sockets; 569 *cores = vm->cores; 570 *threads = vm->threads; 571 *maxcpus = vm->maxcpus; 572 } 573 574 uint16_t 575 vm_get_maxcpus(struct vm *vm) 576 { 577 return (vm->maxcpus); 578 } 579 580 int 581 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 582 uint16_t threads, uint16_t maxcpus) 583 { 584 if (maxcpus != 0) 585 return (EINVAL); /* XXX remove when supported */ 586 if ((sockets * cores * threads) > vm->maxcpus) 587 return (EINVAL); 588 /* XXX need to check sockets * cores * threads == vCPU, how? */ 589 vm->sockets = sockets; 590 vm->cores = cores; 591 vm->threads = threads; 592 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 593 return (0); 594 } 595 596 static void 597 vm_cleanup(struct vm *vm, bool destroy) 598 { 599 struct mem_map *mm; 600 int i; 601 602 ppt_unassign_all(vm); 603 604 if (vm->iommu != NULL) 605 iommu_destroy_domain(vm->iommu); 606 607 /* 608 * Devices which attach their own ioport hooks should be cleaned up 609 * first so they can tear down those registrations. 610 */ 611 vpmtmr_cleanup(vm->vpmtmr); 612 613 vm_inout_cleanup(vm, &vm->ioports); 614 615 if (destroy) 616 vrtc_cleanup(vm->vrtc); 617 else 618 vrtc_reset(vm->vrtc); 619 620 vatpit_cleanup(vm->vatpit); 621 vhpet_cleanup(vm->vhpet); 622 vatpic_cleanup(vm->vatpic); 623 vioapic_cleanup(vm->vioapic); 624 625 for (i = 0; i < vm->maxcpus; i++) 626 vcpu_cleanup(vm, i, destroy); 627 628 VMCLEANUP(vm->cookie); 629 630 /* 631 * System memory is removed from the guest address space only when 632 * the VM is destroyed. This is because the mapping remains the same 633 * across VM reset. 634 * 635 * Device memory can be relocated by the guest (e.g. using PCI BARs) 636 * so those mappings are removed on a VM reset. 637 */ 638 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 639 mm = &vm->mem_maps[i]; 640 if (destroy || !sysmem_mapping(vm, mm)) { 641 vm_free_memmap(vm, i); 642 } else { 643 /* 644 * We need to reset the IOMMU flag so this mapping can 645 * be reused when a VM is rebooted. Since the IOMMU 646 * domain has already been destroyed we can just reset 647 * the flag here. 
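			 * (Should a passthrough device be assigned again
			 * after the reboot, vm_iommu_modify() will re-create
			 * the IOMMU mappings and set the flag once more.)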
648 */ 649 mm->flags &= ~VM_MEMMAP_F_IOMMU; 650 } 651 } 652 653 if (destroy) { 654 for (i = 0; i < VM_MAX_MEMSEGS; i++) 655 vm_free_memseg(vm, i); 656 657 vmspace_destroy(vm->vmspace); 658 vm->vmspace = NULL; 659 } 660 } 661 662 void 663 vm_destroy(struct vm *vm) 664 { 665 vm_cleanup(vm, true); 666 free(vm, M_VM); 667 } 668 669 int 670 vm_reinit(struct vm *vm, uint64_t flags) 671 { 672 /* A virtual machine can be reset only if all vcpus are suspended. */ 673 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) { 674 if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) { 675 return (EBUSY); 676 } 677 678 /* 679 * Force the VM (and all its vCPUs) into a suspended state. 680 * This should be quick and easy, since the vm_reinit() call is 681 * made while holding the VM write lock, which requires holding 682 * all of the vCPUs in the VCPU_FROZEN state. 683 */ 684 (void) atomic_cmpset_int((uint_t *)&vm->suspend, 0, 685 VM_SUSPEND_RESET); 686 for (uint_t i = 0; i < vm->maxcpus; i++) { 687 struct vcpu *vcpu = &vm->vcpu[i]; 688 689 if (CPU_ISSET(i, &vm->suspended_cpus) || 690 !CPU_ISSET(i, &vm->active_cpus)) { 691 continue; 692 } 693 694 vcpu_lock(vcpu); 695 VERIFY3U(vcpu->state, ==, VCPU_FROZEN); 696 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 697 vcpu_unlock(vcpu); 698 } 699 700 VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus)); 701 } 702 703 vm_cleanup(vm, false); 704 vm_init(vm, false); 705 return (0); 706 } 707 708 const char * 709 vm_name(struct vm *vm) 710 { 711 return (vm->name); 712 } 713 714 int 715 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 716 { 717 vm_object_t *obj; 718 719 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 720 return (ENOMEM); 721 else 722 return (0); 723 } 724 725 int 726 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 727 { 728 return (vmspace_unmap(vm->vmspace, gpa, gpa + len)); 729 } 730 731 /* 732 * Return 'true' if 'gpa' is allocated in the guest address space. 733 * 734 * This function is called in the context of a running vcpu which acts as 735 * an implicit lock on 'vm->mem_maps[]'. 
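 * (Under INVARIANTS, the KASSERT below checks exactly that: the caller must
 * be the vcpu in question, in the VCPU_RUNNING state on the current host
 * CPU.)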
736 */ 737 bool 738 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 739 { 740 struct mem_map *mm; 741 int i; 742 743 #ifdef INVARIANTS 744 int hostcpu, state; 745 state = vcpu_get_state(vm, vcpuid, &hostcpu); 746 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 747 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 748 #endif 749 750 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 751 mm = &vm->mem_maps[i]; 752 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 753 return (true); /* 'gpa' is sysmem or devmem */ 754 } 755 756 if (ppt_is_mmio(vm, gpa)) 757 return (true); /* 'gpa' is pci passthru mmio */ 758 759 return (false); 760 } 761 762 int 763 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 764 { 765 struct mem_seg *seg; 766 vm_object_t *obj; 767 768 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 769 return (EINVAL); 770 771 if (len == 0 || (len & PAGE_MASK)) 772 return (EINVAL); 773 774 seg = &vm->mem_segs[ident]; 775 if (seg->object != NULL) { 776 if (seg->len == len && seg->sysmem == sysmem) 777 return (EEXIST); 778 else 779 return (EINVAL); 780 } 781 782 obj = vm_object_mem_allocate(len, vm->mem_transient); 783 if (obj == NULL) 784 return (ENOMEM); 785 786 seg->len = len; 787 seg->object = obj; 788 seg->sysmem = sysmem; 789 return (0); 790 } 791 792 int 793 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 794 vm_object_t **objptr) 795 { 796 struct mem_seg *seg; 797 798 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 799 return (EINVAL); 800 801 seg = &vm->mem_segs[ident]; 802 if (len) 803 *len = seg->len; 804 if (sysmem) 805 *sysmem = seg->sysmem; 806 if (objptr) 807 *objptr = seg->object; 808 return (0); 809 } 810 811 void 812 vm_free_memseg(struct vm *vm, int ident) 813 { 814 struct mem_seg *seg; 815 816 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 817 ("%s: invalid memseg ident %d", __func__, ident)); 818 819 seg = &vm->mem_segs[ident]; 820 if (seg->object != NULL) { 821 vm_object_release(seg->object); 822 bzero(seg, sizeof (struct mem_seg)); 823 } 824 } 825 826 int 827 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 828 size_t len, int prot, int flags) 829 { 830 struct mem_seg *seg; 831 struct mem_map *m, *map; 832 vm_ooffset_t last; 833 int i, error; 834 835 if (prot == 0 || (prot & ~(PROT_ALL)) != 0) 836 return (EINVAL); 837 838 if (flags & ~VM_MEMMAP_F_WIRED) 839 return (EINVAL); 840 841 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 842 return (EINVAL); 843 844 seg = &vm->mem_segs[segid]; 845 if (seg->object == NULL) 846 return (EINVAL); 847 848 last = first + len; 849 if (first < 0 || first >= last || last > seg->len) 850 return (EINVAL); 851 852 if ((gpa | first | last) & PAGE_MASK) 853 return (EINVAL); 854 855 map = NULL; 856 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 857 m = &vm->mem_maps[i]; 858 if (m->len == 0) { 859 map = m; 860 break; 861 } 862 } 863 864 if (map == NULL) 865 return (ENOSPC); 866 867 error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot); 868 if (error != 0) 869 return (EFAULT); 870 871 vm_object_reference(seg->object); 872 873 if ((flags & VM_MEMMAP_F_WIRED) != 0) { 874 error = vmspace_populate(vm->vmspace, gpa, gpa + len); 875 if (error != 0) { 876 VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len)); 877 return (EFAULT); 878 } 879 } 880 881 map->gpa = gpa; 882 map->len = len; 883 map->segoff = first; 884 map->segid = segid; 885 map->prot = prot; 886 map->flags = flags; 887 return (0); 888 } 889 890 int 891 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t 
len) 892 { 893 struct mem_map *m; 894 int i; 895 896 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 897 m = &vm->mem_maps[i]; 898 if (m->gpa == gpa && m->len == len && 899 (m->flags & VM_MEMMAP_F_IOMMU) == 0) { 900 vm_free_memmap(vm, i); 901 return (0); 902 } 903 } 904 905 return (EINVAL); 906 } 907 908 int 909 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 910 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 911 { 912 struct mem_map *mm, *mmnext; 913 int i; 914 915 mmnext = NULL; 916 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 917 mm = &vm->mem_maps[i]; 918 if (mm->len == 0 || mm->gpa < *gpa) 919 continue; 920 if (mmnext == NULL || mm->gpa < mmnext->gpa) 921 mmnext = mm; 922 } 923 924 if (mmnext != NULL) { 925 *gpa = mmnext->gpa; 926 if (segid) 927 *segid = mmnext->segid; 928 if (segoff) 929 *segoff = mmnext->segoff; 930 if (len) 931 *len = mmnext->len; 932 if (prot) 933 *prot = mmnext->prot; 934 if (flags) 935 *flags = mmnext->flags; 936 return (0); 937 } else { 938 return (ENOENT); 939 } 940 } 941 942 static void 943 vm_free_memmap(struct vm *vm, int ident) 944 { 945 struct mem_map *mm; 946 int error; 947 948 mm = &vm->mem_maps[ident]; 949 if (mm->len) { 950 error = vmspace_unmap(vm->vmspace, mm->gpa, 951 mm->gpa + mm->len); 952 KASSERT(error == 0, ("%s: vmspace_unmap error %d", 953 __func__, error)); 954 bzero(mm, sizeof (struct mem_map)); 955 } 956 } 957 958 static __inline bool 959 sysmem_mapping(struct vm *vm, struct mem_map *mm) 960 { 961 962 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 963 return (true); 964 else 965 return (false); 966 } 967 968 vm_paddr_t 969 vmm_sysmem_maxaddr(struct vm *vm) 970 { 971 struct mem_map *mm; 972 vm_paddr_t maxaddr; 973 int i; 974 975 maxaddr = 0; 976 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 977 mm = &vm->mem_maps[i]; 978 if (sysmem_mapping(vm, mm)) { 979 if (maxaddr < mm->gpa + mm->len) 980 maxaddr = mm->gpa + mm->len; 981 } 982 } 983 return (maxaddr); 984 } 985 986 static void 987 vm_iommu_modify(struct vm *vm, bool map) 988 { 989 int i, sz; 990 vm_paddr_t gpa, hpa; 991 struct mem_map *mm; 992 #ifdef __FreeBSD__ 993 void *vp, *cookie, *host_domain; 994 #endif 995 vm_client_t *vmc; 996 997 sz = PAGE_SIZE; 998 #ifdef __FreeBSD__ 999 host_domain = iommu_host_domain(); 1000 #endif 1001 vmc = vmspace_client_alloc(vm->vmspace); 1002 1003 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 1004 mm = &vm->mem_maps[i]; 1005 if (!sysmem_mapping(vm, mm)) 1006 continue; 1007 1008 if (map) { 1009 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 1010 ("iommu map found invalid memmap %lx/%lx/%x", 1011 mm->gpa, mm->len, mm->flags)); 1012 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 1013 continue; 1014 mm->flags |= VM_MEMMAP_F_IOMMU; 1015 } else { 1016 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 1017 continue; 1018 mm->flags &= ~VM_MEMMAP_F_IOMMU; 1019 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 1020 ("iommu unmap found invalid memmap %lx/%lx/%x", 1021 mm->gpa, mm->len, mm->flags)); 1022 } 1023 1024 gpa = mm->gpa; 1025 while (gpa < mm->gpa + mm->len) { 1026 vm_page_t *vmp; 1027 1028 vmp = vmc_hold(vmc, gpa, PROT_WRITE); 1029 ASSERT(vmp != NULL); 1030 hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT); 1031 (void) vmp_release(vmp); 1032 1033 if (map) { 1034 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 1035 #ifdef __FreeBSD__ 1036 iommu_remove_mapping(host_domain, hpa, sz); 1037 #endif 1038 } else { 1039 iommu_remove_mapping(vm->iommu, gpa, sz); 1040 #ifdef __FreeBSD__ 1041 iommu_create_mapping(host_domain, hpa, hpa, sz); 1042 #endif 1043 } 1044 1045 gpa += PAGE_SIZE; 
1046 } 1047 } 1048 vmc_destroy(vmc); 1049 1050 /* 1051 * Invalidate the cached translations associated with the domain 1052 * from which pages were removed. 1053 */ 1054 #ifdef __FreeBSD__ 1055 if (map) 1056 iommu_invalidate_tlb(host_domain); 1057 else 1058 iommu_invalidate_tlb(vm->iommu); 1059 #else 1060 iommu_invalidate_tlb(vm->iommu); 1061 #endif 1062 } 1063 1064 int 1065 vm_unassign_pptdev(struct vm *vm, int pptfd) 1066 { 1067 int error; 1068 1069 error = ppt_unassign_device(vm, pptfd); 1070 if (error) 1071 return (error); 1072 1073 if (ppt_assigned_devices(vm) == 0) 1074 vm_iommu_modify(vm, false); 1075 1076 return (0); 1077 } 1078 1079 int 1080 vm_assign_pptdev(struct vm *vm, int pptfd) 1081 { 1082 int error; 1083 vm_paddr_t maxaddr; 1084 1085 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 1086 if (ppt_assigned_devices(vm) == 0) { 1087 KASSERT(vm->iommu == NULL, 1088 ("vm_assign_pptdev: iommu must be NULL")); 1089 maxaddr = vmm_sysmem_maxaddr(vm); 1090 vm->iommu = iommu_create_domain(maxaddr); 1091 if (vm->iommu == NULL) 1092 return (ENXIO); 1093 vm_iommu_modify(vm, true); 1094 } 1095 1096 error = ppt_assign_device(vm, pptfd); 1097 return (error); 1098 } 1099 1100 int 1101 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 1102 { 1103 1104 if (vcpu < 0 || vcpu >= vm->maxcpus) 1105 return (EINVAL); 1106 1107 if (reg >= VM_REG_LAST) 1108 return (EINVAL); 1109 1110 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 1111 } 1112 1113 int 1114 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 1115 { 1116 struct vcpu *vcpu; 1117 int error; 1118 1119 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1120 return (EINVAL); 1121 1122 if (reg >= VM_REG_LAST) 1123 return (EINVAL); 1124 1125 error = VMSETREG(vm->cookie, vcpuid, reg, val); 1126 if (error || reg != VM_REG_GUEST_RIP) 1127 return (error); 1128 1129 /* Set 'nextrip' to match the value of %rip */ 1130 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val); 1131 vcpu = &vm->vcpu[vcpuid]; 1132 vcpu->nextrip = val; 1133 return (0); 1134 } 1135 1136 static bool 1137 is_descriptor_table(int reg) 1138 { 1139 switch (reg) { 1140 case VM_REG_GUEST_IDTR: 1141 case VM_REG_GUEST_GDTR: 1142 return (true); 1143 default: 1144 return (false); 1145 } 1146 } 1147 1148 static bool 1149 is_segment_register(int reg) 1150 { 1151 switch (reg) { 1152 case VM_REG_GUEST_ES: 1153 case VM_REG_GUEST_CS: 1154 case VM_REG_GUEST_SS: 1155 case VM_REG_GUEST_DS: 1156 case VM_REG_GUEST_FS: 1157 case VM_REG_GUEST_GS: 1158 case VM_REG_GUEST_TR: 1159 case VM_REG_GUEST_LDTR: 1160 return (true); 1161 default: 1162 return (false); 1163 } 1164 } 1165 1166 int 1167 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) 1168 { 1169 1170 if (vcpu < 0 || vcpu >= vm->maxcpus) 1171 return (EINVAL); 1172 1173 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1174 return (EINVAL); 1175 1176 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 1177 } 1178 1179 int 1180 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) 1181 { 1182 if (vcpu < 0 || vcpu >= vm->maxcpus) 1183 return (EINVAL); 1184 1185 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1186 return (EINVAL); 1187 1188 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 1189 } 1190 1191 static int 1192 translate_hma_xsave_result(hma_fpu_xsave_result_t res) 1193 { 1194 switch (res) { 1195 case HFXR_OK: 1196 return (0); 1197 case HFXR_NO_SPACE: 1198 return (ENOSPC); 1199 case HFXR_BAD_ALIGN: 1200 case HFXR_UNSUP_FMT: 1201 case 
HFXR_UNSUP_FEAT: 1202 case HFXR_INVALID_DATA: 1203 return (EINVAL); 1204 default: 1205 panic("unexpected xsave result"); 1206 } 1207 } 1208 1209 int 1210 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1211 { 1212 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1213 return (EINVAL); 1214 1215 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1216 hma_fpu_xsave_result_t res; 1217 1218 res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); 1219 return (translate_hma_xsave_result(res)); 1220 } 1221 1222 int 1223 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) 1224 { 1225 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1226 return (EINVAL); 1227 1228 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1229 hma_fpu_xsave_result_t res; 1230 1231 res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); 1232 return (translate_hma_xsave_result(res)); 1233 } 1234 1235 int 1236 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) 1237 { 1238 struct vcpu *vcpu; 1239 1240 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1241 return (EINVAL); 1242 } 1243 1244 vcpu = &vm->vcpu[vcpuid]; 1245 1246 vcpu_lock(vcpu); 1247 *state = vcpu->run_state; 1248 *sipi_vec = vcpu->sipi_vector; 1249 vcpu_unlock(vcpu); 1250 1251 return (0); 1252 } 1253 1254 int 1255 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) 1256 { 1257 struct vcpu *vcpu; 1258 1259 if (vcpuid < 0 || vcpuid >= vm->maxcpus) { 1260 return (EINVAL); 1261 } 1262 if (!VRS_IS_VALID(state)) { 1263 return (EINVAL); 1264 } 1265 1266 vcpu = &vm->vcpu[vcpuid]; 1267 1268 vcpu_lock(vcpu); 1269 vcpu->run_state = state; 1270 vcpu->sipi_vector = sipi_vec; 1271 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1272 vcpu_unlock(vcpu); 1273 1274 return (0); 1275 } 1276 1277 void 1278 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) 1279 { 1280 vmspace_t *vms = vm_get_vmspace(vm); 1281 vmspace_track_dirty(vms, gpa, len, bitmap); 1282 } 1283 1284 static void 1285 restore_guest_fpustate(struct vcpu *vcpu) 1286 { 1287 /* Save host FPU and restore guest FPU */ 1288 fpu_stop_emulating(); 1289 hma_fpu_start_guest(vcpu->guestfpu); 1290 1291 /* restore guest XCR0 if XSAVE is enabled in the host */ 1292 if (rcr4() & CR4_XSAVE) 1293 load_xcr(0, vcpu->guest_xcr0); 1294 1295 /* 1296 * The FPU is now "dirty" with the guest's state so turn on emulation 1297 * to trap any access to the FPU by the host. 1298 */ 1299 fpu_start_emulating(); 1300 } 1301 1302 static void 1303 save_guest_fpustate(struct vcpu *vcpu) 1304 { 1305 1306 if ((rcr0() & CR0_TS) == 0) 1307 panic("fpu emulation not enabled in host!"); 1308 1309 /* save guest XCR0 and restore host XCR0 */ 1310 if (rcr4() & CR4_XSAVE) { 1311 vcpu->guest_xcr0 = rxcr(0); 1312 load_xcr(0, vmm_get_host_xcr0()); 1313 } 1314 1315 /* save guest FPU and restore host FPU */ 1316 fpu_stop_emulating(); 1317 hma_fpu_stop_guest(vcpu->guestfpu); 1318 /* 1319 * When the host state has been restored, we should not re-enable 1320 * CR0.TS on illumos for eager FPU. 1321 */ 1322 } 1323 1324 static int 1325 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1326 bool from_idle) 1327 { 1328 struct vcpu *vcpu; 1329 int error; 1330 1331 vcpu = &vm->vcpu[vcpuid]; 1332 vcpu_assert_locked(vcpu); 1333 1334 /* 1335 * State transitions from the vmmdev_ioctl() must always begin from 1336 * the VCPU_IDLE state. This guarantees that there is only a single 1337 * ioctl() operating on a vcpu at any point. 
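	 * A caller entering from that path therefore sets 'reqidle', notifies
	 * the vcpu, and waits on 'state_cv' until the vcpu has transitioned
	 * back to VCPU_IDLE.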
1338 */ 1339 if (from_idle) { 1340 while (vcpu->state != VCPU_IDLE) { 1341 vcpu->reqidle = 1; 1342 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 1343 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " 1344 "idle requested", vcpu_state2str(vcpu->state)); 1345 cv_wait(&vcpu->state_cv, &vcpu->lock); 1346 } 1347 } else { 1348 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1349 "vcpu idle state")); 1350 } 1351 1352 if (vcpu->state == VCPU_RUNNING) { 1353 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1354 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1355 } else { 1356 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1357 "vcpu that is not running", vcpu->hostcpu)); 1358 } 1359 1360 /* 1361 * The following state transitions are allowed: 1362 * IDLE -> FROZEN -> IDLE 1363 * FROZEN -> RUNNING -> FROZEN 1364 * FROZEN -> SLEEPING -> FROZEN 1365 */ 1366 switch (vcpu->state) { 1367 case VCPU_IDLE: 1368 case VCPU_RUNNING: 1369 case VCPU_SLEEPING: 1370 error = (newstate != VCPU_FROZEN); 1371 break; 1372 case VCPU_FROZEN: 1373 error = (newstate == VCPU_FROZEN); 1374 break; 1375 default: 1376 error = 1; 1377 break; 1378 } 1379 1380 if (error) 1381 return (EBUSY); 1382 1383 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", 1384 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1385 1386 vcpu->state = newstate; 1387 if (newstate == VCPU_RUNNING) 1388 vcpu->hostcpu = curcpu; 1389 else 1390 vcpu->hostcpu = NOCPU; 1391 1392 if (newstate == VCPU_IDLE) { 1393 cv_broadcast(&vcpu->state_cv); 1394 } 1395 1396 return (0); 1397 } 1398 1399 static void 1400 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1401 { 1402 int error; 1403 1404 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1405 panic("Error %d setting state to %d\n", error, newstate); 1406 } 1407 1408 static void 1409 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1410 { 1411 int error; 1412 1413 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1414 panic("Error %d setting state to %d", error, newstate); 1415 } 1416 1417 /* 1418 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1419 */ 1420 static int 1421 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) 1422 { 1423 struct vcpu *vcpu; 1424 int vcpu_halted, vm_halted; 1425 bool userspace_exit = false; 1426 1427 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1428 1429 vcpu = &vm->vcpu[vcpuid]; 1430 vcpu_halted = 0; 1431 vm_halted = 0; 1432 1433 vcpu_lock(vcpu); 1434 while (1) { 1435 /* 1436 * Do a final check for pending interrupts (including NMI and 1437 * INIT) before putting this thread to sleep. 1438 */ 1439 if (vm_nmi_pending(vm, vcpuid)) 1440 break; 1441 if (vcpu_run_state_pending(vm, vcpuid)) 1442 break; 1443 if (!intr_disabled) { 1444 if (vm_extint_pending(vm, vcpuid) || 1445 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1446 break; 1447 } 1448 } 1449 1450 /* 1451 * Also check for software events which would cause a wake-up. 1452 * This will set the appropriate exitcode directly, rather than 1453 * requiring a trip through VM_RUN(). 1454 */ 1455 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1456 userspace_exit = true; 1457 break; 1458 } 1459 1460 /* 1461 * Some Linux guests implement "halt" by having all vcpus 1462 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1463 * track of the vcpus that have entered this state. 
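		 * (Tracking only occurs while the halt_detection_enabled
		 * tunable is set.)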
When all 1464 * vcpus enter the halted state the virtual machine is halted. 1465 */ 1466 if (intr_disabled) { 1467 if (!vcpu_halted && halt_detection_enabled) { 1468 vcpu_halted = 1; 1469 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1470 } 1471 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1472 vm_halted = 1; 1473 break; 1474 } 1475 } 1476 1477 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1478 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1479 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1480 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1481 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1482 } 1483 1484 if (vcpu_halted) 1485 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1486 1487 vcpu_unlock(vcpu); 1488 1489 if (vm_halted) { 1490 (void) vm_suspend(vm, VM_SUSPEND_HALT); 1491 } 1492 1493 return (userspace_exit ? -1 : 0); 1494 } 1495 1496 static int 1497 vm_handle_paging(struct vm *vm, int vcpuid) 1498 { 1499 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1500 vm_client_t *vmc = vcpu->vmclient; 1501 struct vm_exit *vme = &vcpu->exitinfo; 1502 int rv, ftype; 1503 1504 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1505 __func__, vme->inst_length)); 1506 1507 ftype = vme->u.paging.fault_type; 1508 KASSERT(ftype == PROT_READ || 1509 ftype == PROT_WRITE || ftype == PROT_EXEC, 1510 ("vm_handle_paging: invalid fault_type %d", ftype)); 1511 1512 rv = vmc_fault(vmc, vme->u.paging.gpa, ftype); 1513 1514 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, " 1515 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1516 1517 if (rv != 0) 1518 return (EFAULT); 1519 return (0); 1520 } 1521 1522 int 1523 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, 1524 int rsize) 1525 { 1526 int err = ESRCH; 1527 1528 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1529 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1530 1531 err = vlapic_mmio_read(vlapic, gpa, rval, rsize); 1532 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1533 err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize); 1534 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1535 err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize); 1536 } 1537 1538 return (err); 1539 } 1540 1541 int 1542 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, 1543 int wsize) 1544 { 1545 int err = ESRCH; 1546 1547 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1548 struct vlapic *vlapic = vm_lapic(vm, cpuid); 1549 1550 err = vlapic_mmio_write(vlapic, gpa, wval, wsize); 1551 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1552 err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize); 1553 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1554 err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize); 1555 } 1556 1557 return (err); 1558 } 1559 1560 static int 1561 vm_handle_mmio_emul(struct vm *vm, int vcpuid) 1562 { 1563 struct vie *vie; 1564 struct vcpu *vcpu; 1565 struct vm_exit *vme; 1566 uint64_t inst_addr; 1567 int error, fault, cs_d; 1568 1569 vcpu = &vm->vcpu[vcpuid]; 1570 vme = &vcpu->exitinfo; 1571 vie = vcpu->vie_ctx; 1572 1573 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1574 __func__, vme->inst_length)); 1575 1576 inst_addr = vme->rip + vme->u.mmio_emul.cs_base; 1577 cs_d = vme->u.mmio_emul.cs_d; 1578 1579 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx", 1580 vme->u.mmio_emul.gpa); 1581 1582 /* Fetch the faulting instruction */ 1583 if 
(vie_needs_fetch(vie)) { 1584 error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, 1585 &fault); 1586 if (error != 0) { 1587 return (error); 1588 } else if (fault) { 1589 /* 1590 * If a fault during instruction fetch was encountered, 1591 * it will have asserted that the appropriate exception 1592 * be injected at next entry. 1593 * No further work is required. 1594 */ 1595 return (0); 1596 } 1597 } 1598 1599 if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { 1600 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx", 1601 inst_addr); 1602 /* Dump (unrecognized) instruction bytes in userspace */ 1603 vie_fallback_exitinfo(vie, vme); 1604 return (-1); 1605 } 1606 if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && 1607 vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { 1608 /* Decoded GLA does not match GLA from VM exit state */ 1609 vie_fallback_exitinfo(vie, vme); 1610 return (-1); 1611 } 1612 1613 repeat: 1614 error = vie_emulate_mmio(vie, vm, vcpuid); 1615 if (error < 0) { 1616 /* 1617 * MMIO not handled by any of the in-kernel-emulated devices, so 1618 * make a trip out to userspace for it. 1619 */ 1620 vie_exitinfo(vie, vme); 1621 } else if (error == EAGAIN) { 1622 /* 1623 * Continue emulating the rep-prefixed instruction, which has 1624 * not completed its iterations. 1625 * 1626 * In case this can be emulated in-kernel and has a high 1627 * repetition count (causing a tight spin), it should be 1628 * deferential to yield conditions. 1629 */ 1630 if (!vcpu_should_yield(vm, vcpuid)) { 1631 goto repeat; 1632 } else { 1633 /* 1634 * Defer to the contending load by making a trip to 1635 * userspace with a no-op (BOGUS) exit reason. 1636 */ 1637 vie_reset(vie); 1638 vme->exitcode = VM_EXITCODE_BOGUS; 1639 return (-1); 1640 } 1641 } else if (error == 0) { 1642 /* Update %rip now that instruction has been emulated */ 1643 vie_advance_pc(vie, &vcpu->nextrip); 1644 } 1645 return (error); 1646 } 1647 1648 static int 1649 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme) 1650 { 1651 struct vcpu *vcpu; 1652 struct vie *vie; 1653 int err; 1654 1655 vcpu = &vm->vcpu[vcpuid]; 1656 vie = vcpu->vie_ctx; 1657 1658 repeat: 1659 err = vie_emulate_inout(vie, vm, vcpuid); 1660 1661 if (err < 0) { 1662 /* 1663 * In/out not handled by any of the in-kernel-emulated devices, 1664 * so make a trip out to userspace for it. 1665 */ 1666 vie_exitinfo(vie, vme); 1667 return (err); 1668 } else if (err == EAGAIN) { 1669 /* 1670 * Continue emulating the rep-prefixed ins/outs, which has not 1671 * completed its iterations. 1672 * 1673 * In case this can be emulated in-kernel and has a high 1674 * repetition count (causing a tight spin), it should be 1675 * deferential to yield conditions. 1676 */ 1677 if (!vcpu_should_yield(vm, vcpuid)) { 1678 goto repeat; 1679 } else { 1680 /* 1681 * Defer to the contending load by making a trip to 1682 * userspace with a no-op (BOGUS) exit reason. 1683 */ 1684 vie_reset(vie); 1685 vme->exitcode = VM_EXITCODE_BOGUS; 1686 return (-1); 1687 } 1688 } else if (err != 0) { 1689 /* Emulation failure. Bail all the way out to userspace. 
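		 * (Unlike the success path just below, %rip is not advanced
		 * here.)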
		 */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry. No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully, so
		 * kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
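	 * (Each is notified via vcpu_notify_event() so it can re-check
	 * 'suspended_cpus' and leave its own wait loop.)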
1797 */ 1798 for (i = 0; i < vm->maxcpus; i++) { 1799 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1800 vcpu_notify_event(vm, i); 1801 } 1802 } 1803 1804 return (-1); 1805 } 1806 1807 static int 1808 vm_handle_reqidle(struct vm *vm, int vcpuid) 1809 { 1810 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1811 1812 vcpu_lock(vcpu); 1813 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1814 vcpu->reqidle = 0; 1815 vcpu_unlock(vcpu); 1816 return (-1); 1817 } 1818 1819 static int 1820 vm_handle_run_state(struct vm *vm, int vcpuid) 1821 { 1822 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1823 bool handled = false; 1824 1825 vcpu_lock(vcpu); 1826 while (1) { 1827 if ((vcpu->run_state & VRS_PEND_INIT) != 0) { 1828 vcpu_unlock(vcpu); 1829 VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); 1830 vcpu_lock(vcpu); 1831 1832 vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); 1833 vcpu->run_state |= VRS_INIT; 1834 } 1835 1836 if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == 1837 (VRS_INIT | VRS_PEND_SIPI)) { 1838 const uint8_t vector = vcpu->sipi_vector; 1839 1840 vcpu_unlock(vcpu); 1841 VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); 1842 vcpu_lock(vcpu); 1843 1844 vcpu->run_state &= ~VRS_PEND_SIPI; 1845 vcpu->run_state |= VRS_RUN; 1846 } 1847 1848 /* 1849 * If the vCPU is now in the running state, there is no need to 1850 * wait for anything prior to re-entry. 1851 */ 1852 if ((vcpu->run_state & VRS_RUN) != 0) { 1853 handled = true; 1854 break; 1855 } 1856 1857 /* 1858 * Also check for software events which would cause a wake-up. 1859 * This will set the appropriate exitcode directly, rather than 1860 * requiring a trip through VM_RUN(). 1861 */ 1862 if (vcpu_sleep_bailout_checks(vm, vcpuid)) { 1863 break; 1864 } 1865 1866 vcpu_ustate_change(vm, vcpuid, VU_IDLE); 1867 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1868 (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock); 1869 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1870 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 1871 } 1872 vcpu_unlock(vcpu); 1873 1874 return (handled ? 0 : -1); 1875 } 1876 1877 static int 1878 vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val) 1879 { 1880 switch (num) { 1881 case MSR_MTRRcap: 1882 *val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX; 1883 break; 1884 case MSR_MTRRdefType: 1885 *val = mtrr->def_type; 1886 break; 1887 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1888 *val = mtrr->fixed4k[num - MSR_MTRR4kBase]; 1889 break; 1890 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1891 *val = mtrr->fixed16k[num - MSR_MTRR16kBase]; 1892 break; 1893 case MSR_MTRR64kBase: 1894 *val = mtrr->fixed64k; 1895 break; 1896 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1897 uint_t offset = num - MSR_MTRRVarBase; 1898 if (offset % 2 == 0) { 1899 *val = mtrr->var[offset / 2].base; 1900 } else { 1901 *val = mtrr->var[offset / 2].mask; 1902 } 1903 break; 1904 } 1905 default: 1906 return (-1); 1907 } 1908 1909 return (0); 1910 } 1911 1912 static int 1913 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val) 1914 { 1915 switch (num) { 1916 case MSR_MTRRcap: 1917 /* MTRRCAP is read only */ 1918 return (-1); 1919 case MSR_MTRRdefType: 1920 if (val & ~VMM_MTRR_DEF_MASK) { 1921 /* generate #GP on writes to reserved fields */ 1922 return (-1); 1923 } 1924 mtrr->def_type = val; 1925 break; 1926 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1927 mtrr->fixed4k[num - MSR_MTRR4kBase] = val; 1928 break; 1929 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 1930 mtrr->fixed16k[num - MSR_MTRR16kBase] = val; 1931 break; 1932 case MSR_MTRR64kBase: 1933 mtrr->fixed64k = val; 1934 break; 1935 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: { 1936 uint_t offset = num - MSR_MTRRVarBase; 1937 if (offset % 2 == 0) { 1938 if (val & ~VMM_MTRR_PHYSBASE_MASK) { 1939 /* generate #GP on writes to reserved fields */ 1940 return (-1); 1941 } 1942 mtrr->var[offset / 2].base = val; 1943 } else { 1944 if (val & ~VMM_MTRR_PHYSMASK_MASK) { 1945 /* generate #GP on writes to reserved fields */ 1946 return (-1); 1947 } 1948 mtrr->var[offset / 2].mask = val; 1949 } 1950 break; 1951 } 1952 default: 1953 return (-1); 1954 } 1955 1956 return (0); 1957 } 1958 1959 static int 1960 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 1961 { 1962 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1963 const uint32_t code = vme->u.msr.code; 1964 uint64_t val = 0; 1965 1966 switch (code) { 1967 case MSR_MCG_CAP: 1968 case MSR_MCG_STATUS: 1969 val = 0; 1970 break; 1971 1972 case MSR_MTRRcap: 1973 case MSR_MTRRdefType: 1974 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 1975 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 1976 case MSR_MTRR64kBase: 1977 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 1978 if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0) 1979 vm_inject_gp(vm, vcpuid); 1980 break; 1981 1982 case MSR_TSC: 1983 /* 1984 * In all likelihood, this should always be handled in guest 1985 * context by VMX/SVM rather than taking an exit. (Both VMX and 1986 * SVM pass through read-only access to MSR_TSC to the guest.) 1987 * 1988 * No physical offset is requested of vcpu_tsc_offset() since 1989 * rdtsc_offset() takes care of that instead. 1990 */ 1991 val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset(); 1992 break; 1993 1994 default: 1995 /* 1996 * Anything not handled at this point will be kicked out to 1997 * userspace for attempted processing there. 1998 */ 1999 return (-1); 2000 } 2001 2002 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, 2003 val & 0xffffffff)); 2004 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 2005 val >> 32)); 2006 return (0); 2007 } 2008 2009 static int 2010 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) 2011 { 2012 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2013 const uint32_t code = vme->u.msr.code; 2014 const uint64_t val = vme->u.msr.wval; 2015 2016 switch (code) { 2017 case MSR_MCG_CAP: 2018 case MSR_MCG_STATUS: 2019 /* Ignore writes */ 2020 break; 2021 2022 case MSR_MTRRcap: 2023 case MSR_MTRRdefType: 2024 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 2025 case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: 2026 case MSR_MTRR64kBase: 2027 case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: 2028 if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0) 2029 vm_inject_gp(vm, vcpuid); 2030 break; 2031 2032 case MSR_TSC: 2033 /* 2034 * The effect of writing the TSC MSR is that a subsequent read 2035 * of the TSC would report that value written (plus any time 2036 * elapsed between the write and the read). The guest TSC value 2037 * is calculated from a global offset for the guest (which 2038 * effectively makes its TSC read 0 at guest boot) and a 2039 * per-vCPU offset to handle these writes to the MSR. 
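		 * As a purely illustrative example: with a boot offset of
		 * -1000 and a host TSC currently reading 5000, a guest which
		 * wants to observe a TSC value of 7000 needs a per-vCPU
		 * offset of 3000 (5000 - 1000 + 3000 = 7000).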
2040 * 2041 * To calculate that per-vCPU offset, we can work backwards from 2042 * the guest value at the time of write: 2043 * 2044 * value = host TSC + VM boot offset + vCPU offset 2045 * 2046 * so therefore: 2047 * 2048 * value - host TSC - VM boot offset = vCPU offset 2049 */ 2050 vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset(); 2051 break; 2052 2053 default: 2054 /* 2055 * Anything not handled at this point will be kicked out to 2056 * userspace for attempted processing there. 2057 */ 2058 return (-1); 2059 } 2060 2061 return (0); 2062 } 2063 2064 int 2065 vm_suspend(struct vm *vm, enum vm_suspend_how how) 2066 { 2067 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 2068 return (EINVAL); 2069 2070 if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { 2071 return (EALREADY); 2072 } 2073 2074 /* 2075 * Notify all active vcpus that they are now suspended. 2076 */ 2077 for (uint_t i = 0; i < vm->maxcpus; i++) { 2078 struct vcpu *vcpu = &vm->vcpu[i]; 2079 2080 vcpu_lock(vcpu); 2081 if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) { 2082 /* 2083 * Any vCPUs not actively running or in HLT can be 2084 * marked as suspended immediately. 2085 */ 2086 if (CPU_ISSET(i, &vm->active_cpus)) { 2087 CPU_SET_ATOMIC(i, &vm->suspended_cpus); 2088 } 2089 } else { 2090 /* 2091 * Those which are running or in HLT will pick up the 2092 * suspended state after notification. 2093 */ 2094 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2095 } 2096 vcpu_unlock(vcpu); 2097 } 2098 return (0); 2099 } 2100 2101 void 2102 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) 2103 { 2104 struct vm_exit *vmexit; 2105 2106 vmexit = vm_exitinfo(vm, vcpuid); 2107 vmexit->rip = rip; 2108 vmexit->inst_length = 0; 2109 vmexit->exitcode = VM_EXITCODE_RUN_STATE; 2110 vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); 2111 } 2112 2113 /* 2114 * Some vmm resources, such as the lapic, may have CPU-specific resources 2115 * allocated to them which would benefit from migration onto the host CPU which 2116 * is processing the vcpu state. 2117 */ 2118 static void 2119 vm_localize_resources(struct vm *vm, struct vcpu *vcpu) 2120 { 2121 /* 2122 * Localizing cyclic resources requires acquisition of cpu_lock, and 2123 * doing so with kpreempt disabled is a recipe for deadlock disaster. 2124 */ 2125 VERIFY(curthread->t_preempt == 0); 2126 2127 /* 2128 * Do not bother with localization if this vCPU is about to return to 2129 * the host CPU it was last localized to. 2130 */ 2131 if (vcpu->lastloccpu == curcpu) 2132 return; 2133 2134 /* 2135 * Localize system-wide resources to the primary boot vCPU. While any 2136 * of the other vCPUs may access them, it keeps the potential interrupt 2137 * footprint constrained to CPUs involved with this instance. 2138 */ 2139 if (vcpu == &vm->vcpu[0]) { 2140 vhpet_localize_resources(vm->vhpet); 2141 vrtc_localize_resources(vm->vrtc); 2142 vatpit_localize_resources(vm->vatpit); 2143 } 2144 2145 vlapic_localize_resources(vcpu->vlapic); 2146 2147 vcpu->lastloccpu = curcpu; 2148 } 2149 2150 static void 2151 vmm_savectx(void *arg) 2152 { 2153 vm_thread_ctx_t *vtc = arg; 2154 struct vm *vm = vtc->vtc_vm; 2155 const int vcpuid = vtc->vtc_vcpuid; 2156 2157 if (ops->vmsavectx != NULL) { 2158 ops->vmsavectx(vm->cookie, vcpuid); 2159 } 2160 2161 /* 2162 * Account for going off-cpu, unless the vCPU is idled, where being 2163 * off-cpu is the explicit point. 
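	 * (vmm_restorectx() puts the saved ustate back in place when the
	 * thread next comes on-cpu.)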
2164 */ 2165 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2166 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2167 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2168 } 2169 2170 /* 2171 * If the CPU holds the restored guest FPU state, save it and restore 2172 * the host FPU state before this thread goes off-cpu. 2173 */ 2174 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2175 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2176 2177 save_guest_fpustate(vcpu); 2178 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2179 } 2180 } 2181 2182 static void 2183 vmm_restorectx(void *arg) 2184 { 2185 vm_thread_ctx_t *vtc = arg; 2186 struct vm *vm = vtc->vtc_vm; 2187 const int vcpuid = vtc->vtc_vcpuid; 2188 2189 /* Complete microstate accounting for vCPU being off-cpu */ 2190 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2191 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2192 } 2193 2194 /* 2195 * When coming back on-cpu, only restore the guest FPU status if the 2196 * thread is in a context marked as requiring it. This should be rare, 2197 * occurring only when a future logic error results in a voluntary 2198 * sleep during the VMRUN critical section. 2199 * 2200 * The common case will result in elision of the guest FPU state 2201 * restoration, deferring that action until it is clearly necessary 2202 * during vm_run. 2203 */ 2204 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2205 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2206 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2207 2208 restore_guest_fpustate(vcpu); 2209 vtc->vtc_status |= VTCS_FPU_RESTORED; 2210 } 2211 2212 if (ops->vmrestorectx != NULL) { 2213 ops->vmrestorectx(vm->cookie, vcpuid); 2214 } 2215 2216 } 2217 2218 static int 2219 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2220 struct vm_exit *vme) 2221 { 2222 struct vcpu *vcpu; 2223 struct vie *vie; 2224 int err; 2225 2226 vcpu = &vm->vcpu[vcpuid]; 2227 vie = vcpu->vie_ctx; 2228 err = 0; 2229 2230 switch (entry->cmd) { 2231 case VEC_DEFAULT: 2232 return (0); 2233 case VEC_DISCARD_INSTR: 2234 vie_reset(vie); 2235 return (0); 2236 case VEC_FULFILL_MMIO: 2237 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2238 if (err == 0) { 2239 err = vie_emulate_mmio(vie, vm, vcpuid); 2240 if (err == 0) { 2241 vie_advance_pc(vie, &vcpu->nextrip); 2242 } else if (err < 0) { 2243 vie_exitinfo(vie, vme); 2244 } else if (err == EAGAIN) { 2245 /* 2246 * Clear the instruction emulation state in 2247 * order to re-enter VM context and continue 2248 * this 'rep <instruction>' 2249 */ 2250 vie_reset(vie); 2251 err = 0; 2252 } 2253 } 2254 break; 2255 case VEC_FULFILL_INOUT: 2256 err = vie_fulfill_inout(vie, &entry->u.inout); 2257 if (err == 0) { 2258 err = vie_emulate_inout(vie, vm, vcpuid); 2259 if (err == 0) { 2260 vie_advance_pc(vie, &vcpu->nextrip); 2261 } else if (err < 0) { 2262 vie_exitinfo(vie, vme); 2263 } else if (err == EAGAIN) { 2264 /* 2265 * Clear the instruction emulation state in 2266 * order to re-enter VM context and continue 2267 * this 'rep ins/outs' 2268 */ 2269 vie_reset(vie); 2270 err = 0; 2271 } 2272 } 2273 break; 2274 default: 2275 return (EINVAL); 2276 } 2277 return (err); 2278 } 2279 2280 static int 2281 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2282 { 2283 struct vie *vie; 2284 2285 vie = vm->vcpu[vcpuid].vie_ctx; 2286 2287 if (vie_pending(vie)) { 2288 /* 2289 * Userspace has not fulfilled the pending needs of the 2290 * instruction emulation, so bail back out. 
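		 *
		 * vie_exitinfo() repopulates the exit information so that
		 * userspace sees the original MMIO/in-out request again and
		 * can satisfy it via VEC_FULFILL_MMIO or VEC_FULFILL_INOUT on
		 * a subsequent entry.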
2291 */ 2292 vie_exitinfo(vie, vme); 2293 return (-1); 2294 } 2295 2296 return (0); 2297 } 2298 2299 int 2300 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2301 { 2302 int error; 2303 struct vcpu *vcpu; 2304 struct vm_exit *vme; 2305 bool intr_disabled; 2306 int affinity_type = CPU_CURRENT; 2307 2308 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2309 return (EINVAL); 2310 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2311 return (EINVAL); 2312 2313 vcpu = &vm->vcpu[vcpuid]; 2314 vme = &vcpu->exitinfo; 2315 2316 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2317 2318 vcpu->vtc.vtc_status = 0; 2319 ctxop_attach(curthread, vcpu->ctxop); 2320 2321 error = vm_entry_actions(vm, vcpuid, entry, vme); 2322 if (error != 0) { 2323 goto exit; 2324 } 2325 2326 restart: 2327 error = vm_loop_checks(vm, vcpuid, vme); 2328 if (error != 0) { 2329 goto exit; 2330 } 2331 2332 thread_affinity_set(curthread, affinity_type); 2333 /* 2334 * Resource localization should happen after the CPU affinity for the 2335 * thread has been set to ensure that access from restricted contexts, 2336 * such as VMX-accelerated APIC operations, can occur without inducing 2337 * cyclic cross-calls. 2338 * 2339 * This must be done prior to disabling kpreempt via critical_enter(). 2340 */ 2341 vm_localize_resources(vm, vcpu); 2342 affinity_type = CPU_CURRENT; 2343 critical_enter(); 2344 2345 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2346 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2347 2348 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2349 restore_guest_fpustate(vcpu); 2350 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2351 } 2352 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2353 2354 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2355 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2356 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2357 2358 /* 2359 * Once clear of the delicate contexts comprising the VM_RUN handler, 2360 * thread CPU affinity can be loosened while other processing occurs. 
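	 *
	 * Clearing VTCS_FPU_CTX_CRITICAL below also means that going off-cpu
	 * past this point will not force an eager guest-FPU reload in
	 * vmm_restorectx(); that restore is deferred until the next pass
	 * through the critical section above.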
2361 */ 2362 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2363 thread_affinity_clear(curthread); 2364 critical_exit(); 2365 2366 if (error != 0) { 2367 /* Communicate out any error from VMRUN() above */ 2368 goto exit; 2369 } 2370 2371 vcpu->nextrip = vme->rip + vme->inst_length; 2372 switch (vme->exitcode) { 2373 case VM_EXITCODE_REQIDLE: 2374 error = vm_handle_reqidle(vm, vcpuid); 2375 break; 2376 case VM_EXITCODE_RUN_STATE: 2377 error = vm_handle_run_state(vm, vcpuid); 2378 break; 2379 case VM_EXITCODE_SUSPENDED: 2380 error = vm_handle_suspend(vm, vcpuid); 2381 break; 2382 case VM_EXITCODE_IOAPIC_EOI: 2383 vioapic_process_eoi(vm, vcpuid, 2384 vme->u.ioapic_eoi.vector); 2385 break; 2386 case VM_EXITCODE_HLT: 2387 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2388 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2389 break; 2390 case VM_EXITCODE_PAGING: 2391 error = vm_handle_paging(vm, vcpuid); 2392 break; 2393 case VM_EXITCODE_MMIO_EMUL: 2394 error = vm_handle_mmio_emul(vm, vcpuid); 2395 break; 2396 case VM_EXITCODE_INOUT: 2397 error = vm_handle_inout(vm, vcpuid, vme); 2398 break; 2399 case VM_EXITCODE_INST_EMUL: 2400 error = vm_handle_inst_emul(vm, vcpuid); 2401 break; 2402 case VM_EXITCODE_MONITOR: 2403 case VM_EXITCODE_MWAIT: 2404 case VM_EXITCODE_VMINSN: 2405 vm_inject_ud(vm, vcpuid); 2406 break; 2407 case VM_EXITCODE_RDMSR: 2408 error = vm_handle_rdmsr(vm, vcpuid, vme); 2409 break; 2410 case VM_EXITCODE_WRMSR: 2411 error = vm_handle_wrmsr(vm, vcpuid, vme); 2412 break; 2413 case VM_EXITCODE_HT: 2414 affinity_type = CPU_BEST; 2415 break; 2416 case VM_EXITCODE_MTRAP: 2417 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2418 error = -1; 2419 break; 2420 default: 2421 /* handled in userland */ 2422 error = -1; 2423 break; 2424 } 2425 2426 if (error == 0) { 2427 /* VM exit conditions handled in-kernel, continue running */ 2428 goto restart; 2429 } 2430 2431 exit: 2432 kpreempt_disable(); 2433 ctxop_detach(curthread, vcpu->ctxop); 2434 /* Make sure all of the needed vCPU context state is saved */ 2435 vmm_savectx(&vcpu->vtc); 2436 kpreempt_enable(); 2437 2438 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); 2439 2440 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2441 return (error); 2442 } 2443 2444 int 2445 vm_restart_instruction(void *arg, int vcpuid) 2446 { 2447 struct vm *vm; 2448 struct vcpu *vcpu; 2449 enum vcpu_state state; 2450 uint64_t rip; 2451 int error; 2452 2453 vm = arg; 2454 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2455 return (EINVAL); 2456 2457 vcpu = &vm->vcpu[vcpuid]; 2458 state = vcpu_get_state(vm, vcpuid, NULL); 2459 if (state == VCPU_RUNNING) { 2460 /* 2461 * When a vcpu is "running" the next instruction is determined 2462 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 2463 * Thus setting 'inst_length' to zero will cause the current 2464 * instruction to be restarted. 2465 */ 2466 vcpu->exitinfo.inst_length = 0; 2467 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by " 2468 "setting inst_length to zero", vcpu->exitinfo.rip); 2469 } else if (state == VCPU_FROZEN) { 2470 /* 2471 * When a vcpu is "frozen" it is outside the critical section 2472 * around VMRUN() and 'nextrip' points to the next instruction. 2473 * Thus instruction restart is achieved by setting 'nextrip' 2474 * to the vcpu's %rip. 
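		 *
		 * For example, to retry an instruction at %rip 0x1000 with
		 * inst_length 3 (hypothetical values): a RUNNING vcpu simply
		 * reports inst_length = 0, so nextrip later computes to
		 * 0x1000, while a FROZEN vcpu has nextrip rewound here from
		 * 0x1003 back to the current %rip of 0x1000.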
2475 */ 2476 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2477 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2478 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " 2479 "nextrip from %lx to %lx", vcpu->nextrip, rip); 2480 vcpu->nextrip = rip; 2481 } else { 2482 panic("%s: invalid state %d", __func__, state); 2483 } 2484 return (0); 2485 } 2486 2487 int 2488 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2489 { 2490 struct vcpu *vcpu; 2491 2492 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2493 return (EINVAL); 2494 2495 vcpu = &vm->vcpu[vcpuid]; 2496 2497 if (VM_INTINFO_PENDING(info)) { 2498 const uint32_t type = VM_INTINFO_TYPE(info); 2499 const uint8_t vector = VM_INTINFO_VECTOR(info); 2500 2501 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2502 return (EINVAL); 2503 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2504 return (EINVAL); 2505 if (info & VM_INTINFO_MASK_RSVD) 2506 return (EINVAL); 2507 } else { 2508 info = 0; 2509 } 2510 vcpu->exit_intinfo = info; 2511 return (0); 2512 } 2513 2514 enum exc_class { 2515 EXC_BENIGN, 2516 EXC_CONTRIBUTORY, 2517 EXC_PAGEFAULT 2518 }; 2519 2520 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2521 2522 static enum exc_class 2523 exception_class(uint64_t info) 2524 { 2525 ASSERT(VM_INTINFO_PENDING(info)); 2526 2527 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2528 switch (VM_INTINFO_TYPE(info)) { 2529 case VM_INTINFO_HWINTR: 2530 case VM_INTINFO_SWINTR: 2531 case VM_INTINFO_NMI: 2532 return (EXC_BENIGN); 2533 default: 2534 /* 2535 * Hardware exception. 2536 * 2537 * SVM and VT-x use identical type values to represent NMI, 2538 * hardware interrupt and software interrupt. 2539 * 2540 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2541 * for exceptions except #BP and #OF. #BP and #OF use a type 2542 * value of '5' or '6'. Therefore we don't check for explicit 2543 * values of 'type' to classify 'intinfo' into a hardware 2544 * exception. 2545 */ 2546 break; 2547 } 2548 2549 switch (VM_INTINFO_VECTOR(info)) { 2550 case IDT_PF: 2551 case IDT_VE: 2552 return (EXC_PAGEFAULT); 2553 case IDT_DE: 2554 case IDT_TS: 2555 case IDT_NP: 2556 case IDT_SS: 2557 case IDT_GP: 2558 return (EXC_CONTRIBUTORY); 2559 default: 2560 return (EXC_BENIGN); 2561 } 2562 } 2563 2564 /* 2565 * Fetch event pending injection into the guest, if one exists. 2566 * 2567 * Returns true if an event is to be injected (which is placed in `retinfo`). 2568 */ 2569 bool 2570 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2571 { 2572 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2573 const uint64_t info1 = vcpu->exit_intinfo; 2574 vcpu->exit_intinfo = 0; 2575 const uint64_t info2 = vcpu->exc_pending; 2576 vcpu->exc_pending = 0; 2577 2578 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2579 /* 2580 * If an exception occurs while attempting to call the 2581 * double-fault handler the processor enters shutdown mode 2582 * (aka triple fault). 
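		 *
		 * For example: info1 already records a pending #DF and the
		 * new event in info2 is a #GP raised while delivering it.
		 * Rather than injecting anything further, the VM is suspended
		 * with VM_SUSPEND_TRIPLEFAULT.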
2583 */ 2584 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2585 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2586 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2587 *retinfo = 0; 2588 return (false); 2589 } 2590 /* 2591 * "Conditions for Generating a Double Fault" 2592 * Intel SDM, Vol3, Table 6-5 2593 */ 2594 const enum exc_class exc1 = exception_class(info1); 2595 const enum exc_class exc2 = exception_class(info2); 2596 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2597 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2598 /* Convert nested fault into a double fault. */ 2599 *retinfo = 2600 VM_INTINFO_VALID | 2601 VM_INTINFO_DEL_ERRCODE | 2602 VM_INTINFO_HWEXCP | 2603 IDT_DF; 2604 } else { 2605 /* Handle exceptions serially */ 2606 vcpu->exit_intinfo = info1; 2607 *retinfo = info2; 2608 } 2609 return (true); 2610 } else if (VM_INTINFO_PENDING(info1)) { 2611 *retinfo = info1; 2612 return (true); 2613 } else if (VM_INTINFO_PENDING(info2)) { 2614 *retinfo = info2; 2615 return (true); 2616 } 2617 2618 return (false); 2619 } 2620 2621 int 2622 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2623 { 2624 struct vcpu *vcpu; 2625 2626 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2627 return (EINVAL); 2628 2629 vcpu = &vm->vcpu[vcpuid]; 2630 *info1 = vcpu->exit_intinfo; 2631 *info2 = vcpu->exc_pending; 2632 return (0); 2633 } 2634 2635 int 2636 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2637 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2638 { 2639 struct vcpu *vcpu; 2640 uint64_t regval; 2641 int error; 2642 2643 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2644 return (EINVAL); 2645 2646 if (vector >= 32) 2647 return (EINVAL); 2648 2649 /* 2650 * NMIs are to be injected via their own specialized path using 2651 * vm_inject_nmi(). 2652 */ 2653 if (vector == IDT_NMI) { 2654 return (EINVAL); 2655 } 2656 2657 /* 2658 * A double fault exception should never be injected directly into 2659 * the guest. It is a derived exception that results from specific 2660 * combinations of nested faults. 2661 */ 2662 if (vector == IDT_DF) { 2663 return (EINVAL); 2664 } 2665 2666 vcpu = &vm->vcpu[vcpuid]; 2667 2668 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2669 /* Unable to inject exception due to one already pending */ 2670 return (EBUSY); 2671 } 2672 2673 if (errcode_valid) { 2674 /* 2675 * Exceptions don't deliver an error code in real mode. 2676 */ 2677 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); 2678 VERIFY0(error); 2679 if ((regval & CR0_PE) == 0) { 2680 errcode_valid = false; 2681 } 2682 } 2683 2684 /* 2685 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2686 * 2687 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2688 * one instruction or incurs an exception. 
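	 *
	 * Clearing VM_REG_GUEST_INTR_SHADOW below models that behavior, so
	 * the injected exception is not held off by a stale STI/MOV-SS
	 * interrupt shadow left over from before the exit.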
2689 */ 2690 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2691 VERIFY0(error); 2692 2693 if (restart_instruction) { 2694 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2695 } 2696 2697 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2698 if (errcode_valid) { 2699 val |= VM_INTINFO_DEL_ERRCODE; 2700 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2701 } 2702 vcpu->exc_pending = val; 2703 return (0); 2704 } 2705 2706 void 2707 vm_inject_ud(struct vm *vm, int vcpuid) 2708 { 2709 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2710 } 2711 2712 void 2713 vm_inject_gp(struct vm *vm, int vcpuid) 2714 { 2715 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2716 } 2717 2718 void 2719 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2720 { 2721 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2722 } 2723 2724 void 2725 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2726 { 2727 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2728 } 2729 2730 void 2731 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2732 { 2733 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2734 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2735 } 2736 2737 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2738 2739 int 2740 vm_inject_nmi(struct vm *vm, int vcpuid) 2741 { 2742 struct vcpu *vcpu; 2743 2744 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2745 return (EINVAL); 2746 2747 vcpu = &vm->vcpu[vcpuid]; 2748 2749 vcpu->nmi_pending = true; 2750 vcpu_notify_event(vm, vcpuid); 2751 return (0); 2752 } 2753 2754 bool 2755 vm_nmi_pending(struct vm *vm, int vcpuid) 2756 { 2757 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2758 2759 return (vcpu->nmi_pending); 2760 } 2761 2762 void 2763 vm_nmi_clear(struct vm *vm, int vcpuid) 2764 { 2765 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2766 2767 ASSERT(vcpu->nmi_pending); 2768 2769 vcpu->nmi_pending = false; 2770 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2771 } 2772 2773 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2774 2775 int 2776 vm_inject_extint(struct vm *vm, int vcpuid) 2777 { 2778 struct vcpu *vcpu; 2779 2780 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2781 return (EINVAL); 2782 2783 vcpu = &vm->vcpu[vcpuid]; 2784 2785 vcpu->extint_pending = true; 2786 vcpu_notify_event(vm, vcpuid); 2787 return (0); 2788 } 2789 2790 bool 2791 vm_extint_pending(struct vm *vm, int vcpuid) 2792 { 2793 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2794 2795 return (vcpu->extint_pending); 2796 } 2797 2798 void 2799 vm_extint_clear(struct vm *vm, int vcpuid) 2800 { 2801 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2802 2803 ASSERT(vcpu->extint_pending); 2804 2805 vcpu->extint_pending = false; 2806 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2807 } 2808 2809 int 2810 vm_inject_init(struct vm *vm, int vcpuid) 2811 { 2812 struct vcpu *vcpu; 2813 2814 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2815 return (EINVAL); 2816 2817 vcpu = &vm->vcpu[vcpuid]; 2818 vcpu_lock(vcpu); 2819 vcpu->run_state |= VRS_PEND_INIT; 2820 /* 2821 * As part of queuing the INIT request, clear any pending SIPI. It 2822 * would not otherwise survive across the reset of the vCPU when it 2823 * undergoes the requested INIT. We would not want it to linger when it 2824 * could be mistaken as a subsequent (after the INIT) SIPI request. 
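	 *
	 * Under the conventional INIT-SIPI-SIPI start-up sequence, the BSP
	 * issues fresh SIPIs after the INIT anyway, so discarding a stale
	 * one here loses nothing.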
2825 */ 2826 vcpu->run_state &= ~VRS_PEND_SIPI; 2827 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2828 2829 vcpu_unlock(vcpu); 2830 return (0); 2831 } 2832 2833 int 2834 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2835 { 2836 struct vcpu *vcpu; 2837 2838 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2839 return (EINVAL); 2840 2841 vcpu = &vm->vcpu[vcpuid]; 2842 vcpu_lock(vcpu); 2843 vcpu->run_state |= VRS_PEND_SIPI; 2844 vcpu->sipi_vector = vector; 2845 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2846 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2847 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2848 } 2849 vcpu_unlock(vcpu); 2850 return (0); 2851 } 2852 2853 bool 2854 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2855 { 2856 struct vcpu *vcpu; 2857 2858 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2859 vcpu = &vm->vcpu[vcpuid]; 2860 2861 /* Of interest: vCPU not in running state or with pending INIT */ 2862 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2863 } 2864 2865 int 2866 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2867 { 2868 struct seg_desc desc; 2869 const enum vm_reg_name clear_regs[] = { 2870 VM_REG_GUEST_CR2, 2871 VM_REG_GUEST_CR3, 2872 VM_REG_GUEST_CR4, 2873 VM_REG_GUEST_RAX, 2874 VM_REG_GUEST_RBX, 2875 VM_REG_GUEST_RCX, 2876 VM_REG_GUEST_RSI, 2877 VM_REG_GUEST_RDI, 2878 VM_REG_GUEST_RBP, 2879 VM_REG_GUEST_RSP, 2880 VM_REG_GUEST_R8, 2881 VM_REG_GUEST_R9, 2882 VM_REG_GUEST_R10, 2883 VM_REG_GUEST_R11, 2884 VM_REG_GUEST_R12, 2885 VM_REG_GUEST_R13, 2886 VM_REG_GUEST_R14, 2887 VM_REG_GUEST_R15, 2888 VM_REG_GUEST_DR0, 2889 VM_REG_GUEST_DR1, 2890 VM_REG_GUEST_DR2, 2891 VM_REG_GUEST_DR3, 2892 VM_REG_GUEST_EFER, 2893 }; 2894 const enum vm_reg_name data_segs[] = { 2895 VM_REG_GUEST_SS, 2896 VM_REG_GUEST_DS, 2897 VM_REG_GUEST_ES, 2898 VM_REG_GUEST_FS, 2899 VM_REG_GUEST_GS, 2900 }; 2901 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2902 2903 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2904 return (EINVAL); 2905 2906 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2907 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2908 } 2909 2910 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2911 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2912 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2913 2914 /* 2915 * The prescribed contents of %rdx differ slightly between the Intel and 2916 * AMD architectural definitions. The former expects the Extended Model 2917 * in bits 16-19 where the latter expects all the Family, Model, and 2918 * Stepping be there. Common boot ROMs appear to disregard this 2919 * anyways, so we stick with a compromise value similar to what is 2920 * spelled out in the Intel SDM. 
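	 *
	 * Under the usual processor-signature layout (family in bits 11:8,
	 * model in bits 7:4, stepping in bits 3:0), the 0x600 written below
	 * decodes as family 6, model 0, stepping 0.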
2921 */ 2922 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2923 2924 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2925 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2926 2927 /* CS: Present, R/W, Accessed */ 2928 desc.access = 0x0093; 2929 desc.base = 0xffff0000; 2930 desc.limit = 0xffff; 2931 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2932 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2933 2934 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2935 desc.access = 0x0093; 2936 desc.base = 0; 2937 desc.limit = 0xffff; 2938 for (uint_t i = 0; i < nitems(data_segs); i++) { 2939 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2940 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2941 } 2942 2943 /* GDTR, IDTR */ 2944 desc.base = 0; 2945 desc.limit = 0xffff; 2946 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2947 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2948 2949 /* LDTR: Present, LDT */ 2950 desc.access = 0x0082; 2951 desc.base = 0; 2952 desc.limit = 0xffff; 2953 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2954 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2955 2956 /* TR: Present, 32-bit TSS */ 2957 desc.access = 0x008b; 2958 desc.base = 0; 2959 desc.limit = 0xffff; 2960 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2961 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2962 2963 vlapic_reset(vm_lapic(vm, vcpuid)); 2964 2965 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2966 2967 vcpu->exit_intinfo = 0; 2968 vcpu->exc_pending = 0; 2969 vcpu->nmi_pending = false; 2970 vcpu->extint_pending = false; 2971 2972 /* 2973 * A CPU reset caused by power-on or system reset clears more state than 2974 * one which is triggered from an INIT IPI.
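	 *
	 * Concretely, guest %xcr0, the guest FPU contents, and the emulated
	 * MTRRs below are reinitialized only on a full reset
	 * (init_only == false) and are left intact across an INIT.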
2975 */ 2976 if (!init_only) { 2977 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2978 (void) hma_fpu_init(vcpu->guestfpu); 2979 2980 /* XXX: clear MSRs and other pieces */ 2981 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 2982 } 2983 2984 return (0); 2985 } 2986 2987 static int 2988 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2989 { 2990 struct seg_desc desc; 2991 2992 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2993 return (EINVAL); 2994 2995 /* CS: Present, R/W, Accessed */ 2996 desc.access = 0x0093; 2997 desc.base = (uint64_t)vector << 12; 2998 desc.limit = 0xffff; 2999 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3000 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3001 (uint64_t)vector << 8)); 3002 3003 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3004 3005 return (0); 3006 } 3007 3008 int 3009 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3010 { 3011 if (vcpu < 0 || vcpu >= vm->maxcpus) 3012 return (EINVAL); 3013 3014 if (type < 0 || type >= VM_CAP_MAX) 3015 return (EINVAL); 3016 3017 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3018 } 3019 3020 int 3021 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3022 { 3023 if (vcpu < 0 || vcpu >= vm->maxcpus) 3024 return (EINVAL); 3025 3026 if (type < 0 || type >= VM_CAP_MAX) 3027 return (EINVAL); 3028 3029 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3030 } 3031 3032 struct vlapic * 3033 vm_lapic(struct vm *vm, int cpu) 3034 { 3035 return (vm->vcpu[cpu].vlapic); 3036 } 3037 3038 struct vioapic * 3039 vm_ioapic(struct vm *vm) 3040 { 3041 3042 return (vm->vioapic); 3043 } 3044 3045 struct vhpet * 3046 vm_hpet(struct vm *vm) 3047 { 3048 3049 return (vm->vhpet); 3050 } 3051 3052 void * 3053 vm_iommu_domain(struct vm *vm) 3054 { 3055 3056 return (vm->iommu); 3057 } 3058 3059 int 3060 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3061 bool from_idle) 3062 { 3063 int error; 3064 struct vcpu *vcpu; 3065 3066 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3067 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3068 3069 vcpu = &vm->vcpu[vcpuid]; 3070 3071 vcpu_lock(vcpu); 3072 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3073 vcpu_unlock(vcpu); 3074 3075 return (error); 3076 } 3077 3078 enum vcpu_state 3079 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3080 { 3081 struct vcpu *vcpu; 3082 enum vcpu_state state; 3083 3084 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3085 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3086 3087 vcpu = &vm->vcpu[vcpuid]; 3088 3089 vcpu_lock(vcpu); 3090 state = vcpu->state; 3091 if (hostcpu != NULL) 3092 *hostcpu = vcpu->hostcpu; 3093 vcpu_unlock(vcpu); 3094 3095 return (state); 3096 } 3097 3098 uint64_t 3099 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3100 { 3101 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3102 3103 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3104 3105 if (phys_adj) { 3106 /* Include any offset for the current physical CPU too */ 3107 extern hrtime_t tsc_gethrtime_tick_delta(void); 3108 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3109 } 3110 3111 return (vcpu_off); 3112 } 3113 3114 int 3115 vm_activate_cpu(struct vm *vm, int vcpuid) 3116 { 3117 3118 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3119 return (EINVAL); 3120 3121 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3122 return (EBUSY); 3123 3124 if (vm->suspend != 0) { 3125 return (EBUSY); 3126 } 3127 3128 VCPU_CTR0(vm, vcpuid, "activated"); 3129 CPU_SET_ATOMIC(vcpuid, 
&vm->active_cpus); 3130 3131 /* 3132 * It is possible that this vCPU was undergoing activation at the same 3133 * time that the VM was being suspended. If that happens to be the 3134 * case, it should reflect the suspended state immediately. 3135 */ 3136 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3137 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3138 } 3139 3140 return (0); 3141 } 3142 3143 int 3144 vm_suspend_cpu(struct vm *vm, int vcpuid) 3145 { 3146 int i; 3147 3148 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3149 return (EINVAL); 3150 3151 if (vcpuid == -1) { 3152 vm->debug_cpus = vm->active_cpus; 3153 for (i = 0; i < vm->maxcpus; i++) { 3154 if (CPU_ISSET(i, &vm->active_cpus)) 3155 vcpu_notify_event(vm, i); 3156 } 3157 } else { 3158 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3159 return (EINVAL); 3160 3161 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3162 vcpu_notify_event(vm, vcpuid); 3163 } 3164 return (0); 3165 } 3166 3167 int 3168 vm_resume_cpu(struct vm *vm, int vcpuid) 3169 { 3170 3171 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3172 return (EINVAL); 3173 3174 if (vcpuid == -1) { 3175 CPU_ZERO(&vm->debug_cpus); 3176 } else { 3177 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3178 return (EINVAL); 3179 3180 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3181 } 3182 return (0); 3183 } 3184 3185 static bool 3186 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3187 uint64_t entry_rip) 3188 { 3189 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3190 struct vm_exit *vme = &vcpu->exitinfo; 3191 bool bail = false; 3192 3193 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3194 3195 if (vm->suspend) { 3196 if (on_entry) { 3197 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3198 vm->suspend < VM_SUSPEND_LAST); 3199 3200 vme->exitcode = VM_EXITCODE_SUSPENDED; 3201 vme->u.suspended.how = vm->suspend; 3202 } else { 3203 /* 3204 * Handling VM suspend is complicated, so if that 3205 * condition is detected outside of VM-entry itself, 3206 * just emit a BOGUS exitcode so we take a lap to pick 3207 * up the event during an entry and are directed into 3208 * the vm_handle_suspend() logic. 3209 */ 3210 vme->exitcode = VM_EXITCODE_BOGUS; 3211 } 3212 bail = true; 3213 } 3214 if (vcpu->reqidle) { 3215 vme->exitcode = VM_EXITCODE_REQIDLE; 3216 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3217 3218 if (!on_entry) { 3219 /* 3220 * A reqidle request detected outside of VM-entry can be 3221 * handled directly by clearing the request (and taking 3222 * a lap to userspace). 3223 */ 3224 vcpu_assert_locked(vcpu); 3225 vcpu->reqidle = 0; 3226 } 3227 bail = true; 3228 } 3229 if (vcpu_should_yield(vm, vcpuid)) { 3230 vme->exitcode = VM_EXITCODE_BOGUS; 3231 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3232 bail = true; 3233 } 3234 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3235 vme->exitcode = VM_EXITCODE_DEBUG; 3236 bail = true; 3237 } 3238 3239 if (bail) { 3240 if (on_entry) { 3241 /* 3242 * If bailing out during VM-entry, the current %rip must 3243 * be recorded in the exitinfo. 3244 */ 3245 vme->rip = entry_rip; 3246 } 3247 vme->inst_length = 0; 3248 } 3249 return (bail); 3250 } 3251 3252 static bool 3253 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3254 { 3255 /* 3256 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or 3257 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3258 * structure, and we would only modify the exitcode. 
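	 *
	 * That is why the call below passes on_entry = false with an
	 * entry_rip of 0: vcpu_bailout_checks() records %rip into the exit
	 * info only when invoked from the entry path.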
3259 */ 3260 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3261 } 3262 3263 bool 3264 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3265 { 3266 /* 3267 * Bail-out checks done as part of VM entry require an updated %rip to 3268 * populate the vm_exit struct if any of the conditions of interest are 3269 * matched in the check. 3270 */ 3271 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3272 } 3273 3274 cpuset_t 3275 vm_active_cpus(struct vm *vm) 3276 { 3277 3278 return (vm->active_cpus); 3279 } 3280 3281 cpuset_t 3282 vm_debug_cpus(struct vm *vm) 3283 { 3284 3285 return (vm->debug_cpus); 3286 } 3287 3288 cpuset_t 3289 vm_suspended_cpus(struct vm *vm) 3290 { 3291 3292 return (vm->suspended_cpus); 3293 } 3294 3295 void * 3296 vcpu_stats(struct vm *vm, int vcpuid) 3297 { 3298 3299 return (vm->vcpu[vcpuid].stats); 3300 } 3301 3302 int 3303 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3304 { 3305 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3306 return (EINVAL); 3307 3308 *state = vm->vcpu[vcpuid].x2apic_state; 3309 3310 return (0); 3311 } 3312 3313 int 3314 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3315 { 3316 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3317 return (EINVAL); 3318 3319 if (state >= X2APIC_STATE_LAST) 3320 return (EINVAL); 3321 3322 vm->vcpu[vcpuid].x2apic_state = state; 3323 3324 vlapic_set_x2apic_state(vm, vcpuid, state); 3325 3326 return (0); 3327 } 3328 3329 /* 3330 * This function is called to ensure that a vcpu "sees" a pending event 3331 * as soon as possible: 3332 * - If the vcpu thread is sleeping then it is woken up. 3333 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3334 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3335 */ 3336 static void 3337 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3338 { 3339 int hostcpu; 3340 3341 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT); 3342 3343 hostcpu = vcpu->hostcpu; 3344 if (vcpu->state == VCPU_RUNNING) { 3345 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3346 if (hostcpu != curcpu) { 3347 if (ntype == VCPU_NOTIFY_APIC) { 3348 vlapic_post_intr(vcpu->vlapic, hostcpu); 3349 } else { 3350 poke_cpu(hostcpu); 3351 } 3352 } else { 3353 /* 3354 * If the 'vcpu' is running on 'curcpu' then it must 3355 * be sending a notification to itself (e.g. SELF_IPI). 3356 * The pending event will be picked up when the vcpu 3357 * transitions back to guest context.
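			 * No cross-CPU poke or wakeup is required in that
			 * case.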
3358 */ 3359 } 3360 } else { 3361 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3362 "with hostcpu %d", vcpu->state, hostcpu)); 3363 if (vcpu->state == VCPU_SLEEPING) { 3364 cv_signal(&vcpu->vcpu_cv); 3365 } 3366 } 3367 } 3368 3369 void 3370 vcpu_notify_event(struct vm *vm, int vcpuid) 3371 { 3372 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3373 3374 vcpu_lock(vcpu); 3375 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3376 vcpu_unlock(vcpu); 3377 } 3378 3379 void 3380 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3381 { 3382 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3383 3384 if (ntype == VCPU_NOTIFY_NONE) { 3385 return; 3386 } 3387 3388 vcpu_lock(vcpu); 3389 vcpu_notify_event_locked(vcpu, ntype); 3390 vcpu_unlock(vcpu); 3391 } 3392 3393 void 3394 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3395 { 3396 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3397 hrtime_t now = gethrtime(); 3398 3399 ASSERT3U(ustate, !=, vcpu->ustate); 3400 ASSERT3S(ustate, <, VU_MAX); 3401 ASSERT3S(ustate, >=, VU_INIT); 3402 3403 hrtime_t delta = now - vcpu->ustate_when; 3404 vcpu->ustate_total[vcpu->ustate] += delta; 3405 3406 membar_producer(); 3407 3408 vcpu->ustate_when = now; 3409 vcpu->ustate = ustate; 3410 } 3411 3412 struct vmspace * 3413 vm_get_vmspace(struct vm *vm) 3414 { 3415 3416 return (vm->vmspace); 3417 } 3418 3419 struct vm_client * 3420 vm_get_vmclient(struct vm *vm, int vcpuid) 3421 { 3422 return (vm->vcpu[vcpuid].vmclient); 3423 } 3424 3425 int 3426 vm_apicid2vcpuid(struct vm *vm, int apicid) 3427 { 3428 /* 3429 * XXX apic id is assumed to be numerically identical to vcpu id 3430 */ 3431 return (apicid); 3432 } 3433 3434 struct vatpic * 3435 vm_atpic(struct vm *vm) 3436 { 3437 return (vm->vatpic); 3438 } 3439 3440 struct vatpit * 3441 vm_atpit(struct vm *vm) 3442 { 3443 return (vm->vatpit); 3444 } 3445 3446 struct vpmtmr * 3447 vm_pmtmr(struct vm *vm) 3448 { 3449 3450 return (vm->vpmtmr); 3451 } 3452 3453 struct vrtc * 3454 vm_rtc(struct vm *vm) 3455 { 3456 3457 return (vm->vrtc); 3458 } 3459 3460 enum vm_reg_name 3461 vm_segment_name(int seg) 3462 { 3463 static enum vm_reg_name seg_names[] = { 3464 VM_REG_GUEST_ES, 3465 VM_REG_GUEST_CS, 3466 VM_REG_GUEST_SS, 3467 VM_REG_GUEST_DS, 3468 VM_REG_GUEST_FS, 3469 VM_REG_GUEST_GS 3470 }; 3471 3472 KASSERT(seg >= 0 && seg < nitems(seg_names), 3473 ("%s: invalid segment encoding %d", __func__, seg)); 3474 return (seg_names[seg]); 3475 } 3476 3477 void 3478 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3479 uint_t num_copyinfo) 3480 { 3481 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3482 if (copyinfo[idx].cookie != NULL) { 3483 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3484 } 3485 } 3486 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3487 } 3488 3489 int 3490 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3491 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3492 uint_t num_copyinfo, int *fault) 3493 { 3494 uint_t idx, nused; 3495 size_t n, off, remaining; 3496 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3497 3498 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3499 3500 nused = 0; 3501 remaining = len; 3502 while (remaining > 0) { 3503 uint64_t gpa; 3504 int error; 3505 3506 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3507 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3508 if (error || *fault) 3509 return (error); 3510 off = gpa & PAGEOFFSET; 3511 n 
= min(remaining, PAGESIZE - off); 3512 copyinfo[nused].gpa = gpa; 3513 copyinfo[nused].len = n; 3514 remaining -= n; 3515 gla += n; 3516 nused++; 3517 } 3518 3519 for (idx = 0; idx < nused; idx++) { 3520 vm_page_t *vmp; 3521 caddr_t hva; 3522 3523 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3524 if (vmp == NULL) { 3525 break; 3526 } 3527 if ((prot & PROT_WRITE) != 0) { 3528 hva = (caddr_t)vmp_get_writable(vmp); 3529 } else { 3530 hva = (caddr_t)vmp_get_readable(vmp); 3531 } 3532 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3533 copyinfo[idx].cookie = vmp; 3534 copyinfo[idx].prot = prot; 3535 } 3536 3537 if (idx != nused) { 3538 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3539 return (EFAULT); 3540 } else { 3541 *fault = 0; 3542 return (0); 3543 } 3544 } 3545 3546 void 3547 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3548 size_t len) 3549 { 3550 char *dst; 3551 int idx; 3552 3553 dst = kaddr; 3554 idx = 0; 3555 while (len > 0) { 3556 ASSERT(copyinfo[idx].prot & PROT_READ); 3557 3558 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3559 len -= copyinfo[idx].len; 3560 dst += copyinfo[idx].len; 3561 idx++; 3562 } 3563 } 3564 3565 void 3566 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3567 struct vm_copyinfo *copyinfo, size_t len) 3568 { 3569 const char *src; 3570 int idx; 3571 3572 src = kaddr; 3573 idx = 0; 3574 while (len > 0) { 3575 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3576 3577 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3578 len -= copyinfo[idx].len; 3579 src += copyinfo[idx].len; 3580 idx++; 3581 } 3582 } 3583 3584 /* 3585 * Return the amount of in-use and wired memory for the VM. Since 3586 * these are global stats, only return the values with for vCPU 0 3587 */ 3588 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3589 3590 static void 3591 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3592 { 3593 if (vcpu == 0) { 3594 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3595 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3596 } 3597 } 3598 3599 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3600 3601 int 3602 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3603 uint8_t bytes, uint32_t *val) 3604 { 3605 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3606 } 3607 3608 /* 3609 * bhyve-internal interfaces to attach or detach IO port handlers. 3610 * Must be called with VM write lock held for safety. 3611 */ 3612 int 3613 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3614 void **cookie) 3615 { 3616 int err; 3617 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3618 if (err == 0) { 3619 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3620 } 3621 return (err); 3622 } 3623 int 3624 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3625 void **old_arg) 3626 { 3627 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3628 int err; 3629 3630 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3631 if (err == 0) { 3632 *cookie = NULL; 3633 } 3634 return (err); 3635 } 3636 3637 /* 3638 * External driver interfaces to attach or detach IO port handlers. 3639 * Must be called with VM write lock held for safety. 
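 * These differ from the bhyve-internal attach/detach routines above in that
 * they register the handler with IOPF_DRV_HOOK and VERIFY the cookie on
 * unhook, since driver consumers are expected to be well-behaved.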
3640 */ 3641 int 3642 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3643 void *arg, void **cookie) 3644 { 3645 int err; 3646 3647 if (port == 0) { 3648 return (EINVAL); 3649 } 3650 3651 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3652 if (err == 0) { 3653 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3654 } 3655 return (err); 3656 } 3657 void 3658 vm_ioport_unhook(struct vm *vm, void **cookie) 3659 { 3660 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3661 ioport_handler_t old_func; 3662 void *old_arg; 3663 int err; 3664 3665 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3666 3667 /* ioport-hook-using drivers are expected to be well-behaved */ 3668 VERIFY0(err); 3669 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3670 3671 *cookie = NULL; 3672 } 3673 3674 int 3675 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3676 { 3677 struct vm *vm = ksp->ks_private; 3678 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3679 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3680 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3681 3682 ASSERT3U(vcpuid, <, VM_MAXCPU); 3683 3684 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3685 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3686 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3687 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3688 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3689 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3690 3691 return (0); 3692 } 3693