/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_bhyve_snapshot.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <x86/ifunc.h>

#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	struct mtx mtx;			/* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state state;		/* (o) vcpu state */
	int vcpuid;			/* (o) */
	int hostcpu;			/* (o) vcpu's host cpu */
	int reqidle;			/* (i) request vcpu to idle */
	struct vm *vm;			/* (o) */
	void *cookie;			/* (i) cpu-specific data */
	struct vlapic *vlapic;		/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t exitintinfo;		/* (i) events pending at VM exit */
	int nmi_pending;		/* (i) NMI pending */
	int extint_pending;		/* (i) INTR pending */
	int exception_pending;		/* (i) exception pending */
	int exc_vector;			/* (x) exception collateral */
	int exc_errcode_valid;
	uint32_t exc_errcode;
	struct savefpu *guestfpu;	/* (a,i) guest fpu state */
	uint64_t guest_xcr0;		/* (i) guest %xcr0 register */
	void *stats;			/* (a,i) statistics */
	struct vm_exit exitinfo;	/* (x) exit reason and collateral */
	cpuset_t exitinfo_cpuset;	/* (x) storage for vmexit handlers */
	uint64_t nextrip;		/* (x) next instruction to execute */
	uint64_t tsc_offset;		/* (o) TSC offsetting */
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 *
 * Locking:
 * [m] mem_segs_lock
 * [r] rendezvous_mtx
 * [v] reads require one frozen vcpu, writes require freezing all vcpus
 */
struct vm {
	void *cookie;			/* (i) cpu-specific data */
	void *iommu;			/* (x) iommu-specific data */
	struct vhpet *vhpet;		/* (i) virtual HPET */
	struct vioapic *vioapic;	/* (i) virtual ioapic */
	struct vatpic *vatpic;		/* (i) virtual atpic */
	struct vatpit *vatpit;		/* (i) virtual atpit */
	struct vpmtmr *vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc *vrtc;		/* (o) virtual RTC */
	volatile cpuset_t active_cpus;	/* (i) active vcpus */
	volatile cpuset_t debug_cpus;	/* (i) vcpus stopped for debug */
	cpuset_t startup_cpus;		/* (i) [r] waiting for startup */
	int suspend;			/* (i) stop VM execution */
	bool dying;			/* (o) is dying */
	volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;	/* (x) cpus in a hard halt */
	cpuset_t rendezvous_req_cpus;	/* (x) [r] rendezvous requested */
	cpuset_t rendezvous_done_cpus;	/* (x) [r] rendezvous finished */
	void *rendezvous_arg;		/* (x) [r] rendezvous func/arg */
	vm_rendezvous_func_t rendezvous_func;
	struct mtx rendezvous_mtx;	/* (o) rendezvous lock */
	struct vmspace *vmspace;	/* (o) guest's address space */
	struct vm_mem mem;		/* (i) [m+v] guest memory */
	char name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
	struct vcpu **vcpu;		/* (o) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t sockets;		/* (o) num of sockets */
	uint16_t cores;			/* (o) num of cores/socket */
	uint16_t threads;		/* (o) num of threads/core */
	uint16_t maxcpus;		/* (o) max pluggable cpus */
	struct sx vcpus_init_lock;	/* (o) */
};

#define	VMM_CTR0(vcpu, format)						\
	VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format)

#define	VMM_CTR1(vcpu, format, p1)					\
	VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1)

#define	VMM_CTR2(vcpu, format, p1, p2)					\
	VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2)

#define	VMM_CTR3(vcpu, format, p1, p2, p3)				\
	VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3)

#define	VMM_CTR4(vcpu, format, p1, p2, p3, p4)				\
	VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4)

static int vmm_initialized;

static void	vmmops_panic(void);

static void
vmmops_panic(void)
{
	panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()");
}

#define	DEFINE_VMMOPS_IFUNC(ret_type, opname, args)			\
    DEFINE_IFUNC(static, ret_type, vmmops_##opname, args)		\
    {									\
	if (vmm_is_intel())						\
		return (vmm_ops_intel.opname);				\
	else if (vmm_is_svm())						\
		return (vmm_ops_amd.opname);				\
	else								\
		return ((ret_type (*)args)vmmops_panic);		\
    }

DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum))
DEFINE_VMMOPS_IFUNC(int, modcleanup, (void))
DEFINE_VMMOPS_IFUNC(void, modsuspend, (void))
DEFINE_VMMOPS_IFUNC(void, modresume, (void))
DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap))
DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap,
    struct vm_eventinfo *info))
DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi))
DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
    int vcpu_id))
DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui))
DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval))
DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val))
DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc))
DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc))
DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval))
DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val))
DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min,
    vm_offset_t max))
DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace))
DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui))
DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic))
#ifdef BHYVE_SNAPSHOT
DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui,
    struct vm_snapshot_meta *meta))
DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now))
#endif
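
/*
 * Backend dispatch note: each vmmops_<op>() above is an ifunc whose resolver
 * runs once, when the module is linked into the kernel, and returns the
 * matching member of 'vmm_ops_intel' (VT-x) or 'vmm_ops_amd' (SVM), so no
 * per-call branch is needed.  As an illustrative sketch (not part of the
 * build), the getreg definition above expands to roughly:
 *
 *	DEFINE_IFUNC(static, int, vmmops_getreg,
 *	    (void *vcpui, int num, uint64_t *retval))
 *	{
 *		if (vmm_is_intel())
 *			return (vmm_ops_intel.getreg);
 *		else if (vmm_is_svm())
 *			return (vmm_ops_amd.getreg);
 *		else
 *			return ((int (*)(void *, int, uint64_t *))vmmops_panic);
 *	}
 */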

SDT_PROVIDER_DEFINE(vmm);

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static int trace_guest_exceptions;
SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
    &trace_guest_exceptions, 0,
    "Trap into hypervisor on all guest exceptions and reflect them back");

static int trap_wbinvd;
SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0,
    "WBINVD triggers a VM-exit");

u_int vm_maxcpu;
SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &vm_maxcpu, 0, "Maximum number of vCPUs");
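
/*
 * The CTLFLAG_RDTUN knobs above can be set as loader tunables; a purely
 * illustrative /boot/loader.conf (hypothetical values) might contain:
 *
 *	hw.vmm.maxcpu=32
 *	hw.vmm.halt_detection=0
 *	hw.vmm.trace_guest_exceptions=1
 *	hw.vmm.trap_wbinvd=1
 *
 * hw.vmm.maxcpu is marked CTLFLAG_NOFETCH and is instead fetched explicitly
 * in vmm_init(), after the mp_ncpus based default is chosen, while
 * hw.vmm.ipinum is read-only and merely reports the IPI vector allocated at
 * initialization.
 */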

static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);

/* global statistics */
VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus");
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt");
VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted");
VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted");
VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted");
VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted");
VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits");
VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted");
VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening");
VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening");
VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted");
VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted");
VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");

/*
 * Upper limit on vm_maxcpu.  Limited by use of uint16_t types for CPU
 * counts as well as range of vpid values for VT-x and by the capacity
 * of cpuset_t masks.  The call to new_unrhdr() in vpid_init() in
 * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below.
 */
#define	VM_MAXCPU	MIN(0xffff - 1, CPU_SETSIZE)

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
	vmmops_vlapic_cleanup(vcpu->vlapic);
	vmmops_vcpu_cleanup(vcpu->cookie);
	vcpu->cookie = NULL;
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
		vcpu_lock_destroy(vcpu);
		free(vcpu, M_VM);
	}
}

static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO);
	vcpu_lock_init(vcpu);
	vcpu->state = VCPU_IDLE;
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vm = vm;
	vcpu->guestfpu = fpu_save_area_alloc();
	vcpu->stats = vmm_stat_alloc();
	vcpu->tsc_offset = 0;
	return (vcpu);
}

static void
vcpu_init(struct vcpu *vcpu)
{
	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
	vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie);
	vm_set_x2apic_state(vcpu, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

int
vcpu_trace_exceptions(struct vcpu *vcpu)
{

	return (trace_guest_exceptions);
}

int
vcpu_trap_wbinvd(struct vcpu *vcpu)
{
	return (trap_wbinvd);
}

struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo);
}

cpuset_t *
vm_exitinfo_cpuset(struct vcpu *vcpu)
{
	return (&vcpu->exitinfo_cpuset);
}

static int
vmm_init(void)
{
	if (!vmm_is_hw_supported())
		return (ENXIO);

	vm_maxcpu = mp_ncpus;
	TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);

	if (vm_maxcpu > VM_MAXCPU) {
		printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
		vm_maxcpu = VM_MAXCPU;
	}
	if (vm_maxcpu == 0)
		vm_maxcpu = 1;

	vmm_host_state_init();

	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
	    &IDTVEC(justreturn));
	if (vmm_ipinum < 0)
		vmm_ipinum = IPI_AST;

	vmm_suspend_p = vmmops_modsuspend;
	vmm_resume_p = vmmops_modresume;

	return (vmmops_modinit(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		if (vmm_is_hw_supported()) {
			error = vmmdev_init();
			if (error != 0)
				break;
			error = vmm_init();
			if (error == 0)
				vmm_initialized = 1;
			else
				(void)vmmdev_cleanup();
		} else {
			error = ENXIO;
		}
		break;
	case MOD_UNLOAD:
		if (vmm_is_hw_supported()) {
			error = vmmdev_cleanup();
			if (error == 0) {
				vmm_suspend_p = NULL;
				vmm_resume_p = NULL;
				iommu_cleanup();
				if (vmm_ipinum != IPI_AST)
					lapic_ipi_free(vmm_ipinum);
				error = vmmops_modcleanup();
				/*
				 * Something bad happened - prevent new
				 * VMs from being created
				 */
				if (error)
					vmm_initialized = 0;
			}
		} else {
			error = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 * - vmm device initialization requires an initialized devfs.
 */
DECLARE_MODULE(vmm, vmm_kmod, MAX(SI_SUB_SMP, SI_SUB_DEVFS) + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);
	CPU_ZERO(&vm->startup_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	if (!create) {
		for (int i = 0; i < vm->maxcpus; i++) {
			if (vm->vcpu[i] != NULL)
				vcpu_init(vm->vcpu[i]);
		}
	}
}

void
vm_disable_vcpu_creation(struct vm *vm)
{
	sx_xlock(&vm->vcpus_init_lock);
	vm->dying = true;
	sx_xunlock(&vm->vcpus_init_lock);
}

struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
		return (NULL);

	vcpu = (struct vcpu *)
	    atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
	if (__predict_true(vcpu != NULL))
		return (vcpu);

	sx_xlock(&vm->vcpus_init_lock);
	vcpu = vm->vcpu[vcpuid];
	if (vcpu == NULL && !vm->dying) {
		vcpu = vcpu_alloc(vm, vcpuid);
		vcpu_init(vcpu);

		/*
		 * Ensure vCPU is fully created before updating pointer
		 * to permit unlocked reads above.
		 */
		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
		    (uintptr_t)vcpu);
	}
	sx_xunlock(&vm->vcpus_init_lock);
	return (vcpu);
}
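
/*
 * A note on the lockless fast path in vm_alloc_vcpu() above: readers load
 * vm->vcpu[vcpuid] with acquire semantics while the creator publishes the
 * fully constructed vcpu with a release store, so any reader that observes
 * a non-NULL pointer also observes the initialization done by vcpu_alloc()
 * and vcpu_init().  Creation itself is serialized by 'vcpus_init_lock',
 * which vm_disable_vcpu_creation() takes exclusively so that the 'dying'
 * check cannot race with VM teardown.
 */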

void
vm_slock_vcpus(struct vm *vm)
{
	sx_slock(&vm->vcpus_init_lock);
}

void
vm_unlock_vcpus(struct vm *vm)
{
	sx_unlock(&vm->vcpus_init_lock);
}

/*
 * The default CPU topology is a single thread per package.
579 */ 580 u_int cores_per_package = 1; 581 u_int threads_per_core = 1; 582 583 int 584 vm_create(const char *name, struct vm **retvm) 585 { 586 struct vm *vm; 587 struct vmspace *vmspace; 588 589 /* 590 * If vmm.ko could not be successfully initialized then don't attempt 591 * to create the virtual machine. 592 */ 593 if (!vmm_initialized) 594 return (ENXIO); 595 596 if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) == 597 VM_MAX_NAMELEN + 1) 598 return (EINVAL); 599 600 vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); 601 if (vmspace == NULL) 602 return (ENOMEM); 603 604 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 605 strcpy(vm->name, name); 606 vm->vmspace = vmspace; 607 vm_mem_init(&vm->mem); 608 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); 609 sx_init(&vm->vcpus_init_lock, "vm vcpus"); 610 vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK | 611 M_ZERO); 612 613 vm->sockets = 1; 614 vm->cores = cores_per_package; /* XXX backwards compatibility */ 615 vm->threads = threads_per_core; /* XXX backwards compatibility */ 616 vm->maxcpus = vm_maxcpu; 617 618 vm_init(vm, true); 619 620 *retvm = vm; 621 return (0); 622 } 623 624 void 625 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 626 uint16_t *threads, uint16_t *maxcpus) 627 { 628 *sockets = vm->sockets; 629 *cores = vm->cores; 630 *threads = vm->threads; 631 *maxcpus = vm->maxcpus; 632 } 633 634 uint16_t 635 vm_get_maxcpus(struct vm *vm) 636 { 637 return (vm->maxcpus); 638 } 639 640 int 641 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 642 uint16_t threads, uint16_t maxcpus __unused) 643 { 644 /* Ignore maxcpus. */ 645 if ((sockets * cores * threads) > vm->maxcpus) 646 return (EINVAL); 647 vm->sockets = sockets; 648 vm->cores = cores; 649 vm->threads = threads; 650 return(0); 651 } 652 653 static void 654 vm_cleanup(struct vm *vm, bool destroy) 655 { 656 if (destroy) 657 vm_xlock_memsegs(vm); 658 else 659 vm_assert_memseg_xlocked(vm); 660 661 ppt_unassign_all(vm); 662 663 if (vm->iommu != NULL) 664 iommu_destroy_domain(vm->iommu); 665 666 if (destroy) 667 vrtc_cleanup(vm->vrtc); 668 else 669 vrtc_reset(vm->vrtc); 670 vpmtmr_cleanup(vm->vpmtmr); 671 vatpit_cleanup(vm->vatpit); 672 vhpet_cleanup(vm->vhpet); 673 vatpic_cleanup(vm->vatpic); 674 vioapic_cleanup(vm->vioapic); 675 676 for (int i = 0; i < vm->maxcpus; i++) { 677 if (vm->vcpu[i] != NULL) 678 vcpu_cleanup(vm->vcpu[i], destroy); 679 } 680 681 vmmops_cleanup(vm->cookie); 682 683 vm_mem_cleanup(vm); 684 685 if (destroy) { 686 vm_mem_destroy(vm); 687 688 vmmops_vmspace_free(vm->vmspace); 689 vm->vmspace = NULL; 690 691 free(vm->vcpu, M_VM); 692 sx_destroy(&vm->vcpus_init_lock); 693 mtx_destroy(&vm->rendezvous_mtx); 694 } 695 } 696 697 void 698 vm_destroy(struct vm *vm) 699 { 700 vm_cleanup(vm, true); 701 free(vm, M_VM); 702 } 703 704 int 705 vm_reinit(struct vm *vm) 706 { 707 int error; 708 709 /* 710 * A virtual machine can be reset only if all vcpus are suspended. 
711 */ 712 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 713 vm_cleanup(vm, false); 714 vm_init(vm, false); 715 error = 0; 716 } else { 717 error = EBUSY; 718 } 719 720 return (error); 721 } 722 723 const char * 724 vm_name(struct vm *vm) 725 { 726 return (vm->name); 727 } 728 729 int 730 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 731 { 732 vm_object_t obj; 733 734 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 735 return (ENOMEM); 736 else 737 return (0); 738 } 739 740 int 741 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 742 { 743 744 vmm_mmio_free(vm->vmspace, gpa, len); 745 return (0); 746 } 747 748 static void 749 vm_iommu_map(struct vm *vm) 750 { 751 vm_paddr_t gpa, hpa; 752 struct vm_mem_map *mm; 753 int i; 754 755 sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); 756 757 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 758 if (!vm_memseg_sysmem(vm, i)) 759 continue; 760 761 mm = &vm->mem.mem_maps[i]; 762 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 763 ("iommu map found invalid memmap %#lx/%#lx/%#x", 764 mm->gpa, mm->len, mm->flags)); 765 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 766 continue; 767 mm->flags |= VM_MEMMAP_F_IOMMU; 768 769 for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { 770 hpa = pmap_extract(vmspace_pmap(vm->vmspace), gpa); 771 772 /* 773 * All mappings in the vmm vmspace must be 774 * present since they are managed by vmm in this way. 775 * Because we are in pass-through mode, the 776 * mappings must also be wired. This implies 777 * that all pages must be mapped and wired, 778 * allowing to use pmap_extract() and avoiding the 779 * need to use vm_gpa_hold_global(). 780 * 781 * This could change if/when we start 782 * supporting page faults on IOMMU maps. 783 */ 784 KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(hpa)), 785 ("vm_iommu_map: vm %p gpa %jx hpa %jx not wired", 786 vm, (uintmax_t)gpa, (uintmax_t)hpa)); 787 788 iommu_create_mapping(vm->iommu, gpa, hpa, PAGE_SIZE); 789 } 790 } 791 792 iommu_invalidate_tlb(iommu_host_domain()); 793 } 794 795 static void 796 vm_iommu_unmap(struct vm *vm) 797 { 798 vm_paddr_t gpa; 799 struct vm_mem_map *mm; 800 int i; 801 802 sx_assert(&vm->mem.mem_segs_lock, SX_LOCKED); 803 804 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 805 if (!vm_memseg_sysmem(vm, i)) 806 continue; 807 808 mm = &vm->mem.mem_maps[i]; 809 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 810 continue; 811 mm->flags &= ~VM_MEMMAP_F_IOMMU; 812 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 813 ("iommu unmap found invalid memmap %#lx/%#lx/%#x", 814 mm->gpa, mm->len, mm->flags)); 815 816 for (gpa = mm->gpa; gpa < mm->gpa + mm->len; gpa += PAGE_SIZE) { 817 KASSERT(vm_page_wired(PHYS_TO_VM_PAGE(pmap_extract( 818 vmspace_pmap(vm->vmspace), gpa))), 819 ("vm_iommu_unmap: vm %p gpa %jx not wired", 820 vm, (uintmax_t)gpa)); 821 iommu_remove_mapping(vm->iommu, gpa, PAGE_SIZE); 822 } 823 } 824 825 /* 826 * Invalidate the cached translations associated with the domain 827 * from which pages were removed. 
828 */ 829 iommu_invalidate_tlb(vm->iommu); 830 } 831 832 int 833 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) 834 { 835 int error; 836 837 error = ppt_unassign_device(vm, bus, slot, func); 838 if (error) 839 return (error); 840 841 if (ppt_assigned_devices(vm) == 0) 842 vm_iommu_unmap(vm); 843 844 return (0); 845 } 846 847 int 848 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) 849 { 850 int error; 851 vm_paddr_t maxaddr; 852 853 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 854 if (ppt_assigned_devices(vm) == 0) { 855 KASSERT(vm->iommu == NULL, 856 ("vm_assign_pptdev: iommu must be NULL")); 857 maxaddr = vmm_sysmem_maxaddr(vm); 858 vm->iommu = iommu_create_domain(maxaddr); 859 if (vm->iommu == NULL) 860 return (ENXIO); 861 vm_iommu_map(vm); 862 } 863 864 error = ppt_assign_device(vm, bus, slot, func); 865 return (error); 866 } 867 868 int 869 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval) 870 { 871 872 if (reg >= VM_REG_LAST) 873 return (EINVAL); 874 875 return (vmmops_getreg(vcpu->cookie, reg, retval)); 876 } 877 878 int 879 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) 880 { 881 int error; 882 883 if (reg >= VM_REG_LAST) 884 return (EINVAL); 885 886 error = vmmops_setreg(vcpu->cookie, reg, val); 887 if (error || reg != VM_REG_GUEST_RIP) 888 return (error); 889 890 /* Set 'nextrip' to match the value of %rip */ 891 VMM_CTR1(vcpu, "Setting nextrip to %#lx", val); 892 vcpu->nextrip = val; 893 return (0); 894 } 895 896 static bool 897 is_descriptor_table(int reg) 898 { 899 900 switch (reg) { 901 case VM_REG_GUEST_IDTR: 902 case VM_REG_GUEST_GDTR: 903 return (true); 904 default: 905 return (false); 906 } 907 } 908 909 static bool 910 is_segment_register(int reg) 911 { 912 913 switch (reg) { 914 case VM_REG_GUEST_ES: 915 case VM_REG_GUEST_CS: 916 case VM_REG_GUEST_SS: 917 case VM_REG_GUEST_DS: 918 case VM_REG_GUEST_FS: 919 case VM_REG_GUEST_GS: 920 case VM_REG_GUEST_TR: 921 case VM_REG_GUEST_LDTR: 922 return (true); 923 default: 924 return (false); 925 } 926 } 927 928 int 929 vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) 930 { 931 932 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 933 return (EINVAL); 934 935 return (vmmops_getdesc(vcpu->cookie, reg, desc)); 936 } 937 938 int 939 vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc) 940 { 941 942 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 943 return (EINVAL); 944 945 return (vmmops_setdesc(vcpu->cookie, reg, desc)); 946 } 947 948 static void 949 restore_guest_fpustate(struct vcpu *vcpu) 950 { 951 952 /* flush host state to the pcb */ 953 fpuexit(curthread); 954 955 /* restore guest FPU state */ 956 fpu_enable(); 957 fpurestore(vcpu->guestfpu); 958 959 /* restore guest XCR0 if XSAVE is enabled in the host */ 960 if (rcr4() & CR4_XSAVE) 961 load_xcr(0, vcpu->guest_xcr0); 962 963 /* 964 * The FPU is now "dirty" with the guest's state so disable 965 * the FPU to trap any access by the host. 
966 */ 967 fpu_disable(); 968 } 969 970 static void 971 save_guest_fpustate(struct vcpu *vcpu) 972 { 973 974 if ((rcr0() & CR0_TS) == 0) 975 panic("fpu emulation not enabled in host!"); 976 977 /* save guest XCR0 and restore host XCR0 */ 978 if (rcr4() & CR4_XSAVE) { 979 vcpu->guest_xcr0 = rxcr(0); 980 load_xcr(0, vmm_get_host_xcr0()); 981 } 982 983 /* save guest FPU state */ 984 fpu_enable(); 985 fpusave(vcpu->guestfpu); 986 fpu_disable(); 987 } 988 989 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 990 991 static int 992 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, 993 bool from_idle) 994 { 995 int error; 996 997 vcpu_assert_locked(vcpu); 998 999 /* 1000 * State transitions from the vmmdev_ioctl() must always begin from 1001 * the VCPU_IDLE state. This guarantees that there is only a single 1002 * ioctl() operating on a vcpu at any point. 1003 */ 1004 if (from_idle) { 1005 while (vcpu->state != VCPU_IDLE) { 1006 vcpu->reqidle = 1; 1007 vcpu_notify_event_locked(vcpu, false); 1008 VMM_CTR1(vcpu, "vcpu state change from %s to " 1009 "idle requested", vcpu_state2str(vcpu->state)); 1010 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); 1011 } 1012 } else { 1013 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1014 "vcpu idle state")); 1015 } 1016 1017 if (vcpu->state == VCPU_RUNNING) { 1018 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1019 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1020 } else { 1021 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1022 "vcpu that is not running", vcpu->hostcpu)); 1023 } 1024 1025 /* 1026 * The following state transitions are allowed: 1027 * IDLE -> FROZEN -> IDLE 1028 * FROZEN -> RUNNING -> FROZEN 1029 * FROZEN -> SLEEPING -> FROZEN 1030 */ 1031 switch (vcpu->state) { 1032 case VCPU_IDLE: 1033 case VCPU_RUNNING: 1034 case VCPU_SLEEPING: 1035 error = (newstate != VCPU_FROZEN); 1036 break; 1037 case VCPU_FROZEN: 1038 error = (newstate == VCPU_FROZEN); 1039 break; 1040 default: 1041 error = 1; 1042 break; 1043 } 1044 1045 if (error) 1046 return (EBUSY); 1047 1048 VMM_CTR2(vcpu, "vcpu state changed from %s to %s", 1049 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1050 1051 vcpu->state = newstate; 1052 if (newstate == VCPU_RUNNING) 1053 vcpu->hostcpu = curcpu; 1054 else 1055 vcpu->hostcpu = NOCPU; 1056 1057 if (newstate == VCPU_IDLE) 1058 wakeup(&vcpu->state); 1059 1060 return (0); 1061 } 1062 1063 static void 1064 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate) 1065 { 1066 int error; 1067 1068 if ((error = vcpu_set_state(vcpu, newstate, false)) != 0) 1069 panic("Error %d setting state to %d\n", error, newstate); 1070 } 1071 1072 static void 1073 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) 1074 { 1075 int error; 1076 1077 if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) 1078 panic("Error %d setting state to %d", error, newstate); 1079 } 1080 1081 static int 1082 vm_handle_rendezvous(struct vcpu *vcpu) 1083 { 1084 struct vm *vm = vcpu->vm; 1085 struct thread *td; 1086 int error, vcpuid; 1087 1088 error = 0; 1089 vcpuid = vcpu->vcpuid; 1090 td = curthread; 1091 mtx_lock(&vm->rendezvous_mtx); 1092 while (vm->rendezvous_func != NULL) { 1093 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ 1094 CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus); 1095 1096 if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && 1097 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { 1098 
VMM_CTR0(vcpu, "Calling rendezvous func"); 1099 (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg); 1100 CPU_SET(vcpuid, &vm->rendezvous_done_cpus); 1101 } 1102 if (CPU_CMP(&vm->rendezvous_req_cpus, 1103 &vm->rendezvous_done_cpus) == 0) { 1104 VMM_CTR0(vcpu, "Rendezvous completed"); 1105 CPU_ZERO(&vm->rendezvous_req_cpus); 1106 vm->rendezvous_func = NULL; 1107 wakeup(&vm->rendezvous_func); 1108 break; 1109 } 1110 VMM_CTR0(vcpu, "Wait for rendezvous completion"); 1111 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, 1112 "vmrndv", hz); 1113 if (td_ast_pending(td, TDA_SUSPEND)) { 1114 mtx_unlock(&vm->rendezvous_mtx); 1115 error = thread_check_susp(td, true); 1116 if (error != 0) 1117 return (error); 1118 mtx_lock(&vm->rendezvous_mtx); 1119 } 1120 } 1121 mtx_unlock(&vm->rendezvous_mtx); 1122 return (0); 1123 } 1124 1125 /* 1126 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1127 */ 1128 static int 1129 vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu) 1130 { 1131 struct vm *vm = vcpu->vm; 1132 const char *wmesg; 1133 struct thread *td; 1134 int error, t, vcpuid, vcpu_halted, vm_halted; 1135 1136 vcpuid = vcpu->vcpuid; 1137 vcpu_halted = 0; 1138 vm_halted = 0; 1139 error = 0; 1140 td = curthread; 1141 1142 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1143 1144 vcpu_lock(vcpu); 1145 while (1) { 1146 /* 1147 * Do a final check for pending NMI or interrupts before 1148 * really putting this thread to sleep. Also check for 1149 * software events that would cause this vcpu to wakeup. 1150 * 1151 * These interrupts/events could have happened after the 1152 * vcpu returned from vmmops_run() and before it acquired the 1153 * vcpu lock above. 1154 */ 1155 if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) 1156 break; 1157 if (vm_nmi_pending(vcpu)) 1158 break; 1159 if (!intr_disabled) { 1160 if (vm_extint_pending(vcpu) || 1161 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1162 break; 1163 } 1164 } 1165 1166 /* Don't go to sleep if the vcpu thread needs to yield */ 1167 if (vcpu_should_yield(vcpu)) 1168 break; 1169 1170 if (vcpu_debugged(vcpu)) 1171 break; 1172 1173 /* 1174 * Some Linux guests implement "halt" by having all vcpus 1175 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1176 * track of the vcpus that have entered this state. When all 1177 * vcpus enter the halted state the virtual machine is halted. 1178 */ 1179 if (intr_disabled) { 1180 wmesg = "vmhalt"; 1181 VMM_CTR0(vcpu, "Halted"); 1182 if (!vcpu_halted && halt_detection_enabled) { 1183 vcpu_halted = 1; 1184 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1185 } 1186 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1187 vm_halted = 1; 1188 break; 1189 } 1190 } else { 1191 wmesg = "vmidle"; 1192 } 1193 1194 t = ticks; 1195 vcpu_require_state_locked(vcpu, VCPU_SLEEPING); 1196 /* 1197 * XXX msleep_spin() cannot be interrupted by signals so 1198 * wake up periodically to check pending signals. 
1199 */ 1200 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); 1201 vcpu_require_state_locked(vcpu, VCPU_FROZEN); 1202 vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t); 1203 if (td_ast_pending(td, TDA_SUSPEND)) { 1204 vcpu_unlock(vcpu); 1205 error = thread_check_susp(td, false); 1206 if (error != 0) { 1207 if (vcpu_halted) { 1208 CPU_CLR_ATOMIC(vcpuid, 1209 &vm->halted_cpus); 1210 } 1211 return (error); 1212 } 1213 vcpu_lock(vcpu); 1214 } 1215 } 1216 1217 if (vcpu_halted) 1218 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1219 1220 vcpu_unlock(vcpu); 1221 1222 if (vm_halted) 1223 vm_suspend(vm, VM_SUSPEND_HALT); 1224 1225 return (0); 1226 } 1227 1228 static int 1229 vm_handle_paging(struct vcpu *vcpu, bool *retu) 1230 { 1231 struct vm *vm = vcpu->vm; 1232 int rv, ftype; 1233 struct vm_map *map; 1234 struct vm_exit *vme; 1235 1236 vme = &vcpu->exitinfo; 1237 1238 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1239 __func__, vme->inst_length)); 1240 1241 ftype = vme->u.paging.fault_type; 1242 KASSERT(ftype == VM_PROT_READ || 1243 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, 1244 ("vm_handle_paging: invalid fault_type %d", ftype)); 1245 1246 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { 1247 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), 1248 vme->u.paging.gpa, ftype); 1249 if (rv == 0) { 1250 VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx", 1251 ftype == VM_PROT_READ ? "accessed" : "dirty", 1252 vme->u.paging.gpa); 1253 goto done; 1254 } 1255 } 1256 1257 map = &vm->vmspace->vm_map; 1258 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); 1259 1260 VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, " 1261 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1262 1263 if (rv != KERN_SUCCESS) 1264 return (EFAULT); 1265 done: 1266 return (0); 1267 } 1268 1269 static int 1270 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu) 1271 { 1272 struct vie *vie; 1273 struct vm_exit *vme; 1274 uint64_t gla, gpa, cs_base; 1275 struct vm_guest_paging *paging; 1276 mem_region_read_t mread; 1277 mem_region_write_t mwrite; 1278 enum vm_cpu_mode cpu_mode; 1279 int cs_d, error, fault; 1280 1281 vme = &vcpu->exitinfo; 1282 1283 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1284 __func__, vme->inst_length)); 1285 1286 gla = vme->u.inst_emul.gla; 1287 gpa = vme->u.inst_emul.gpa; 1288 cs_base = vme->u.inst_emul.cs_base; 1289 cs_d = vme->u.inst_emul.cs_d; 1290 vie = &vme->u.inst_emul.vie; 1291 paging = &vme->u.inst_emul.paging; 1292 cpu_mode = paging->cpu_mode; 1293 1294 VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa); 1295 1296 /* Fetch, decode and emulate the faulting instruction */ 1297 if (vie->num_valid == 0) { 1298 error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base, 1299 VIE_INST_SIZE, vie, &fault); 1300 } else { 1301 /* 1302 * The instruction bytes have already been copied into 'vie' 1303 */ 1304 error = fault = 0; 1305 } 1306 if (error || fault) 1307 return (error); 1308 1309 if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) { 1310 VMM_CTR1(vcpu, "Error decoding instruction at %#lx", 1311 vme->rip + cs_base); 1312 *retu = true; /* dump instruction bytes in userspace */ 1313 return (0); 1314 } 1315 1316 /* 1317 * Update 'nextrip' based on the length of the emulated instruction. 
1318 */ 1319 vme->inst_length = vie->num_processed; 1320 vcpu->nextrip += vie->num_processed; 1321 VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding", 1322 vcpu->nextrip); 1323 1324 /* return to userland unless this is an in-kernel emulated device */ 1325 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1326 mread = lapic_mmio_read; 1327 mwrite = lapic_mmio_write; 1328 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1329 mread = vioapic_mmio_read; 1330 mwrite = vioapic_mmio_write; 1331 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1332 mread = vhpet_mmio_read; 1333 mwrite = vhpet_mmio_write; 1334 } else { 1335 *retu = true; 1336 return (0); 1337 } 1338 1339 error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite, 1340 retu); 1341 1342 return (error); 1343 } 1344 1345 static int 1346 vm_handle_suspend(struct vcpu *vcpu, bool *retu) 1347 { 1348 struct vm *vm = vcpu->vm; 1349 int error, i; 1350 struct thread *td; 1351 1352 error = 0; 1353 td = curthread; 1354 1355 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus); 1356 1357 /* 1358 * Wait until all 'active_cpus' have suspended themselves. 1359 * 1360 * Since a VM may be suspended at any time including when one or 1361 * more vcpus are doing a rendezvous we need to call the rendezvous 1362 * handler while we are waiting to prevent a deadlock. 1363 */ 1364 vcpu_lock(vcpu); 1365 while (error == 0) { 1366 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1367 VMM_CTR0(vcpu, "All vcpus suspended"); 1368 break; 1369 } 1370 1371 if (vm->rendezvous_func == NULL) { 1372 VMM_CTR0(vcpu, "Sleeping during suspend"); 1373 vcpu_require_state_locked(vcpu, VCPU_SLEEPING); 1374 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); 1375 vcpu_require_state_locked(vcpu, VCPU_FROZEN); 1376 if (td_ast_pending(td, TDA_SUSPEND)) { 1377 vcpu_unlock(vcpu); 1378 error = thread_check_susp(td, false); 1379 vcpu_lock(vcpu); 1380 } 1381 } else { 1382 VMM_CTR0(vcpu, "Rendezvous during suspend"); 1383 vcpu_unlock(vcpu); 1384 error = vm_handle_rendezvous(vcpu); 1385 vcpu_lock(vcpu); 1386 } 1387 } 1388 vcpu_unlock(vcpu); 1389 1390 /* 1391 * Wakeup the other sleeping vcpus and return to userspace. 1392 */ 1393 for (i = 0; i < vm->maxcpus; i++) { 1394 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1395 vcpu_notify_event(vm_vcpu(vm, i), false); 1396 } 1397 } 1398 1399 *retu = true; 1400 return (error); 1401 } 1402 1403 static int 1404 vm_handle_reqidle(struct vcpu *vcpu, bool *retu) 1405 { 1406 vcpu_lock(vcpu); 1407 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1408 vcpu->reqidle = 0; 1409 vcpu_unlock(vcpu); 1410 *retu = true; 1411 return (0); 1412 } 1413 1414 static int 1415 vm_handle_db(struct vcpu *vcpu, struct vm_exit *vme, bool *retu) 1416 { 1417 int error, fault; 1418 uint64_t rsp; 1419 uint64_t rflags; 1420 struct vm_copyinfo copyinfo[2]; 1421 1422 *retu = true; 1423 if (!vme->u.dbg.pushf_intercept || vme->u.dbg.tf_shadow_val != 0) { 1424 return (0); 1425 } 1426 1427 vm_get_register(vcpu, VM_REG_GUEST_RSP, &rsp); 1428 error = vm_copy_setup(vcpu, &vme->u.dbg.paging, rsp, sizeof(uint64_t), 1429 VM_PROT_RW, copyinfo, nitems(copyinfo), &fault); 1430 if (error != 0 || fault != 0) { 1431 *retu = false; 1432 return (EINVAL); 1433 } 1434 1435 /* Read pushed rflags value from top of stack. */ 1436 vm_copyin(copyinfo, &rflags, sizeof(uint64_t)); 1437 1438 /* Clear TF bit. */ 1439 rflags &= ~(PSL_T); 1440 1441 /* Write updated value back to memory. 
*/ 1442 vm_copyout(&rflags, copyinfo, sizeof(uint64_t)); 1443 vm_copy_teardown(copyinfo, nitems(copyinfo)); 1444 1445 return (0); 1446 } 1447 1448 int 1449 vm_suspend(struct vm *vm, enum vm_suspend_how how) 1450 { 1451 int i; 1452 1453 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1454 return (EINVAL); 1455 1456 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { 1457 VM_CTR2(vm, "virtual machine already suspended %d/%d", 1458 vm->suspend, how); 1459 return (EALREADY); 1460 } 1461 1462 VM_CTR1(vm, "virtual machine successfully suspended %d", how); 1463 1464 /* 1465 * Notify all active vcpus that they are now suspended. 1466 */ 1467 for (i = 0; i < vm->maxcpus; i++) { 1468 if (CPU_ISSET(i, &vm->active_cpus)) 1469 vcpu_notify_event(vm_vcpu(vm, i), false); 1470 } 1471 1472 return (0); 1473 } 1474 1475 void 1476 vm_exit_suspended(struct vcpu *vcpu, uint64_t rip) 1477 { 1478 struct vm *vm = vcpu->vm; 1479 struct vm_exit *vmexit; 1480 1481 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, 1482 ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); 1483 1484 vmexit = vm_exitinfo(vcpu); 1485 vmexit->rip = rip; 1486 vmexit->inst_length = 0; 1487 vmexit->exitcode = VM_EXITCODE_SUSPENDED; 1488 vmexit->u.suspended.how = vm->suspend; 1489 } 1490 1491 void 1492 vm_exit_debug(struct vcpu *vcpu, uint64_t rip) 1493 { 1494 struct vm_exit *vmexit; 1495 1496 vmexit = vm_exitinfo(vcpu); 1497 vmexit->rip = rip; 1498 vmexit->inst_length = 0; 1499 vmexit->exitcode = VM_EXITCODE_DEBUG; 1500 } 1501 1502 void 1503 vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip) 1504 { 1505 struct vm_exit *vmexit; 1506 1507 vmexit = vm_exitinfo(vcpu); 1508 vmexit->rip = rip; 1509 vmexit->inst_length = 0; 1510 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 1511 vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1); 1512 } 1513 1514 void 1515 vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip) 1516 { 1517 struct vm_exit *vmexit; 1518 1519 vmexit = vm_exitinfo(vcpu); 1520 vmexit->rip = rip; 1521 vmexit->inst_length = 0; 1522 vmexit->exitcode = VM_EXITCODE_REQIDLE; 1523 vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1); 1524 } 1525 1526 void 1527 vm_exit_astpending(struct vcpu *vcpu, uint64_t rip) 1528 { 1529 struct vm_exit *vmexit; 1530 1531 vmexit = vm_exitinfo(vcpu); 1532 vmexit->rip = rip; 1533 vmexit->inst_length = 0; 1534 vmexit->exitcode = VM_EXITCODE_BOGUS; 1535 vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1); 1536 } 1537 1538 int 1539 vm_run(struct vcpu *vcpu) 1540 { 1541 struct vm *vm = vcpu->vm; 1542 struct vm_eventinfo evinfo; 1543 int error, vcpuid; 1544 struct pcb *pcb; 1545 uint64_t tscval; 1546 struct vm_exit *vme; 1547 bool retu, intr_disabled; 1548 pmap_t pmap; 1549 1550 vcpuid = vcpu->vcpuid; 1551 1552 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 1553 return (EINVAL); 1554 1555 if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) 1556 return (EINVAL); 1557 1558 pmap = vmspace_pmap(vm->vmspace); 1559 vme = &vcpu->exitinfo; 1560 evinfo.rptr = &vm->rendezvous_req_cpus; 1561 evinfo.sptr = &vm->suspend; 1562 evinfo.iptr = &vcpu->reqidle; 1563 restart: 1564 critical_enter(); 1565 1566 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1567 ("vm_run: absurd pm_active")); 1568 1569 tscval = rdtsc(); 1570 1571 pcb = PCPU_GET(curpcb); 1572 set_pcb_flags(pcb, PCB_FULL_IRET); 1573 1574 restore_guest_fpustate(vcpu); 1575 1576 vcpu_require_state(vcpu, VCPU_RUNNING); 1577 error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo); 1578 vcpu_require_state(vcpu, VCPU_FROZEN); 1579 1580 save_guest_fpustate(vcpu); 1581 1582 vmm_stat_incr(vcpu, 
VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 1583 1584 critical_exit(); 1585 1586 if (error == 0) { 1587 retu = false; 1588 vcpu->nextrip = vme->rip + vme->inst_length; 1589 switch (vme->exitcode) { 1590 case VM_EXITCODE_REQIDLE: 1591 error = vm_handle_reqidle(vcpu, &retu); 1592 break; 1593 case VM_EXITCODE_SUSPENDED: 1594 error = vm_handle_suspend(vcpu, &retu); 1595 break; 1596 case VM_EXITCODE_IOAPIC_EOI: 1597 vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector); 1598 break; 1599 case VM_EXITCODE_RENDEZVOUS: 1600 error = vm_handle_rendezvous(vcpu); 1601 break; 1602 case VM_EXITCODE_HLT: 1603 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 1604 error = vm_handle_hlt(vcpu, intr_disabled, &retu); 1605 break; 1606 case VM_EXITCODE_PAGING: 1607 error = vm_handle_paging(vcpu, &retu); 1608 break; 1609 case VM_EXITCODE_INST_EMUL: 1610 error = vm_handle_inst_emul(vcpu, &retu); 1611 break; 1612 case VM_EXITCODE_INOUT: 1613 case VM_EXITCODE_INOUT_STR: 1614 error = vm_handle_inout(vcpu, vme, &retu); 1615 break; 1616 case VM_EXITCODE_DB: 1617 error = vm_handle_db(vcpu, vme, &retu); 1618 break; 1619 case VM_EXITCODE_MONITOR: 1620 case VM_EXITCODE_MWAIT: 1621 case VM_EXITCODE_VMINSN: 1622 vm_inject_ud(vcpu); 1623 break; 1624 default: 1625 retu = true; /* handled in userland */ 1626 break; 1627 } 1628 } 1629 1630 /* 1631 * VM_EXITCODE_INST_EMUL could access the apic which could transform the 1632 * exit code into VM_EXITCODE_IPI. 1633 */ 1634 if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) 1635 error = vm_handle_ipi(vcpu, vme, &retu); 1636 1637 if (error == 0 && retu == false) 1638 goto restart; 1639 1640 vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1); 1641 VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode); 1642 1643 return (error); 1644 } 1645 1646 int 1647 vm_restart_instruction(struct vcpu *vcpu) 1648 { 1649 enum vcpu_state state; 1650 uint64_t rip; 1651 int error __diagused; 1652 1653 state = vcpu_get_state(vcpu, NULL); 1654 if (state == VCPU_RUNNING) { 1655 /* 1656 * When a vcpu is "running" the next instruction is determined 1657 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 1658 * Thus setting 'inst_length' to zero will cause the current 1659 * instruction to be restarted. 1660 */ 1661 vcpu->exitinfo.inst_length = 0; 1662 VMM_CTR1(vcpu, "restarting instruction at %#lx by " 1663 "setting inst_length to zero", vcpu->exitinfo.rip); 1664 } else if (state == VCPU_FROZEN) { 1665 /* 1666 * When a vcpu is "frozen" it is outside the critical section 1667 * around vmmops_run() and 'nextrip' points to the next 1668 * instruction. Thus instruction restart is achieved by setting 1669 * 'nextrip' to the vcpu's %rip. 
1670 */ 1671 error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip); 1672 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 1673 VMM_CTR2(vcpu, "restarting instruction by updating " 1674 "nextrip from %#lx to %#lx", vcpu->nextrip, rip); 1675 vcpu->nextrip = rip; 1676 } else { 1677 panic("%s: invalid state %d", __func__, state); 1678 } 1679 return (0); 1680 } 1681 1682 int 1683 vm_exit_intinfo(struct vcpu *vcpu, uint64_t info) 1684 { 1685 int type, vector; 1686 1687 if (info & VM_INTINFO_VALID) { 1688 type = info & VM_INTINFO_TYPE; 1689 vector = info & 0xff; 1690 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 1691 return (EINVAL); 1692 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 1693 return (EINVAL); 1694 if (info & VM_INTINFO_RSVD) 1695 return (EINVAL); 1696 } else { 1697 info = 0; 1698 } 1699 VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info); 1700 vcpu->exitintinfo = info; 1701 return (0); 1702 } 1703 1704 enum exc_class { 1705 EXC_BENIGN, 1706 EXC_CONTRIBUTORY, 1707 EXC_PAGEFAULT 1708 }; 1709 1710 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 1711 1712 static enum exc_class 1713 exception_class(uint64_t info) 1714 { 1715 int type, vector; 1716 1717 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); 1718 type = info & VM_INTINFO_TYPE; 1719 vector = info & 0xff; 1720 1721 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 1722 switch (type) { 1723 case VM_INTINFO_HWINTR: 1724 case VM_INTINFO_SWINTR: 1725 case VM_INTINFO_NMI: 1726 return (EXC_BENIGN); 1727 default: 1728 /* 1729 * Hardware exception. 1730 * 1731 * SVM and VT-x use identical type values to represent NMI, 1732 * hardware interrupt and software interrupt. 1733 * 1734 * SVM uses type '3' for all exceptions. VT-x uses type '3' 1735 * for exceptions except #BP and #OF. #BP and #OF use a type 1736 * value of '5' or '6'. Therefore we don't check for explicit 1737 * values of 'type' to classify 'intinfo' into a hardware 1738 * exception. 1739 */ 1740 break; 1741 } 1742 1743 switch (vector) { 1744 case IDT_PF: 1745 case IDT_VE: 1746 return (EXC_PAGEFAULT); 1747 case IDT_DE: 1748 case IDT_TS: 1749 case IDT_NP: 1750 case IDT_SS: 1751 case IDT_GP: 1752 return (EXC_CONTRIBUTORY); 1753 default: 1754 return (EXC_BENIGN); 1755 } 1756 } 1757 1758 static int 1759 nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2, 1760 uint64_t *retinfo) 1761 { 1762 enum exc_class exc1, exc2; 1763 int type1, vector1; 1764 1765 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); 1766 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); 1767 1768 /* 1769 * If an exception occurs while attempting to call the double-fault 1770 * handler the processor enters shutdown mode (aka triple fault). 1771 */ 1772 type1 = info1 & VM_INTINFO_TYPE; 1773 vector1 = info1 & 0xff; 1774 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 1775 VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)", 1776 info1, info2); 1777 vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT); 1778 *retinfo = 0; 1779 return (0); 1780 } 1781 1782 /* 1783 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 1784 */ 1785 exc1 = exception_class(info1); 1786 exc2 = exception_class(info2); 1787 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 1788 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 1789 /* Convert nested fault into a double fault. 
*/ 1790 *retinfo = IDT_DF; 1791 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1792 *retinfo |= VM_INTINFO_DEL_ERRCODE; 1793 } else { 1794 /* Handle exceptions serially */ 1795 *retinfo = info2; 1796 } 1797 return (1); 1798 } 1799 1800 static uint64_t 1801 vcpu_exception_intinfo(struct vcpu *vcpu) 1802 { 1803 uint64_t info = 0; 1804 1805 if (vcpu->exception_pending) { 1806 info = vcpu->exc_vector & 0xff; 1807 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1808 if (vcpu->exc_errcode_valid) { 1809 info |= VM_INTINFO_DEL_ERRCODE; 1810 info |= (uint64_t)vcpu->exc_errcode << 32; 1811 } 1812 } 1813 return (info); 1814 } 1815 1816 int 1817 vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo) 1818 { 1819 uint64_t info1, info2; 1820 int valid; 1821 1822 info1 = vcpu->exitintinfo; 1823 vcpu->exitintinfo = 0; 1824 1825 info2 = 0; 1826 if (vcpu->exception_pending) { 1827 info2 = vcpu_exception_intinfo(vcpu); 1828 vcpu->exception_pending = 0; 1829 VMM_CTR2(vcpu, "Exception %d delivered: %#lx", 1830 vcpu->exc_vector, info2); 1831 } 1832 1833 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 1834 valid = nested_fault(vcpu, info1, info2, retinfo); 1835 } else if (info1 & VM_INTINFO_VALID) { 1836 *retinfo = info1; 1837 valid = 1; 1838 } else if (info2 & VM_INTINFO_VALID) { 1839 *retinfo = info2; 1840 valid = 1; 1841 } else { 1842 valid = 0; 1843 } 1844 1845 if (valid) { 1846 VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), " 1847 "retinfo(%#lx)", __func__, info1, info2, *retinfo); 1848 } 1849 1850 return (valid); 1851 } 1852 1853 int 1854 vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) 1855 { 1856 *info1 = vcpu->exitintinfo; 1857 *info2 = vcpu_exception_intinfo(vcpu); 1858 return (0); 1859 } 1860 1861 int 1862 vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid, 1863 uint32_t errcode, int restart_instruction) 1864 { 1865 uint64_t regval; 1866 int error __diagused; 1867 1868 if (vector < 0 || vector >= 32) 1869 return (EINVAL); 1870 1871 /* 1872 * A double fault exception should never be injected directly into 1873 * the guest. It is a derived exception that results from specific 1874 * combinations of nested faults. 1875 */ 1876 if (vector == IDT_DF) 1877 return (EINVAL); 1878 1879 if (vcpu->exception_pending) { 1880 VMM_CTR2(vcpu, "Unable to inject exception %d due to " 1881 "pending exception %d", vector, vcpu->exc_vector); 1882 return (EBUSY); 1883 } 1884 1885 if (errcode_valid) { 1886 /* 1887 * Exceptions don't deliver an error code in real mode. 1888 */ 1889 error = vm_get_register(vcpu, VM_REG_GUEST_CR0, ®val); 1890 KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); 1891 if (!(regval & CR0_PE)) 1892 errcode_valid = 0; 1893 } 1894 1895 /* 1896 * From section 26.6.1 "Interruptibility State" in Intel SDM: 1897 * 1898 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 1899 * one instruction or incurs an exception. 
1900 */ 1901 error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0); 1902 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 1903 __func__, error)); 1904 1905 if (restart_instruction) 1906 vm_restart_instruction(vcpu); 1907 1908 vcpu->exception_pending = 1; 1909 vcpu->exc_vector = vector; 1910 vcpu->exc_errcode = errcode; 1911 vcpu->exc_errcode_valid = errcode_valid; 1912 VMM_CTR1(vcpu, "Exception %d pending", vector); 1913 return (0); 1914 } 1915 1916 void 1917 vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) 1918 { 1919 int error __diagused, restart_instruction; 1920 1921 restart_instruction = 1; 1922 1923 error = vm_inject_exception(vcpu, vector, errcode_valid, 1924 errcode, restart_instruction); 1925 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 1926 } 1927 1928 void 1929 vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2) 1930 { 1931 int error __diagused; 1932 1933 VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx", 1934 error_code, cr2); 1935 1936 error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2); 1937 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 1938 1939 vm_inject_fault(vcpu, IDT_PF, 1, error_code); 1940 } 1941 1942 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 1943 1944 int 1945 vm_inject_nmi(struct vcpu *vcpu) 1946 { 1947 1948 vcpu->nmi_pending = 1; 1949 vcpu_notify_event(vcpu, false); 1950 return (0); 1951 } 1952 1953 int 1954 vm_nmi_pending(struct vcpu *vcpu) 1955 { 1956 return (vcpu->nmi_pending); 1957 } 1958 1959 void 1960 vm_nmi_clear(struct vcpu *vcpu) 1961 { 1962 if (vcpu->nmi_pending == 0) 1963 panic("vm_nmi_clear: inconsistent nmi_pending state"); 1964 1965 vcpu->nmi_pending = 0; 1966 vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1); 1967 } 1968 1969 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 1970 1971 int 1972 vm_inject_extint(struct vcpu *vcpu) 1973 { 1974 1975 vcpu->extint_pending = 1; 1976 vcpu_notify_event(vcpu, false); 1977 return (0); 1978 } 1979 1980 int 1981 vm_extint_pending(struct vcpu *vcpu) 1982 { 1983 return (vcpu->extint_pending); 1984 } 1985 1986 void 1987 vm_extint_clear(struct vcpu *vcpu) 1988 { 1989 if (vcpu->extint_pending == 0) 1990 panic("vm_extint_clear: inconsistent extint_pending state"); 1991 1992 vcpu->extint_pending = 0; 1993 vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1); 1994 } 1995 1996 int 1997 vm_get_capability(struct vcpu *vcpu, int type, int *retval) 1998 { 1999 if (type < 0 || type >= VM_CAP_MAX) 2000 return (EINVAL); 2001 2002 return (vmmops_getcap(vcpu->cookie, type, retval)); 2003 } 2004 2005 int 2006 vm_set_capability(struct vcpu *vcpu, int type, int val) 2007 { 2008 if (type < 0 || type >= VM_CAP_MAX) 2009 return (EINVAL); 2010 2011 return (vmmops_setcap(vcpu->cookie, type, val)); 2012 } 2013 2014 struct vm * 2015 vcpu_vm(struct vcpu *vcpu) 2016 { 2017 return (vcpu->vm); 2018 } 2019 2020 int 2021 vcpu_vcpuid(struct vcpu *vcpu) 2022 { 2023 return (vcpu->vcpuid); 2024 } 2025 2026 struct vcpu * 2027 vm_vcpu(struct vm *vm, int vcpuid) 2028 { 2029 return (vm->vcpu[vcpuid]); 2030 } 2031 2032 struct vlapic * 2033 vm_lapic(struct vcpu *vcpu) 2034 { 2035 return (vcpu->vlapic); 2036 } 2037 2038 struct vioapic * 2039 vm_ioapic(struct vm *vm) 2040 { 2041 2042 return (vm->vioapic); 2043 } 2044 2045 struct vhpet * 2046 vm_hpet(struct vm *vm) 2047 { 2048 2049 return (vm->vhpet); 2050 } 2051 2052 bool 2053 vmm_is_pptdev(int bus, int slot, int func) 2054 { 2055 int b, f, i, n, s; 2056 char *val, *cp, 
*cp2; 2057 bool found; 2058 2059 /* 2060 * XXX 2061 * The length of an environment variable is limited to 128 bytes which 2062 * puts an upper limit on the number of passthru devices that may be 2063 * specified using a single environment variable. 2064 * 2065 * Work around this by scanning multiple environment variable 2066 * names instead of a single one - yuck! 2067 */ 2068 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; 2069 2070 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ 2071 found = false; 2072 for (i = 0; names[i] != NULL && !found; i++) { 2073 cp = val = kern_getenv(names[i]); 2074 while (cp != NULL && *cp != '\0') { 2075 if ((cp2 = strchr(cp, ' ')) != NULL) 2076 *cp2 = '\0'; 2077 2078 n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 2079 if (n == 3 && bus == b && slot == s && func == f) { 2080 found = true; 2081 break; 2082 } 2083 2084 if (cp2 != NULL) 2085 *cp2++ = ' '; 2086 2087 cp = cp2; 2088 } 2089 freeenv(val); 2090 } 2091 return (found); 2092 } 2093 2094 void * 2095 vm_iommu_domain(struct vm *vm) 2096 { 2097 2098 return (vm->iommu); 2099 } 2100 2101 int 2102 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle) 2103 { 2104 int error; 2105 2106 vcpu_lock(vcpu); 2107 error = vcpu_set_state_locked(vcpu, newstate, from_idle); 2108 vcpu_unlock(vcpu); 2109 2110 return (error); 2111 } 2112 2113 enum vcpu_state 2114 vcpu_get_state(struct vcpu *vcpu, int *hostcpu) 2115 { 2116 enum vcpu_state state; 2117 2118 vcpu_lock(vcpu); 2119 state = vcpu->state; 2120 if (hostcpu != NULL) 2121 *hostcpu = vcpu->hostcpu; 2122 vcpu_unlock(vcpu); 2123 2124 return (state); 2125 } 2126 2127 int 2128 vm_activate_cpu(struct vcpu *vcpu) 2129 { 2130 struct vm *vm = vcpu->vm; 2131 2132 if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) 2133 return (EBUSY); 2134 2135 VMM_CTR0(vcpu, "activated"); 2136 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus); 2137 return (0); 2138 } 2139 2140 int 2141 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu) 2142 { 2143 if (vcpu == NULL) { 2144 vm->debug_cpus = vm->active_cpus; 2145 for (int i = 0; i < vm->maxcpus; i++) { 2146 if (CPU_ISSET(i, &vm->active_cpus)) 2147 vcpu_notify_event(vm_vcpu(vm, i), false); 2148 } 2149 } else { 2150 if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus)) 2151 return (EINVAL); 2152 2153 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); 2154 vcpu_notify_event(vcpu, false); 2155 } 2156 return (0); 2157 } 2158 2159 int 2160 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu) 2161 { 2162 2163 if (vcpu == NULL) { 2164 CPU_ZERO(&vm->debug_cpus); 2165 } else { 2166 if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus)) 2167 return (EINVAL); 2168 2169 CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus); 2170 } 2171 return (0); 2172 } 2173 2174 int 2175 vcpu_debugged(struct vcpu *vcpu) 2176 { 2177 2178 return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus)); 2179 } 2180 2181 cpuset_t 2182 vm_active_cpus(struct vm *vm) 2183 { 2184 2185 return (vm->active_cpus); 2186 } 2187 2188 cpuset_t 2189 vm_debug_cpus(struct vm *vm) 2190 { 2191 2192 return (vm->debug_cpus); 2193 } 2194 2195 cpuset_t 2196 vm_suspended_cpus(struct vm *vm) 2197 { 2198 2199 return (vm->suspended_cpus); 2200 } 2201 2202 /* 2203 * Returns the subset of vCPUs in tostart that are awaiting startup. 2204 * These vCPUs are also marked as no longer awaiting startup. 
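 *
 * The startup set is populated by vm_await_start() below; both helpers
 * serialize on the rendezvous mutex, so a vCPU added to the set is
 * handed back to a caller of this function at most once.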
2205 */ 2206 cpuset_t 2207 vm_start_cpus(struct vm *vm, const cpuset_t *tostart) 2208 { 2209 cpuset_t set; 2210 2211 mtx_lock(&vm->rendezvous_mtx); 2212 CPU_AND(&set, &vm->startup_cpus, tostart); 2213 CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set); 2214 mtx_unlock(&vm->rendezvous_mtx); 2215 return (set); 2216 } 2217 2218 void 2219 vm_await_start(struct vm *vm, const cpuset_t *waiting) 2220 { 2221 mtx_lock(&vm->rendezvous_mtx); 2222 CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting); 2223 mtx_unlock(&vm->rendezvous_mtx); 2224 } 2225 2226 void * 2227 vcpu_stats(struct vcpu *vcpu) 2228 { 2229 2230 return (vcpu->stats); 2231 } 2232 2233 int 2234 vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state) 2235 { 2236 *state = vcpu->x2apic_state; 2237 2238 return (0); 2239 } 2240 2241 int 2242 vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state) 2243 { 2244 if (state >= X2APIC_STATE_LAST) 2245 return (EINVAL); 2246 2247 vcpu->x2apic_state = state; 2248 2249 vlapic_set_x2apic_state(vcpu, state); 2250 2251 return (0); 2252 } 2253 2254 /* 2255 * This function is called to ensure that a vcpu "sees" a pending event 2256 * as soon as possible: 2257 * - If the vcpu thread is sleeping then it is woken up. 2258 * - If the vcpu is running on a different host_cpu then an IPI will be directed 2259 * to the host_cpu to cause the vcpu to trap into the hypervisor. 2260 */ 2261 static void 2262 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) 2263 { 2264 int hostcpu; 2265 2266 hostcpu = vcpu->hostcpu; 2267 if (vcpu->state == VCPU_RUNNING) { 2268 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 2269 if (hostcpu != curcpu) { 2270 if (lapic_intr) { 2271 vlapic_post_intr(vcpu->vlapic, hostcpu, 2272 vmm_ipinum); 2273 } else { 2274 ipi_cpu(hostcpu, vmm_ipinum); 2275 } 2276 } else { 2277 /* 2278 * If the 'vcpu' is running on 'curcpu' then it must 2279 * be sending a notification to itself (e.g. SELF_IPI). 2280 * The pending event will be picked up when the vcpu 2281 * transitions back to guest context. 2282 */ 2283 } 2284 } else { 2285 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 2286 "with hostcpu %d", vcpu->state, hostcpu)); 2287 if (vcpu->state == VCPU_SLEEPING) 2288 wakeup_one(vcpu); 2289 } 2290 } 2291 2292 void 2293 vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr) 2294 { 2295 vcpu_lock(vcpu); 2296 vcpu_notify_event_locked(vcpu, lapic_intr); 2297 vcpu_unlock(vcpu); 2298 } 2299 2300 struct vmspace * 2301 vm_vmspace(struct vm *vm) 2302 { 2303 return (vm->vmspace); 2304 } 2305 2306 struct vm_mem * 2307 vm_mem(struct vm *vm) 2308 { 2309 return (&vm->mem); 2310 } 2311 2312 int 2313 vm_apicid2vcpuid(struct vm *vm, int apicid) 2314 { 2315 /* 2316 * XXX apic id is assumed to be numerically identical to vcpu id 2317 */ 2318 return (apicid); 2319 } 2320 2321 int 2322 vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest, 2323 vm_rendezvous_func_t func, void *arg) 2324 { 2325 struct vm *vm = vcpu->vm; 2326 int error, i; 2327 2328 /* 2329 * Enforce that this function is called without any locks 2330 */ 2331 WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); 2332 2333 restart: 2334 mtx_lock(&vm->rendezvous_mtx); 2335 if (vm->rendezvous_func != NULL) { 2336 /* 2337 * If a rendezvous is already in progress then we need to 2338 * call the rendezvous handler in case this 'vcpu' is one 2339 * of the targets of the rendezvous. 
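 *
 * Once that rendezvous has been handled, the loop restarts and tries
 * again to install 'rendezvous_func' for this request.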
2340 */ 2341 VMM_CTR0(vcpu, "Rendezvous already in progress"); 2342 mtx_unlock(&vm->rendezvous_mtx); 2343 error = vm_handle_rendezvous(vcpu); 2344 if (error != 0) 2345 return (error); 2346 goto restart; 2347 } 2348 KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " 2349 "rendezvous is still in progress")); 2350 2351 VMM_CTR0(vcpu, "Initiating rendezvous"); 2352 vm->rendezvous_req_cpus = dest; 2353 CPU_ZERO(&vm->rendezvous_done_cpus); 2354 vm->rendezvous_arg = arg; 2355 vm->rendezvous_func = func; 2356 mtx_unlock(&vm->rendezvous_mtx); 2357 2358 /* 2359 * Wake up any sleeping vcpus and trigger a VM-exit in any running 2360 * vcpus so they handle the rendezvous as soon as possible. 2361 */ 2362 for (i = 0; i < vm->maxcpus; i++) { 2363 if (CPU_ISSET(i, &dest)) 2364 vcpu_notify_event(vm_vcpu(vm, i), false); 2365 } 2366 2367 return (vm_handle_rendezvous(vcpu)); 2368 } 2369 2370 struct vatpic * 2371 vm_atpic(struct vm *vm) 2372 { 2373 return (vm->vatpic); 2374 } 2375 2376 struct vatpit * 2377 vm_atpit(struct vm *vm) 2378 { 2379 return (vm->vatpit); 2380 } 2381 2382 struct vpmtmr * 2383 vm_pmtmr(struct vm *vm) 2384 { 2385 2386 return (vm->vpmtmr); 2387 } 2388 2389 struct vrtc * 2390 vm_rtc(struct vm *vm) 2391 { 2392 2393 return (vm->vrtc); 2394 } 2395 2396 enum vm_reg_name 2397 vm_segment_name(int seg) 2398 { 2399 static enum vm_reg_name seg_names[] = { 2400 VM_REG_GUEST_ES, 2401 VM_REG_GUEST_CS, 2402 VM_REG_GUEST_SS, 2403 VM_REG_GUEST_DS, 2404 VM_REG_GUEST_FS, 2405 VM_REG_GUEST_GS 2406 }; 2407 2408 KASSERT(seg >= 0 && seg < nitems(seg_names), 2409 ("%s: invalid segment encoding %d", __func__, seg)); 2410 return (seg_names[seg]); 2411 } 2412 2413 void 2414 vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo) 2415 { 2416 int idx; 2417 2418 for (idx = 0; idx < num_copyinfo; idx++) { 2419 if (copyinfo[idx].cookie != NULL) 2420 vm_gpa_release(copyinfo[idx].cookie); 2421 } 2422 bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); 2423 } 2424 2425 int 2426 vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, 2427 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 2428 int num_copyinfo, int *fault) 2429 { 2430 int error, idx, nused; 2431 size_t n, off, remaining; 2432 void *hva, *cookie; 2433 uint64_t gpa; 2434 2435 bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); 2436 2437 nused = 0; 2438 remaining = len; 2439 while (remaining > 0) { 2440 if (nused >= num_copyinfo) 2441 return (EFAULT); 2442 error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); 2443 if (error || *fault) 2444 return (error); 2445 off = gpa & PAGE_MASK; 2446 n = min(remaining, PAGE_SIZE - off); 2447 copyinfo[nused].gpa = gpa; 2448 copyinfo[nused].len = n; 2449 remaining -= n; 2450 gla += n; 2451 nused++; 2452 } 2453 2454 for (idx = 0; idx < nused; idx++) { 2455 hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa, 2456 copyinfo[idx].len, prot, &cookie); 2457 if (hva == NULL) 2458 break; 2459 copyinfo[idx].hva = hva; 2460 copyinfo[idx].cookie = cookie; 2461 } 2462 2463 if (idx != nused) { 2464 vm_copy_teardown(copyinfo, num_copyinfo); 2465 return (EFAULT); 2466 } else { 2467 *fault = 0; 2468 return (0); 2469 } 2470 } 2471 2472 void 2473 vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len) 2474 { 2475 char *dst; 2476 int idx; 2477 2478 dst = kaddr; 2479 idx = 0; 2480 while (len > 0) { 2481 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 2482 len -= copyinfo[idx].len; 2483 dst += copyinfo[idx].len; 2484 idx++; 2485 } 2486 } 2487 2488 void 2489 vm_copyout(const 
void *kaddr, struct vm_copyinfo *copyinfo, size_t len) 2490 { 2491 const char *src; 2492 int idx; 2493 2494 src = kaddr; 2495 idx = 0; 2496 while (len > 0) { 2497 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 2498 len -= copyinfo[idx].len; 2499 src += copyinfo[idx].len; 2500 idx++; 2501 } 2502 } 2503 2504 /* 2505 * Return the amount of in-use and wired memory for the VM. Since 2506 * these are global stats, only return the values for vCPU 0 2507 */ 2508 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 2509 VMM_STAT_DECLARE(VMM_MEM_WIRED); 2510 2511 static void 2512 vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat) 2513 { 2514 2515 if (vcpu->vcpuid == 0) { 2516 vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE * 2517 vmspace_resident_count(vcpu->vm->vmspace)); 2518 } 2519 } 2520 2521 static void 2522 vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat) 2523 { 2524 2525 if (vcpu->vcpuid == 0) { 2526 vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE * 2527 pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace))); 2528 } 2529 } 2530 2531 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 2532 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); 2533 2534 #ifdef BHYVE_SNAPSHOT 2535 static int 2536 vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) 2537 { 2538 uint64_t tsc, now; 2539 int ret; 2540 struct vcpu *vcpu; 2541 uint16_t i, maxcpus; 2542 2543 now = rdtsc(); 2544 maxcpus = vm_get_maxcpus(vm); 2545 for (i = 0; i < maxcpus; i++) { 2546 vcpu = vm->vcpu[i]; 2547 if (vcpu == NULL) 2548 continue; 2549 2550 SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); 2551 SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); 2552 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); 2553 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); 2554 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); 2555 SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); 2556 SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); 2557 SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); 2558 2559 /* 2560 * Save the absolute TSC value by adding now to tsc_offset. 2561 * 2562 * It will be turned back into an actual offset when the 2563 * TSC restore function is called 2564 */ 2565 tsc = now + vcpu->tsc_offset; 2566 SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done); 2567 if (meta->op == VM_SNAPSHOT_RESTORE) 2568 vcpu->tsc_offset = tsc; 2569 } 2570 2571 done: 2572 return (ret); 2573 } 2574 2575 static int 2576 vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) 2577 { 2578 int ret; 2579 2580 ret = vm_snapshot_vcpus(vm, meta); 2581 if (ret != 0) 2582 goto done; 2583 2584 SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done); 2585 done: 2586 return (ret); 2587 } 2588 2589 static int 2590 vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta) 2591 { 2592 int error; 2593 struct vcpu *vcpu; 2594 uint16_t i, maxcpus; 2595 2596 error = 0; 2597 2598 maxcpus = vm_get_maxcpus(vm); 2599 for (i = 0; i < maxcpus; i++) { 2600 vcpu = vm->vcpu[i]; 2601 if (vcpu == NULL) 2602 continue; 2603 2604 error = vmmops_vcpu_snapshot(vcpu->cookie, meta); 2605 if (error != 0) { 2606 printf("%s: failed to snapshot vmcs/vmcb data for " 2607 "vCPU: %d; error: %d\n", __func__, i, error); 2608 goto done; 2609 } 2610 } 2611 2612 done: 2613 return (error); 2614 } 2615 2616 /* 2617 * Save kernel-side structures to user-space for snapshotting.
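 *
 * The structure to save or restore is selected by 'meta->dev_req';
 * requests for an unknown structure type fail with EINVAL.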
2618 */ 2619 int 2620 vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) 2621 { 2622 int ret = 0; 2623 2624 switch (meta->dev_req) { 2625 case STRUCT_VMCX: 2626 ret = vm_snapshot_vcpu(vm, meta); 2627 break; 2628 case STRUCT_VM: 2629 ret = vm_snapshot_vm(vm, meta); 2630 break; 2631 case STRUCT_VIOAPIC: 2632 ret = vioapic_snapshot(vm_ioapic(vm), meta); 2633 break; 2634 case STRUCT_VLAPIC: 2635 ret = vlapic_snapshot(vm, meta); 2636 break; 2637 case STRUCT_VHPET: 2638 ret = vhpet_snapshot(vm_hpet(vm), meta); 2639 break; 2640 case STRUCT_VATPIC: 2641 ret = vatpic_snapshot(vm_atpic(vm), meta); 2642 break; 2643 case STRUCT_VATPIT: 2644 ret = vatpit_snapshot(vm_atpit(vm), meta); 2645 break; 2646 case STRUCT_VPMTMR: 2647 ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); 2648 break; 2649 case STRUCT_VRTC: 2650 ret = vrtc_snapshot(vm_rtc(vm), meta); 2651 break; 2652 default: 2653 printf("%s: failed to find the requested type %#x\n", 2654 __func__, meta->dev_req); 2655 ret = (EINVAL); 2656 } 2657 return (ret); 2658 } 2659 2660 void 2661 vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset) 2662 { 2663 vcpu->tsc_offset = offset; 2664 } 2665 2666 int 2667 vm_restore_time(struct vm *vm) 2668 { 2669 int error; 2670 uint64_t now; 2671 struct vcpu *vcpu; 2672 uint16_t i, maxcpus; 2673 2674 now = rdtsc(); 2675 2676 error = vhpet_restore_time(vm_hpet(vm)); 2677 if (error) 2678 return (error); 2679 2680 maxcpus = vm_get_maxcpus(vm); 2681 for (i = 0; i < maxcpus; i++) { 2682 vcpu = vm->vcpu[i]; 2683 if (vcpu == NULL) 2684 continue; 2685 2686 error = vmmops_restore_tsc(vcpu->cookie, 2687 vcpu->tsc_offset - now); 2688 if (error) 2689 return (error); 2690 } 2691 2692 return (0); 2693 } 2694 #endif 2695
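/*
 * Illustrative sketch (not compiled into the module): one way a caller,
 * for example an instruction-emulation path, might use the vm_copy_setup(),
 * vm_copyin() and vm_copy_teardown() helpers above to read a guest buffer.
 * The 'paging' structure, 'gla' and buffer size below are hypothetical
 * placeholders supplied by the caller.
 *
 *	struct vm_copyinfo copyinfo[2];
 *	char buf[16];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(buf), VM_PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && fault == 0) {
 *		vm_copyin(copyinfo, buf, sizeof(buf));
 *		vm_copy_teardown(copyinfo, nitems(copyinfo));
 *	}
 */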