1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_bhyve_snapshot.h" 35 36 #include <sys/param.h> 37 #include <sys/systm.h> 38 #include <sys/kernel.h> 39 #include <sys/module.h> 40 #include <sys/sysctl.h> 41 #include <sys/malloc.h> 42 #include <sys/pcpu.h> 43 #include <sys/lock.h> 44 #include <sys/mutex.h> 45 #include <sys/proc.h> 46 #include <sys/rwlock.h> 47 #include <sys/sched.h> 48 #include <sys/smp.h> 49 #include <sys/vnode.h> 50 51 #include <vm/vm.h> 52 #include <vm/vm_param.h> 53 #include <vm/vm_extern.h> 54 #include <vm/vm_object.h> 55 #include <vm/vm_page.h> 56 #include <vm/pmap.h> 57 #include <vm/vm_map.h> 58 #include <vm/vm_pager.h> 59 #include <vm/vm_kern.h> 60 #include <vm/vnode_pager.h> 61 #include <vm/swap_pager.h> 62 #include <vm/uma.h> 63 64 #include <machine/cpu.h> 65 #include <machine/pcb.h> 66 #include <machine/smp.h> 67 #include <machine/md_var.h> 68 #include <x86/psl.h> 69 #include <x86/apicreg.h> 70 #include <x86/ifunc.h> 71 72 #include <machine/vmm.h> 73 #include <machine/vmm_dev.h> 74 #include <machine/vmm_instruction_emul.h> 75 #include <machine/vmm_snapshot.h> 76 77 #include "vmm_ioport.h" 78 #include "vmm_ktr.h" 79 #include "vmm_host.h" 80 #include "vmm_mem.h" 81 #include "vmm_util.h" 82 #include "vatpic.h" 83 #include "vatpit.h" 84 #include "vhpet.h" 85 #include "vioapic.h" 86 #include "vlapic.h" 87 #include "vpmtmr.h" 88 #include "vrtc.h" 89 #include "vmm_stat.h" 90 #include "vmm_lapic.h" 91 92 #include "io/ppt.h" 93 #include "io/iommu.h" 94 95 struct vlapic; 96 97 /* 98 * Initialization: 99 * (a) allocated when vcpu is created 100 * (i) initialized when vcpu is created and when it is reinitialized 101 * (o) initialized the first time the vcpu is created 102 * (x) initialized before use 103 */ 104 struct vcpu { 105 struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ 106 enum vcpu_state state; /* (o) vcpu state */ 107 int hostcpu; /* (o) vcpu's host cpu */ 108 int reqidle; /* (i) request vcpu to idle */ 109 struct vlapic *vlapic; /* (i) APIC device model */ 110 enum x2apic_state x2apic_state; /* (i) APIC mode */ 
111 uint64_t exitintinfo; /* (i) events pending at VM exit */ 112 int nmi_pending; /* (i) NMI pending */ 113 int extint_pending; /* (i) INTR pending */ 114 int exception_pending; /* (i) exception pending */ 115 int exc_vector; /* (x) exception collateral */ 116 int exc_errcode_valid; 117 uint32_t exc_errcode; 118 struct savefpu *guestfpu; /* (a,i) guest fpu state */ 119 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 120 void *stats; /* (a,i) statistics */ 121 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 122 uint64_t nextrip; /* (x) next instruction to execute */ 123 uint64_t tsc_offset; /* (o) TSC offsetting */ 124 }; 125 126 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) 127 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) 128 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) 129 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) 130 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) 131 132 struct mem_seg { 133 size_t len; 134 bool sysmem; 135 struct vm_object *object; 136 }; 137 #define VM_MAX_MEMSEGS 3 138 139 struct mem_map { 140 vm_paddr_t gpa; 141 size_t len; 142 vm_ooffset_t segoff; 143 int segid; 144 int prot; 145 int flags; 146 }; 147 #define VM_MAX_MEMMAPS 8 148 149 /* 150 * Initialization: 151 * (o) initialized the first time the VM is created 152 * (i) initialized when VM is created and when it is reinitialized 153 * (x) initialized before use 154 */ 155 struct vm { 156 void *cookie; /* (i) cpu-specific data */ 157 void *iommu; /* (x) iommu-specific data */ 158 struct vhpet *vhpet; /* (i) virtual HPET */ 159 struct vioapic *vioapic; /* (i) virtual ioapic */ 160 struct vatpic *vatpic; /* (i) virtual atpic */ 161 struct vatpit *vatpit; /* (i) virtual atpit */ 162 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 163 struct vrtc *vrtc; /* (o) virtual RTC */ 164 volatile cpuset_t active_cpus; /* (i) active vcpus */ 165 volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ 166 int suspend; /* (i) stop VM execution */ 167 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 168 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 169 cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ 170 cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ 171 void *rendezvous_arg; /* (x) rendezvous func/arg */ 172 vm_rendezvous_func_t rendezvous_func; 173 struct mtx rendezvous_mtx; /* (o) rendezvous lock */ 174 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 175 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 176 struct vmspace *vmspace; /* (o) guest's address space */ 177 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 178 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 179 /* The following describe the vm cpu topology */ 180 uint16_t sockets; /* (o) num of sockets */ 181 uint16_t cores; /* (o) num of cores/socket */ 182 uint16_t threads; /* (o) num of threads/core */ 183 uint16_t maxcpus; /* (o) max pluggable cpus */ 184 }; 185 186 static int vmm_initialized; 187 188 static void vmmops_panic(void); 189 190 static void 191 vmmops_panic(void) 192 { 193 panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()"); 194 } 195 196 #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args) \ 197 DEFINE_IFUNC(static, ret_type, vmmops_##opname, args) \ 198 { \ 199 if (vmm_is_intel()) \ 200 return (vmm_ops_intel.opname); \ 201 else if (vmm_is_svm()) \ 202 return (vmm_ops_amd.opname); \ 203 else \ 204 return ((ret_type 
(*)args)vmmops_panic); \ 205 } 206 207 DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum)) 208 DEFINE_VMMOPS_IFUNC(int, modcleanup, (void)) 209 DEFINE_VMMOPS_IFUNC(void, modresume, (void)) 210 DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap)) 211 DEFINE_VMMOPS_IFUNC(int, run, (void *vmi, int vcpu, register_t rip, 212 struct pmap *pmap, struct vm_eventinfo *info)) 213 DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi)) 214 DEFINE_VMMOPS_IFUNC(int, getreg, (void *vmi, int vcpu, int num, 215 uint64_t *retval)) 216 DEFINE_VMMOPS_IFUNC(int, setreg, (void *vmi, int vcpu, int num, 217 uint64_t val)) 218 DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vmi, int vcpu, int num, 219 struct seg_desc *desc)) 220 DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vmi, int vcpu, int num, 221 struct seg_desc *desc)) 222 DEFINE_VMMOPS_IFUNC(int, getcap, (void *vmi, int vcpu, int num, int *retval)) 223 DEFINE_VMMOPS_IFUNC(int, setcap, (void *vmi, int vcpu, int num, int val)) 224 DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min, 225 vm_offset_t max)) 226 DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace)) 227 DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vmi, int vcpu)) 228 DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (void *vmi, struct vlapic *vlapic)) 229 #ifdef BHYVE_SNAPSHOT 230 DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta 231 *meta)) 232 DEFINE_VMMOPS_IFUNC(int, vmcx_snapshot, (void *vmi, struct vm_snapshot_meta 233 *meta, int vcpu)) 234 DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vmi, int vcpuid, uint64_t now)) 235 #endif 236 237 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 238 #define fpu_stop_emulating() clts() 239 240 SDT_PROVIDER_DEFINE(vmm); 241 242 static MALLOC_DEFINE(M_VM, "vm", "vm"); 243 244 /* statistics */ 245 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); 246 247 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 248 NULL); 249 250 /* 251 * Halt the guest if all vcpus are executing a HLT instruction with 252 * interrupts disabled. 
253 */ 254 static int halt_detection_enabled = 1; 255 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, 256 &halt_detection_enabled, 0, 257 "Halt VM if all vcpus execute HLT with interrupts disabled"); 258 259 static int vmm_ipinum; 260 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, 261 "IPI vector used for vcpu notifications"); 262 263 static int trace_guest_exceptions; 264 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, 265 &trace_guest_exceptions, 0, 266 "Trap into hypervisor on all guest exceptions and reflect them back"); 267 268 static void vm_free_memmap(struct vm *vm, int ident); 269 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 270 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); 271 272 #ifdef KTR 273 static const char * 274 vcpu_state2str(enum vcpu_state state) 275 { 276 277 switch (state) { 278 case VCPU_IDLE: 279 return ("idle"); 280 case VCPU_FROZEN: 281 return ("frozen"); 282 case VCPU_RUNNING: 283 return ("running"); 284 case VCPU_SLEEPING: 285 return ("sleeping"); 286 default: 287 return ("unknown"); 288 } 289 } 290 #endif 291 292 static void 293 vcpu_cleanup(struct vm *vm, int i, bool destroy) 294 { 295 struct vcpu *vcpu = &vm->vcpu[i]; 296 297 vmmops_vlapic_cleanup(vm->cookie, vcpu->vlapic); 298 if (destroy) { 299 vmm_stat_free(vcpu->stats); 300 fpu_save_area_free(vcpu->guestfpu); 301 } 302 } 303 304 static void 305 vcpu_init(struct vm *vm, int vcpu_id, bool create) 306 { 307 struct vcpu *vcpu; 308 309 KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, 310 ("vcpu_init: invalid vcpu %d", vcpu_id)); 311 312 vcpu = &vm->vcpu[vcpu_id]; 313 314 if (create) { 315 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " 316 "initialized", vcpu_id)); 317 vcpu_lock_init(vcpu); 318 vcpu->state = VCPU_IDLE; 319 vcpu->hostcpu = NOCPU; 320 vcpu->guestfpu = fpu_save_area_alloc(); 321 vcpu->stats = vmm_stat_alloc(); 322 vcpu->tsc_offset = 0; 323 } 324 325 vcpu->vlapic = vmmops_vlapic_init(vm->cookie, vcpu_id); 326 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); 327 vcpu->reqidle = 0; 328 vcpu->exitintinfo = 0; 329 vcpu->nmi_pending = 0; 330 vcpu->extint_pending = 0; 331 vcpu->exception_pending = 0; 332 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 333 fpu_save_area_reset(vcpu->guestfpu); 334 vmm_stat_init(vcpu->stats); 335 } 336 337 int 338 vcpu_trace_exceptions(struct vm *vm, int vcpuid) 339 { 340 341 return (trace_guest_exceptions); 342 } 343 344 struct vm_exit * 345 vm_exitinfo(struct vm *vm, int cpuid) 346 { 347 struct vcpu *vcpu; 348 349 if (cpuid < 0 || cpuid >= vm->maxcpus) 350 panic("vm_exitinfo: invalid cpuid %d", cpuid); 351 352 vcpu = &vm->vcpu[cpuid]; 353 354 return (&vcpu->exitinfo); 355 } 356 357 static int 358 vmm_init(void) 359 { 360 int error; 361 362 if (!vmm_is_hw_supported()) 363 return (ENXIO); 364 365 vmm_host_state_init(); 366 367 vmm_ipinum = lapic_ipi_alloc(pti ? 
&IDTVEC(justreturn1_pti) : 368 &IDTVEC(justreturn)); 369 if (vmm_ipinum < 0) 370 vmm_ipinum = IPI_AST; 371 372 error = vmm_mem_init(); 373 if (error) 374 return (error); 375 376 vmm_resume_p = vmmops_modresume; 377 378 return (vmmops_modinit(vmm_ipinum)); 379 } 380 381 static int 382 vmm_handler(module_t mod, int what, void *arg) 383 { 384 int error; 385 386 switch (what) { 387 case MOD_LOAD: 388 if (vmm_is_hw_supported()) { 389 vmmdev_init(); 390 error = vmm_init(); 391 if (error == 0) 392 vmm_initialized = 1; 393 } else { 394 error = ENXIO; 395 } 396 break; 397 case MOD_UNLOAD: 398 if (vmm_is_hw_supported()) { 399 error = vmmdev_cleanup(); 400 if (error == 0) { 401 vmm_resume_p = NULL; 402 iommu_cleanup(); 403 if (vmm_ipinum != IPI_AST) 404 lapic_ipi_free(vmm_ipinum); 405 error = vmmops_modcleanup(); 406 /* 407 * Something bad happened - prevent new 408 * VMs from being created 409 */ 410 if (error) 411 vmm_initialized = 0; 412 } 413 } else { 414 error = 0; 415 } 416 break; 417 default: 418 error = 0; 419 break; 420 } 421 return (error); 422 } 423 424 static moduledata_t vmm_kmod = { 425 "vmm", 426 vmm_handler, 427 NULL 428 }; 429 430 /* 431 * vmm initialization has the following dependencies: 432 * 433 * - VT-x initialization requires smp_rendezvous() and therefore must happen 434 * after SMP is fully functional (after SI_SUB_SMP). 435 */ 436 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); 437 MODULE_VERSION(vmm, 1); 438 439 static void 440 vm_init(struct vm *vm, bool create) 441 { 442 int i; 443 444 vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace)); 445 vm->iommu = NULL; 446 vm->vioapic = vioapic_init(vm); 447 vm->vhpet = vhpet_init(vm); 448 vm->vatpic = vatpic_init(vm); 449 vm->vatpit = vatpit_init(vm); 450 vm->vpmtmr = vpmtmr_init(vm); 451 if (create) 452 vm->vrtc = vrtc_init(vm); 453 454 CPU_ZERO(&vm->active_cpus); 455 CPU_ZERO(&vm->debug_cpus); 456 457 vm->suspend = 0; 458 CPU_ZERO(&vm->suspended_cpus); 459 460 for (i = 0; i < vm->maxcpus; i++) 461 vcpu_init(vm, i, create); 462 } 463 464 /* 465 * The default CPU topology is a single thread per package. 466 */ 467 u_int cores_per_package = 1; 468 u_int threads_per_core = 1; 469 470 int 471 vm_create(const char *name, struct vm **retvm) 472 { 473 struct vm *vm; 474 struct vmspace *vmspace; 475 476 /* 477 * If vmm.ko could not be successfully initialized then don't attempt 478 * to create the virtual machine. 
479 */ 480 if (!vmm_initialized) 481 return (ENXIO); 482 483 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) 484 return (EINVAL); 485 486 vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48); 487 if (vmspace == NULL) 488 return (ENOMEM); 489 490 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 491 strcpy(vm->name, name); 492 vm->vmspace = vmspace; 493 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); 494 495 vm->sockets = 1; 496 vm->cores = cores_per_package; /* XXX backwards compatibility */ 497 vm->threads = threads_per_core; /* XXX backwards compatibility */ 498 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 499 500 vm_init(vm, true); 501 502 *retvm = vm; 503 return (0); 504 } 505 506 void 507 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, 508 uint16_t *threads, uint16_t *maxcpus) 509 { 510 *sockets = vm->sockets; 511 *cores = vm->cores; 512 *threads = vm->threads; 513 *maxcpus = vm->maxcpus; 514 } 515 516 uint16_t 517 vm_get_maxcpus(struct vm *vm) 518 { 519 return (vm->maxcpus); 520 } 521 522 int 523 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, 524 uint16_t threads, uint16_t maxcpus) 525 { 526 if (maxcpus != 0) 527 return (EINVAL); /* XXX remove when supported */ 528 if ((sockets * cores * threads) > vm->maxcpus) 529 return (EINVAL); 530 /* XXX need to check sockets * cores * threads == vCPU, how? */ 531 vm->sockets = sockets; 532 vm->cores = cores; 533 vm->threads = threads; 534 vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ 535 return(0); 536 } 537 538 static void 539 vm_cleanup(struct vm *vm, bool destroy) 540 { 541 struct mem_map *mm; 542 int i; 543 544 ppt_unassign_all(vm); 545 546 if (vm->iommu != NULL) 547 iommu_destroy_domain(vm->iommu); 548 549 if (destroy) 550 vrtc_cleanup(vm->vrtc); 551 else 552 vrtc_reset(vm->vrtc); 553 vpmtmr_cleanup(vm->vpmtmr); 554 vatpit_cleanup(vm->vatpit); 555 vhpet_cleanup(vm->vhpet); 556 vatpic_cleanup(vm->vatpic); 557 vioapic_cleanup(vm->vioapic); 558 559 for (i = 0; i < vm->maxcpus; i++) 560 vcpu_cleanup(vm, i, destroy); 561 562 vmmops_cleanup(vm->cookie); 563 564 /* 565 * System memory is removed from the guest address space only when 566 * the VM is destroyed. This is because the mapping remains the same 567 * across VM reset. 568 * 569 * Device memory can be relocated by the guest (e.g. using PCI BARs) 570 * so those mappings are removed on a VM reset. 571 */ 572 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 573 mm = &vm->mem_maps[i]; 574 if (destroy || !sysmem_mapping(vm, mm)) 575 vm_free_memmap(vm, i); 576 } 577 578 if (destroy) { 579 for (i = 0; i < VM_MAX_MEMSEGS; i++) 580 vm_free_memseg(vm, i); 581 582 vmmops_vmspace_free(vm->vmspace); 583 vm->vmspace = NULL; 584 } 585 } 586 587 void 588 vm_destroy(struct vm *vm) 589 { 590 vm_cleanup(vm, true); 591 free(vm, M_VM); 592 } 593 594 int 595 vm_reinit(struct vm *vm) 596 { 597 int error; 598 599 /* 600 * A virtual machine can be reset only if all vcpus are suspended. 
601 */ 602 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 603 vm_cleanup(vm, false); 604 vm_init(vm, false); 605 error = 0; 606 } else { 607 error = EBUSY; 608 } 609 610 return (error); 611 } 612 613 const char * 614 vm_name(struct vm *vm) 615 { 616 return (vm->name); 617 } 618 619 int 620 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 621 { 622 vm_object_t obj; 623 624 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 625 return (ENOMEM); 626 else 627 return (0); 628 } 629 630 int 631 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 632 { 633 634 vmm_mmio_free(vm->vmspace, gpa, len); 635 return (0); 636 } 637 638 /* 639 * Return 'true' if 'gpa' is allocated in the guest address space. 640 * 641 * This function is called in the context of a running vcpu which acts as 642 * an implicit lock on 'vm->mem_maps[]'. 643 */ 644 bool 645 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 646 { 647 struct mem_map *mm; 648 int i; 649 650 #ifdef INVARIANTS 651 int hostcpu, state; 652 state = vcpu_get_state(vm, vcpuid, &hostcpu); 653 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 654 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 655 #endif 656 657 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 658 mm = &vm->mem_maps[i]; 659 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 660 return (true); /* 'gpa' is sysmem or devmem */ 661 } 662 663 if (ppt_is_mmio(vm, gpa)) 664 return (true); /* 'gpa' is pci passthru mmio */ 665 666 return (false); 667 } 668 669 int 670 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 671 { 672 struct mem_seg *seg; 673 vm_object_t obj; 674 675 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 676 return (EINVAL); 677 678 if (len == 0 || (len & PAGE_MASK)) 679 return (EINVAL); 680 681 seg = &vm->mem_segs[ident]; 682 if (seg->object != NULL) { 683 if (seg->len == len && seg->sysmem == sysmem) 684 return (EEXIST); 685 else 686 return (EINVAL); 687 } 688 689 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); 690 if (obj == NULL) 691 return (ENOMEM); 692 693 seg->len = len; 694 seg->object = obj; 695 seg->sysmem = sysmem; 696 return (0); 697 } 698 699 int 700 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 701 vm_object_t *objptr) 702 { 703 struct mem_seg *seg; 704 705 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 706 return (EINVAL); 707 708 seg = &vm->mem_segs[ident]; 709 if (len) 710 *len = seg->len; 711 if (sysmem) 712 *sysmem = seg->sysmem; 713 if (objptr) 714 *objptr = seg->object; 715 return (0); 716 } 717 718 void 719 vm_free_memseg(struct vm *vm, int ident) 720 { 721 struct mem_seg *seg; 722 723 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 724 ("%s: invalid memseg ident %d", __func__, ident)); 725 726 seg = &vm->mem_segs[ident]; 727 if (seg->object != NULL) { 728 vm_object_deallocate(seg->object); 729 bzero(seg, sizeof(struct mem_seg)); 730 } 731 } 732 733 int 734 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 735 size_t len, int prot, int flags) 736 { 737 struct mem_seg *seg; 738 struct mem_map *m, *map; 739 vm_ooffset_t last; 740 int i, error; 741 742 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) 743 return (EINVAL); 744 745 if (flags & ~VM_MEMMAP_F_WIRED) 746 return (EINVAL); 747 748 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 749 return (EINVAL); 750 751 seg = &vm->mem_segs[segid]; 752 if (seg->object == NULL) 753 return (EINVAL); 754 755 last = first + len; 756 if (first < 0 || first >= last || last > seg->len) 757 
return (EINVAL); 758 759 if ((gpa | first | last) & PAGE_MASK) 760 return (EINVAL); 761 762 map = NULL; 763 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 764 m = &vm->mem_maps[i]; 765 if (m->len == 0) { 766 map = m; 767 break; 768 } 769 } 770 771 if (map == NULL) 772 return (ENOSPC); 773 774 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, 775 len, 0, VMFS_NO_SPACE, prot, prot, 0); 776 if (error != KERN_SUCCESS) 777 return (EFAULT); 778 779 vm_object_reference(seg->object); 780 781 if (flags & VM_MEMMAP_F_WIRED) { 782 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, 783 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 784 if (error != KERN_SUCCESS) { 785 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); 786 return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM : 787 EFAULT); 788 } 789 } 790 791 map->gpa = gpa; 792 map->len = len; 793 map->segoff = first; 794 map->segid = segid; 795 map->prot = prot; 796 map->flags = flags; 797 return (0); 798 } 799 800 int 801 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 802 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 803 { 804 struct mem_map *mm, *mmnext; 805 int i; 806 807 mmnext = NULL; 808 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 809 mm = &vm->mem_maps[i]; 810 if (mm->len == 0 || mm->gpa < *gpa) 811 continue; 812 if (mmnext == NULL || mm->gpa < mmnext->gpa) 813 mmnext = mm; 814 } 815 816 if (mmnext != NULL) { 817 *gpa = mmnext->gpa; 818 if (segid) 819 *segid = mmnext->segid; 820 if (segoff) 821 *segoff = mmnext->segoff; 822 if (len) 823 *len = mmnext->len; 824 if (prot) 825 *prot = mmnext->prot; 826 if (flags) 827 *flags = mmnext->flags; 828 return (0); 829 } else { 830 return (ENOENT); 831 } 832 } 833 834 static void 835 vm_free_memmap(struct vm *vm, int ident) 836 { 837 struct mem_map *mm; 838 int error; 839 840 mm = &vm->mem_maps[ident]; 841 if (mm->len) { 842 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, 843 mm->gpa + mm->len); 844 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", 845 __func__, error)); 846 bzero(mm, sizeof(struct mem_map)); 847 } 848 } 849 850 static __inline bool 851 sysmem_mapping(struct vm *vm, struct mem_map *mm) 852 { 853 854 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 855 return (true); 856 else 857 return (false); 858 } 859 860 vm_paddr_t 861 vmm_sysmem_maxaddr(struct vm *vm) 862 { 863 struct mem_map *mm; 864 vm_paddr_t maxaddr; 865 int i; 866 867 maxaddr = 0; 868 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 869 mm = &vm->mem_maps[i]; 870 if (sysmem_mapping(vm, mm)) { 871 if (maxaddr < mm->gpa + mm->len) 872 maxaddr = mm->gpa + mm->len; 873 } 874 } 875 return (maxaddr); 876 } 877 878 static void 879 vm_iommu_modify(struct vm *vm, bool map) 880 { 881 int i, sz; 882 vm_paddr_t gpa, hpa; 883 struct mem_map *mm; 884 void *vp, *cookie, *host_domain; 885 886 sz = PAGE_SIZE; 887 host_domain = iommu_host_domain(); 888 889 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 890 mm = &vm->mem_maps[i]; 891 if (!sysmem_mapping(vm, mm)) 892 continue; 893 894 if (map) { 895 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 896 ("iommu map found invalid memmap %#lx/%#lx/%#x", 897 mm->gpa, mm->len, mm->flags)); 898 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 899 continue; 900 mm->flags |= VM_MEMMAP_F_IOMMU; 901 } else { 902 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 903 continue; 904 mm->flags &= ~VM_MEMMAP_F_IOMMU; 905 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 906 ("iommu unmap found invalid memmap %#lx/%#lx/%#x", 907 mm->gpa, mm->len, mm->flags)); 908 } 909 910 gpa = mm->gpa; 911 while (gpa < 
mm->gpa + mm->len) { 912 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, 913 &cookie); 914 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", 915 vm_name(vm), gpa)); 916 917 vm_gpa_release(cookie); 918 919 hpa = DMAP_TO_PHYS((uintptr_t)vp); 920 if (map) { 921 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 922 iommu_remove_mapping(host_domain, hpa, sz); 923 } else { 924 iommu_remove_mapping(vm->iommu, gpa, sz); 925 iommu_create_mapping(host_domain, hpa, hpa, sz); 926 } 927 928 gpa += PAGE_SIZE; 929 } 930 } 931 932 /* 933 * Invalidate the cached translations associated with the domain 934 * from which pages were removed. 935 */ 936 if (map) 937 iommu_invalidate_tlb(host_domain); 938 else 939 iommu_invalidate_tlb(vm->iommu); 940 } 941 942 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), false) 943 #define vm_iommu_map(vm) vm_iommu_modify((vm), true) 944 945 int 946 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) 947 { 948 int error; 949 950 error = ppt_unassign_device(vm, bus, slot, func); 951 if (error) 952 return (error); 953 954 if (ppt_assigned_devices(vm) == 0) 955 vm_iommu_unmap(vm); 956 957 return (0); 958 } 959 960 int 961 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) 962 { 963 int error; 964 vm_paddr_t maxaddr; 965 966 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 967 if (ppt_assigned_devices(vm) == 0) { 968 KASSERT(vm->iommu == NULL, 969 ("vm_assign_pptdev: iommu must be NULL")); 970 maxaddr = vmm_sysmem_maxaddr(vm); 971 vm->iommu = iommu_create_domain(maxaddr); 972 if (vm->iommu == NULL) 973 return (ENXIO); 974 vm_iommu_map(vm); 975 } 976 977 error = ppt_assign_device(vm, bus, slot, func); 978 return (error); 979 } 980 981 void * 982 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, 983 void **cookie) 984 { 985 int i, count, pageoff; 986 struct mem_map *mm; 987 vm_page_t m; 988 #ifdef INVARIANTS 989 /* 990 * All vcpus are frozen by ioctls that modify the memory map 991 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is 992 * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 
993 */ 994 int state; 995 KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", 996 __func__, vcpuid)); 997 for (i = 0; i < vm->maxcpus; i++) { 998 if (vcpuid != -1 && vcpuid != i) 999 continue; 1000 state = vcpu_get_state(vm, i, NULL); 1001 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", 1002 __func__, state)); 1003 } 1004 #endif 1005 pageoff = gpa & PAGE_MASK; 1006 if (len > PAGE_SIZE - pageoff) 1007 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); 1008 1009 count = 0; 1010 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 1011 mm = &vm->mem_maps[i]; 1012 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) { 1013 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, 1014 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); 1015 break; 1016 } 1017 } 1018 1019 if (count == 1) { 1020 *cookie = m; 1021 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); 1022 } else { 1023 *cookie = NULL; 1024 return (NULL); 1025 } 1026 } 1027 1028 void 1029 vm_gpa_release(void *cookie) 1030 { 1031 vm_page_t m = cookie; 1032 1033 vm_page_unwire(m, PQ_ACTIVE); 1034 } 1035 1036 int 1037 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 1038 { 1039 1040 if (vcpu < 0 || vcpu >= vm->maxcpus) 1041 return (EINVAL); 1042 1043 if (reg >= VM_REG_LAST) 1044 return (EINVAL); 1045 1046 return (vmmops_getreg(vm->cookie, vcpu, reg, retval)); 1047 } 1048 1049 int 1050 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 1051 { 1052 struct vcpu *vcpu; 1053 int error; 1054 1055 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1056 return (EINVAL); 1057 1058 if (reg >= VM_REG_LAST) 1059 return (EINVAL); 1060 1061 error = vmmops_setreg(vm->cookie, vcpuid, reg, val); 1062 if (error || reg != VM_REG_GUEST_RIP) 1063 return (error); 1064 1065 /* Set 'nextrip' to match the value of %rip */ 1066 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); 1067 vcpu = &vm->vcpu[vcpuid]; 1068 vcpu->nextrip = val; 1069 return (0); 1070 } 1071 1072 static bool 1073 is_descriptor_table(int reg) 1074 { 1075 1076 switch (reg) { 1077 case VM_REG_GUEST_IDTR: 1078 case VM_REG_GUEST_GDTR: 1079 return (true); 1080 default: 1081 return (false); 1082 } 1083 } 1084 1085 static bool 1086 is_segment_register(int reg) 1087 { 1088 1089 switch (reg) { 1090 case VM_REG_GUEST_ES: 1091 case VM_REG_GUEST_CS: 1092 case VM_REG_GUEST_SS: 1093 case VM_REG_GUEST_DS: 1094 case VM_REG_GUEST_FS: 1095 case VM_REG_GUEST_GS: 1096 case VM_REG_GUEST_TR: 1097 case VM_REG_GUEST_LDTR: 1098 return (true); 1099 default: 1100 return (false); 1101 } 1102 } 1103 1104 int 1105 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, 1106 struct seg_desc *desc) 1107 { 1108 1109 if (vcpu < 0 || vcpu >= vm->maxcpus) 1110 return (EINVAL); 1111 1112 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1113 return (EINVAL); 1114 1115 return (vmmops_getdesc(vm->cookie, vcpu, reg, desc)); 1116 } 1117 1118 int 1119 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, 1120 struct seg_desc *desc) 1121 { 1122 if (vcpu < 0 || vcpu >= vm->maxcpus) 1123 return (EINVAL); 1124 1125 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1126 return (EINVAL); 1127 1128 return (vmmops_setdesc(vm->cookie, vcpu, reg, desc)); 1129 } 1130 1131 static void 1132 restore_guest_fpustate(struct vcpu *vcpu) 1133 { 1134 1135 /* flush host state to the pcb */ 1136 fpuexit(curthread); 1137 1138 /* restore guest FPU state */ 1139 fpu_stop_emulating(); 1140 fpurestore(vcpu->guestfpu); 1141 1142 /* restore guest XCR0 if XSAVE is enabled in the host */ 1143 if 
(rcr4() & CR4_XSAVE) 1144 load_xcr(0, vcpu->guest_xcr0); 1145 1146 /* 1147 * The FPU is now "dirty" with the guest's state so turn on emulation 1148 * to trap any access to the FPU by the host. 1149 */ 1150 fpu_start_emulating(); 1151 } 1152 1153 static void 1154 save_guest_fpustate(struct vcpu *vcpu) 1155 { 1156 1157 if ((rcr0() & CR0_TS) == 0) 1158 panic("fpu emulation not enabled in host!"); 1159 1160 /* save guest XCR0 and restore host XCR0 */ 1161 if (rcr4() & CR4_XSAVE) { 1162 vcpu->guest_xcr0 = rxcr(0); 1163 load_xcr(0, vmm_get_host_xcr0()); 1164 } 1165 1166 /* save guest FPU state */ 1167 fpu_stop_emulating(); 1168 fpusave(vcpu->guestfpu); 1169 fpu_start_emulating(); 1170 } 1171 1172 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 1173 1174 static int 1175 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1176 bool from_idle) 1177 { 1178 struct vcpu *vcpu; 1179 int error; 1180 1181 vcpu = &vm->vcpu[vcpuid]; 1182 vcpu_assert_locked(vcpu); 1183 1184 /* 1185 * State transitions from the vmmdev_ioctl() must always begin from 1186 * the VCPU_IDLE state. This guarantees that there is only a single 1187 * ioctl() operating on a vcpu at any point. 1188 */ 1189 if (from_idle) { 1190 while (vcpu->state != VCPU_IDLE) { 1191 vcpu->reqidle = 1; 1192 vcpu_notify_event_locked(vcpu, false); 1193 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " 1194 "idle requested", vcpu_state2str(vcpu->state)); 1195 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); 1196 } 1197 } else { 1198 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1199 "vcpu idle state")); 1200 } 1201 1202 if (vcpu->state == VCPU_RUNNING) { 1203 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1204 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1205 } else { 1206 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1207 "vcpu that is not running", vcpu->hostcpu)); 1208 } 1209 1210 /* 1211 * The following state transitions are allowed: 1212 * IDLE -> FROZEN -> IDLE 1213 * FROZEN -> RUNNING -> FROZEN 1214 * FROZEN -> SLEEPING -> FROZEN 1215 */ 1216 switch (vcpu->state) { 1217 case VCPU_IDLE: 1218 case VCPU_RUNNING: 1219 case VCPU_SLEEPING: 1220 error = (newstate != VCPU_FROZEN); 1221 break; 1222 case VCPU_FROZEN: 1223 error = (newstate == VCPU_FROZEN); 1224 break; 1225 default: 1226 error = 1; 1227 break; 1228 } 1229 1230 if (error) 1231 return (EBUSY); 1232 1233 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", 1234 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1235 1236 vcpu->state = newstate; 1237 if (newstate == VCPU_RUNNING) 1238 vcpu->hostcpu = curcpu; 1239 else 1240 vcpu->hostcpu = NOCPU; 1241 1242 if (newstate == VCPU_IDLE) 1243 wakeup(&vcpu->state); 1244 1245 return (0); 1246 } 1247 1248 static void 1249 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1250 { 1251 int error; 1252 1253 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1254 panic("Error %d setting state to %d\n", error, newstate); 1255 } 1256 1257 static void 1258 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1259 { 1260 int error; 1261 1262 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1263 panic("Error %d setting state to %d", error, newstate); 1264 } 1265 1266 #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ 1267 do { \ 1268 if (vcpuid >= 0) \ 1269 VCPU_CTR0(vm, vcpuid, fmt); \ 1270 else \ 1271 VM_CTR0(vm, fmt); \ 1272 } while (0) 1273 1274 static int 1275 
vm_handle_rendezvous(struct vm *vm, int vcpuid) 1276 { 1277 struct thread *td; 1278 int error; 1279 1280 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus), 1281 ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); 1282 1283 error = 0; 1284 td = curthread; 1285 mtx_lock(&vm->rendezvous_mtx); 1286 while (vm->rendezvous_func != NULL) { 1287 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ 1288 CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); 1289 1290 if (vcpuid != -1 && 1291 CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && 1292 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { 1293 VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); 1294 (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); 1295 CPU_SET(vcpuid, &vm->rendezvous_done_cpus); 1296 } 1297 if (CPU_CMP(&vm->rendezvous_req_cpus, 1298 &vm->rendezvous_done_cpus) == 0) { 1299 VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); 1300 vm->rendezvous_func = NULL; 1301 wakeup(&vm->rendezvous_func); 1302 break; 1303 } 1304 RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); 1305 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, 1306 "vmrndv", hz); 1307 if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { 1308 mtx_unlock(&vm->rendezvous_mtx); 1309 error = thread_check_susp(td, true); 1310 if (error != 0) 1311 return (error); 1312 mtx_lock(&vm->rendezvous_mtx); 1313 } 1314 } 1315 mtx_unlock(&vm->rendezvous_mtx); 1316 return (0); 1317 } 1318 1319 /* 1320 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1321 */ 1322 static int 1323 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) 1324 { 1325 struct vcpu *vcpu; 1326 const char *wmesg; 1327 struct thread *td; 1328 int error, t, vcpu_halted, vm_halted; 1329 1330 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1331 1332 vcpu = &vm->vcpu[vcpuid]; 1333 vcpu_halted = 0; 1334 vm_halted = 0; 1335 error = 0; 1336 td = curthread; 1337 1338 vcpu_lock(vcpu); 1339 while (1) { 1340 /* 1341 * Do a final check for pending NMI or interrupts before 1342 * really putting this thread to sleep. Also check for 1343 * software events that would cause this vcpu to wakeup. 1344 * 1345 * These interrupts/events could have happened after the 1346 * vcpu returned from vmmops_run() and before it acquired the 1347 * vcpu lock above. 1348 */ 1349 if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) 1350 break; 1351 if (vm_nmi_pending(vm, vcpuid)) 1352 break; 1353 if (!intr_disabled) { 1354 if (vm_extint_pending(vm, vcpuid) || 1355 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1356 break; 1357 } 1358 } 1359 1360 /* Don't go to sleep if the vcpu thread needs to yield */ 1361 if (vcpu_should_yield(vm, vcpuid)) 1362 break; 1363 1364 if (vcpu_debugged(vm, vcpuid)) 1365 break; 1366 1367 /* 1368 * Some Linux guests implement "halt" by having all vcpus 1369 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1370 * track of the vcpus that have entered this state. When all 1371 * vcpus enter the halted state the virtual machine is halted. 
1372 */ 1373 if (intr_disabled) { 1374 wmesg = "vmhalt"; 1375 VCPU_CTR0(vm, vcpuid, "Halted"); 1376 if (!vcpu_halted && halt_detection_enabled) { 1377 vcpu_halted = 1; 1378 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1379 } 1380 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1381 vm_halted = 1; 1382 break; 1383 } 1384 } else { 1385 wmesg = "vmidle"; 1386 } 1387 1388 t = ticks; 1389 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1390 /* 1391 * XXX msleep_spin() cannot be interrupted by signals so 1392 * wake up periodically to check pending signals. 1393 */ 1394 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); 1395 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1396 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); 1397 if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { 1398 vcpu_unlock(vcpu); 1399 error = thread_check_susp(td, false); 1400 if (error != 0) 1401 return (error); 1402 vcpu_lock(vcpu); 1403 } 1404 } 1405 1406 if (vcpu_halted) 1407 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1408 1409 vcpu_unlock(vcpu); 1410 1411 if (vm_halted) 1412 vm_suspend(vm, VM_SUSPEND_HALT); 1413 1414 return (0); 1415 } 1416 1417 static int 1418 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) 1419 { 1420 int rv, ftype; 1421 struct vm_map *map; 1422 struct vcpu *vcpu; 1423 struct vm_exit *vme; 1424 1425 vcpu = &vm->vcpu[vcpuid]; 1426 vme = &vcpu->exitinfo; 1427 1428 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1429 __func__, vme->inst_length)); 1430 1431 ftype = vme->u.paging.fault_type; 1432 KASSERT(ftype == VM_PROT_READ || 1433 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, 1434 ("vm_handle_paging: invalid fault_type %d", ftype)); 1435 1436 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { 1437 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), 1438 vme->u.paging.gpa, ftype); 1439 if (rv == 0) { 1440 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", 1441 ftype == VM_PROT_READ ? 
"accessed" : "dirty", 1442 vme->u.paging.gpa); 1443 goto done; 1444 } 1445 } 1446 1447 map = &vm->vmspace->vm_map; 1448 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL); 1449 1450 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " 1451 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1452 1453 if (rv != KERN_SUCCESS) 1454 return (EFAULT); 1455 done: 1456 return (0); 1457 } 1458 1459 static int 1460 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) 1461 { 1462 struct vie *vie; 1463 struct vcpu *vcpu; 1464 struct vm_exit *vme; 1465 uint64_t gla, gpa, cs_base; 1466 struct vm_guest_paging *paging; 1467 mem_region_read_t mread; 1468 mem_region_write_t mwrite; 1469 enum vm_cpu_mode cpu_mode; 1470 int cs_d, error, fault; 1471 1472 vcpu = &vm->vcpu[vcpuid]; 1473 vme = &vcpu->exitinfo; 1474 1475 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1476 __func__, vme->inst_length)); 1477 1478 gla = vme->u.inst_emul.gla; 1479 gpa = vme->u.inst_emul.gpa; 1480 cs_base = vme->u.inst_emul.cs_base; 1481 cs_d = vme->u.inst_emul.cs_d; 1482 vie = &vme->u.inst_emul.vie; 1483 paging = &vme->u.inst_emul.paging; 1484 cpu_mode = paging->cpu_mode; 1485 1486 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); 1487 1488 /* Fetch, decode and emulate the faulting instruction */ 1489 if (vie->num_valid == 0) { 1490 error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + 1491 cs_base, VIE_INST_SIZE, vie, &fault); 1492 } else { 1493 /* 1494 * The instruction bytes have already been copied into 'vie' 1495 */ 1496 error = fault = 0; 1497 } 1498 if (error || fault) 1499 return (error); 1500 1501 if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { 1502 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", 1503 vme->rip + cs_base); 1504 *retu = true; /* dump instruction bytes in userspace */ 1505 return (0); 1506 } 1507 1508 /* 1509 * Update 'nextrip' based on the length of the emulated instruction. 1510 */ 1511 vme->inst_length = vie->num_processed; 1512 vcpu->nextrip += vie->num_processed; 1513 VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " 1514 "decoding", vcpu->nextrip); 1515 1516 /* return to userland unless this is an in-kernel emulated device */ 1517 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1518 mread = lapic_mmio_read; 1519 mwrite = lapic_mmio_write; 1520 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1521 mread = vioapic_mmio_read; 1522 mwrite = vioapic_mmio_write; 1523 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1524 mread = vhpet_mmio_read; 1525 mwrite = vhpet_mmio_write; 1526 } else { 1527 *retu = true; 1528 return (0); 1529 } 1530 1531 error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, 1532 mread, mwrite, retu); 1533 1534 return (error); 1535 } 1536 1537 static int 1538 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) 1539 { 1540 int error, i; 1541 struct vcpu *vcpu; 1542 struct thread *td; 1543 1544 error = 0; 1545 vcpu = &vm->vcpu[vcpuid]; 1546 td = curthread; 1547 1548 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 1549 1550 /* 1551 * Wait until all 'active_cpus' have suspended themselves. 1552 * 1553 * Since a VM may be suspended at any time including when one or 1554 * more vcpus are doing a rendezvous we need to call the rendezvous 1555 * handler while we are waiting to prevent a deadlock. 
1556 */ 1557 vcpu_lock(vcpu); 1558 while (error == 0) { 1559 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1560 VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); 1561 break; 1562 } 1563 1564 if (vm->rendezvous_func == NULL) { 1565 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); 1566 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1567 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); 1568 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1569 if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { 1570 vcpu_unlock(vcpu); 1571 error = thread_check_susp(td, false); 1572 vcpu_lock(vcpu); 1573 } 1574 } else { 1575 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); 1576 vcpu_unlock(vcpu); 1577 error = vm_handle_rendezvous(vm, vcpuid); 1578 vcpu_lock(vcpu); 1579 } 1580 } 1581 vcpu_unlock(vcpu); 1582 1583 /* 1584 * Wakeup the other sleeping vcpus and return to userspace. 1585 */ 1586 for (i = 0; i < vm->maxcpus; i++) { 1587 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1588 vcpu_notify_event(vm, i, false); 1589 } 1590 } 1591 1592 *retu = true; 1593 return (error); 1594 } 1595 1596 static int 1597 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) 1598 { 1599 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1600 1601 vcpu_lock(vcpu); 1602 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1603 vcpu->reqidle = 0; 1604 vcpu_unlock(vcpu); 1605 *retu = true; 1606 return (0); 1607 } 1608 1609 int 1610 vm_suspend(struct vm *vm, enum vm_suspend_how how) 1611 { 1612 int i; 1613 1614 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1615 return (EINVAL); 1616 1617 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { 1618 VM_CTR2(vm, "virtual machine already suspended %d/%d", 1619 vm->suspend, how); 1620 return (EALREADY); 1621 } 1622 1623 VM_CTR1(vm, "virtual machine successfully suspended %d", how); 1624 1625 /* 1626 * Notify all active vcpus that they are now suspended. 
1627 */ 1628 for (i = 0; i < vm->maxcpus; i++) { 1629 if (CPU_ISSET(i, &vm->active_cpus)) 1630 vcpu_notify_event(vm, i, false); 1631 } 1632 1633 return (0); 1634 } 1635 1636 void 1637 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) 1638 { 1639 struct vm_exit *vmexit; 1640 1641 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, 1642 ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); 1643 1644 vmexit = vm_exitinfo(vm, vcpuid); 1645 vmexit->rip = rip; 1646 vmexit->inst_length = 0; 1647 vmexit->exitcode = VM_EXITCODE_SUSPENDED; 1648 vmexit->u.suspended.how = vm->suspend; 1649 } 1650 1651 void 1652 vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) 1653 { 1654 struct vm_exit *vmexit; 1655 1656 vmexit = vm_exitinfo(vm, vcpuid); 1657 vmexit->rip = rip; 1658 vmexit->inst_length = 0; 1659 vmexit->exitcode = VM_EXITCODE_DEBUG; 1660 } 1661 1662 void 1663 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) 1664 { 1665 struct vm_exit *vmexit; 1666 1667 KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); 1668 1669 vmexit = vm_exitinfo(vm, vcpuid); 1670 vmexit->rip = rip; 1671 vmexit->inst_length = 0; 1672 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 1673 vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); 1674 } 1675 1676 void 1677 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) 1678 { 1679 struct vm_exit *vmexit; 1680 1681 vmexit = vm_exitinfo(vm, vcpuid); 1682 vmexit->rip = rip; 1683 vmexit->inst_length = 0; 1684 vmexit->exitcode = VM_EXITCODE_REQIDLE; 1685 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 1686 } 1687 1688 void 1689 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) 1690 { 1691 struct vm_exit *vmexit; 1692 1693 vmexit = vm_exitinfo(vm, vcpuid); 1694 vmexit->rip = rip; 1695 vmexit->inst_length = 0; 1696 vmexit->exitcode = VM_EXITCODE_BOGUS; 1697 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 1698 } 1699 1700 int 1701 vm_run(struct vm *vm, struct vm_run *vmrun) 1702 { 1703 struct vm_eventinfo evinfo; 1704 int error, vcpuid; 1705 struct vcpu *vcpu; 1706 struct pcb *pcb; 1707 uint64_t tscval; 1708 struct vm_exit *vme; 1709 bool retu, intr_disabled; 1710 pmap_t pmap; 1711 1712 vcpuid = vmrun->cpuid; 1713 1714 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1715 return (EINVAL); 1716 1717 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 1718 return (EINVAL); 1719 1720 if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) 1721 return (EINVAL); 1722 1723 pmap = vmspace_pmap(vm->vmspace); 1724 vcpu = &vm->vcpu[vcpuid]; 1725 vme = &vcpu->exitinfo; 1726 evinfo.rptr = &vm->rendezvous_func; 1727 evinfo.sptr = &vm->suspend; 1728 evinfo.iptr = &vcpu->reqidle; 1729 restart: 1730 critical_enter(); 1731 1732 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1733 ("vm_run: absurd pm_active")); 1734 1735 tscval = rdtsc(); 1736 1737 pcb = PCPU_GET(curpcb); 1738 set_pcb_flags(pcb, PCB_FULL_IRET); 1739 1740 restore_guest_fpustate(vcpu); 1741 1742 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 1743 error = vmmops_run(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); 1744 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 1745 1746 save_guest_fpustate(vcpu); 1747 1748 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 1749 1750 critical_exit(); 1751 1752 if (error == 0) { 1753 retu = false; 1754 vcpu->nextrip = vme->rip + vme->inst_length; 1755 switch (vme->exitcode) { 1756 case VM_EXITCODE_REQIDLE: 1757 error = vm_handle_reqidle(vm, vcpuid, &retu); 1758 break; 1759 case VM_EXITCODE_SUSPENDED: 1760 error = vm_handle_suspend(vm, vcpuid, &retu); 
1761 break; 1762 case VM_EXITCODE_IOAPIC_EOI: 1763 vioapic_process_eoi(vm, vcpuid, 1764 vme->u.ioapic_eoi.vector); 1765 break; 1766 case VM_EXITCODE_RENDEZVOUS: 1767 error = vm_handle_rendezvous(vm, vcpuid); 1768 break; 1769 case VM_EXITCODE_HLT: 1770 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 1771 error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); 1772 break; 1773 case VM_EXITCODE_PAGING: 1774 error = vm_handle_paging(vm, vcpuid, &retu); 1775 break; 1776 case VM_EXITCODE_INST_EMUL: 1777 error = vm_handle_inst_emul(vm, vcpuid, &retu); 1778 break; 1779 case VM_EXITCODE_INOUT: 1780 case VM_EXITCODE_INOUT_STR: 1781 error = vm_handle_inout(vm, vcpuid, vme, &retu); 1782 break; 1783 case VM_EXITCODE_MONITOR: 1784 case VM_EXITCODE_MWAIT: 1785 case VM_EXITCODE_VMINSN: 1786 vm_inject_ud(vm, vcpuid); 1787 break; 1788 default: 1789 retu = true; /* handled in userland */ 1790 break; 1791 } 1792 } 1793 1794 if (error == 0 && retu == false) 1795 goto restart; 1796 1797 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); 1798 1799 /* copy the exit information */ 1800 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); 1801 return (error); 1802 } 1803 1804 int 1805 vm_restart_instruction(void *arg, int vcpuid) 1806 { 1807 struct vm *vm; 1808 struct vcpu *vcpu; 1809 enum vcpu_state state; 1810 uint64_t rip; 1811 int error; 1812 1813 vm = arg; 1814 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1815 return (EINVAL); 1816 1817 vcpu = &vm->vcpu[vcpuid]; 1818 state = vcpu_get_state(vm, vcpuid, NULL); 1819 if (state == VCPU_RUNNING) { 1820 /* 1821 * When a vcpu is "running" the next instruction is determined 1822 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 1823 * Thus setting 'inst_length' to zero will cause the current 1824 * instruction to be restarted. 1825 */ 1826 vcpu->exitinfo.inst_length = 0; 1827 VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " 1828 "setting inst_length to zero", vcpu->exitinfo.rip); 1829 } else if (state == VCPU_FROZEN) { 1830 /* 1831 * When a vcpu is "frozen" it is outside the critical section 1832 * around vmmops_run() and 'nextrip' points to the next 1833 * instruction. Thus instruction restart is achieved by setting 1834 * 'nextrip' to the vcpu's %rip. 
1835 */ 1836 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 1837 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 1838 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " 1839 "nextrip from %#lx to %#lx", vcpu->nextrip, rip); 1840 vcpu->nextrip = rip; 1841 } else { 1842 panic("%s: invalid state %d", __func__, state); 1843 } 1844 return (0); 1845 } 1846 1847 int 1848 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 1849 { 1850 struct vcpu *vcpu; 1851 int type, vector; 1852 1853 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 1854 return (EINVAL); 1855 1856 vcpu = &vm->vcpu[vcpuid]; 1857 1858 if (info & VM_INTINFO_VALID) { 1859 type = info & VM_INTINFO_TYPE; 1860 vector = info & 0xff; 1861 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 1862 return (EINVAL); 1863 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 1864 return (EINVAL); 1865 if (info & VM_INTINFO_RSVD) 1866 return (EINVAL); 1867 } else { 1868 info = 0; 1869 } 1870 VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); 1871 vcpu->exitintinfo = info; 1872 return (0); 1873 } 1874 1875 enum exc_class { 1876 EXC_BENIGN, 1877 EXC_CONTRIBUTORY, 1878 EXC_PAGEFAULT 1879 }; 1880 1881 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 1882 1883 static enum exc_class 1884 exception_class(uint64_t info) 1885 { 1886 int type, vector; 1887 1888 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); 1889 type = info & VM_INTINFO_TYPE; 1890 vector = info & 0xff; 1891 1892 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 1893 switch (type) { 1894 case VM_INTINFO_HWINTR: 1895 case VM_INTINFO_SWINTR: 1896 case VM_INTINFO_NMI: 1897 return (EXC_BENIGN); 1898 default: 1899 /* 1900 * Hardware exception. 1901 * 1902 * SVM and VT-x use identical type values to represent NMI, 1903 * hardware interrupt and software interrupt. 1904 * 1905 * SVM uses type '3' for all exceptions. VT-x uses type '3' 1906 * for exceptions except #BP and #OF. #BP and #OF use a type 1907 * value of '5' or '6'. Therefore we don't check for explicit 1908 * values of 'type' to classify 'intinfo' into a hardware 1909 * exception. 1910 */ 1911 break; 1912 } 1913 1914 switch (vector) { 1915 case IDT_PF: 1916 case IDT_VE: 1917 return (EXC_PAGEFAULT); 1918 case IDT_DE: 1919 case IDT_TS: 1920 case IDT_NP: 1921 case IDT_SS: 1922 case IDT_GP: 1923 return (EXC_CONTRIBUTORY); 1924 default: 1925 return (EXC_BENIGN); 1926 } 1927 } 1928 1929 static int 1930 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 1931 uint64_t *retinfo) 1932 { 1933 enum exc_class exc1, exc2; 1934 int type1, vector1; 1935 1936 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); 1937 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); 1938 1939 /* 1940 * If an exception occurs while attempting to call the double-fault 1941 * handler the processor enters shutdown mode (aka triple fault). 
1942 */ 1943 type1 = info1 & VM_INTINFO_TYPE; 1944 vector1 = info1 & 0xff; 1945 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 1946 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", 1947 info1, info2); 1948 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 1949 *retinfo = 0; 1950 return (0); 1951 } 1952 1953 /* 1954 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 1955 */ 1956 exc1 = exception_class(info1); 1957 exc2 = exception_class(info2); 1958 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 1959 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 1960 /* Convert nested fault into a double fault. */ 1961 *retinfo = IDT_DF; 1962 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1963 *retinfo |= VM_INTINFO_DEL_ERRCODE; 1964 } else { 1965 /* Handle exceptions serially */ 1966 *retinfo = info2; 1967 } 1968 return (1); 1969 } 1970 1971 static uint64_t 1972 vcpu_exception_intinfo(struct vcpu *vcpu) 1973 { 1974 uint64_t info = 0; 1975 1976 if (vcpu->exception_pending) { 1977 info = vcpu->exc_vector & 0xff; 1978 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1979 if (vcpu->exc_errcode_valid) { 1980 info |= VM_INTINFO_DEL_ERRCODE; 1981 info |= (uint64_t)vcpu->exc_errcode << 32; 1982 } 1983 } 1984 return (info); 1985 } 1986 1987 int 1988 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 1989 { 1990 struct vcpu *vcpu; 1991 uint64_t info1, info2; 1992 int valid; 1993 1994 KASSERT(vcpuid >= 0 && 1995 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); 1996 1997 vcpu = &vm->vcpu[vcpuid]; 1998 1999 info1 = vcpu->exitintinfo; 2000 vcpu->exitintinfo = 0; 2001 2002 info2 = 0; 2003 if (vcpu->exception_pending) { 2004 info2 = vcpu_exception_intinfo(vcpu); 2005 vcpu->exception_pending = 0; 2006 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", 2007 vcpu->exc_vector, info2); 2008 } 2009 2010 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 2011 valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 2012 } else if (info1 & VM_INTINFO_VALID) { 2013 *retinfo = info1; 2014 valid = 1; 2015 } else if (info2 & VM_INTINFO_VALID) { 2016 *retinfo = info2; 2017 valid = 1; 2018 } else { 2019 valid = 0; 2020 } 2021 2022 if (valid) { 2023 VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " 2024 "retinfo(%#lx)", __func__, info1, info2, *retinfo); 2025 } 2026 2027 return (valid); 2028 } 2029 2030 int 2031 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2032 { 2033 struct vcpu *vcpu; 2034 2035 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2036 return (EINVAL); 2037 2038 vcpu = &vm->vcpu[vcpuid]; 2039 *info1 = vcpu->exitintinfo; 2040 *info2 = vcpu_exception_intinfo(vcpu); 2041 return (0); 2042 } 2043 2044 int 2045 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, 2046 uint32_t errcode, int restart_instruction) 2047 { 2048 struct vcpu *vcpu; 2049 uint64_t regval; 2050 int error; 2051 2052 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2053 return (EINVAL); 2054 2055 if (vector < 0 || vector >= 32) 2056 return (EINVAL); 2057 2058 /* 2059 * A double fault exception should never be injected directly into 2060 * the guest. It is a derived exception that results from specific 2061 * combinations of nested faults. 
2062 */ 2063 if (vector == IDT_DF) 2064 return (EINVAL); 2065 2066 vcpu = &vm->vcpu[vcpuid]; 2067 2068 if (vcpu->exception_pending) { 2069 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 2070 "pending exception %d", vector, vcpu->exc_vector); 2071 return (EBUSY); 2072 } 2073 2074 if (errcode_valid) { 2075 /* 2076 * Exceptions don't deliver an error code in real mode. 2077 */ 2078 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); 2079 KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); 2080 if (!(regval & CR0_PE)) 2081 errcode_valid = 0; 2082 } 2083 2084 /* 2085 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2086 * 2087 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2088 * one instruction or incurs an exception. 2089 */ 2090 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2091 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 2092 __func__, error)); 2093 2094 if (restart_instruction) 2095 vm_restart_instruction(vm, vcpuid); 2096 2097 vcpu->exception_pending = 1; 2098 vcpu->exc_vector = vector; 2099 vcpu->exc_errcode = errcode; 2100 vcpu->exc_errcode_valid = errcode_valid; 2101 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); 2102 return (0); 2103 } 2104 2105 void 2106 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, 2107 int errcode) 2108 { 2109 struct vm *vm; 2110 int error, restart_instruction; 2111 2112 vm = vmarg; 2113 restart_instruction = 1; 2114 2115 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, 2116 errcode, restart_instruction); 2117 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 2118 } 2119 2120 void 2121 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) 2122 { 2123 struct vm *vm; 2124 int error; 2125 2126 vm = vmarg; 2127 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", 2128 error_code, cr2); 2129 2130 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 2131 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 2132 2133 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 2134 } 2135 2136 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2137 2138 int 2139 vm_inject_nmi(struct vm *vm, int vcpuid) 2140 { 2141 struct vcpu *vcpu; 2142 2143 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2144 return (EINVAL); 2145 2146 vcpu = &vm->vcpu[vcpuid]; 2147 2148 vcpu->nmi_pending = 1; 2149 vcpu_notify_event(vm, vcpuid, false); 2150 return (0); 2151 } 2152 2153 int 2154 vm_nmi_pending(struct vm *vm, int vcpuid) 2155 { 2156 struct vcpu *vcpu; 2157 2158 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2159 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2160 2161 vcpu = &vm->vcpu[vcpuid]; 2162 2163 return (vcpu->nmi_pending); 2164 } 2165 2166 void 2167 vm_nmi_clear(struct vm *vm, int vcpuid) 2168 { 2169 struct vcpu *vcpu; 2170 2171 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2172 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2173 2174 vcpu = &vm->vcpu[vcpuid]; 2175 2176 if (vcpu->nmi_pending == 0) 2177 panic("vm_nmi_clear: inconsistent nmi_pending state"); 2178 2179 vcpu->nmi_pending = 0; 2180 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2181 } 2182 2183 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2184 2185 int 2186 vm_inject_extint(struct vm *vm, int vcpuid) 2187 { 2188 struct vcpu *vcpu; 2189 2190 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2191 return (EINVAL); 2192 2193 vcpu = &vm->vcpu[vcpuid]; 2194 2195 vcpu->extint_pending = 1; 

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_getcap(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (vmmops_setcap(vm->cookie, vcpu, type, val));
}
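
/*
 * Illustrative sketch (not compiled here): capabilities are normally
 * toggled from the vmmdev ioctl path on behalf of userspace.  For
 * example, a hypothetical caller that wants a vcpu to exit on HLT
 * might do:
 *
 *	if (vm_set_capability(vm, vcpuid, VM_CAP_HALT_EXIT, 1) != 0)
 *		;	// backend does not support this capability
 *
 * The range checks above only validate the vcpu and capability type;
 * whether a capability is actually supported is decided by the
 * processor-specific backend via vmmops_getcap()/vmmops_setcap().
 */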

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

bool
vmm_is_pptdev(int bus, int slot, int func)
{
	int b, f, i, n, s;
	char *val, *cp, *cp2;
	bool found;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = false;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = kern_getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = true;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}
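
/*
 * Illustrative sketch: the tunables scanned above are space-separated
 * lists of bus/slot/function triplets, typically set from loader.conf,
 * e.g. (hypothetical device addresses):
 *
 *	pptdevs="2/0/0 5/0/0"
 *	pptdevs2="10/0/0"
 *
 * A device listed in any of these variables is claimed for PCI
 * passthrough instead of attaching to its regular host driver.
 */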

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}
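
/*
 * Illustrative sketch (a simplified summary, not an exhaustive
 * specification): vcpu_set_state() serializes access to a vcpu by
 * moving it through VCPU_IDLE, VCPU_FROZEN, VCPU_RUNNING and
 * VCPU_SLEEPING.  A hypothetical caller that needs exclusive access
 * to an otherwise idle vcpu might do:
 *
 *	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true);
 *	if (error == 0) {
 *		// ... inspect or modify vcpu state ...
 *		vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);
 *	}
 *
 * The 'from_idle' argument asks vcpu_set_state_locked() to wait until
 * the vcpu has returned to VCPU_IDLE before performing the transition.
 */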

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

int
vm_suspend_cpu(struct vm *vm, int vcpuid)
{
	int i;

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		vm->debug_cpus = vm->active_cpus;
		for (i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm, i, false);
		}
	} else {
		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vm, vcpuid, false);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
	}
	return (0);
}

int
vcpu_debugged(struct vm *vm, int vcpuid)
{

	return (CPU_ISSET(vcpuid, &vm->debug_cpus));
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_debug_cpus(struct vm *vm)
{

	return (vm->debug_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, lapic_intr);
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

int
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int error, i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		error = vm_handle_rendezvous(vm, vcpuid);
		if (error != 0)
			return (error);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm->rendezvous_func = func;
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	return (vm_handle_rendezvous(vm, vcpuid));
}
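
/*
 * Illustrative sketch (hypothetical caller): a rendezvous runs 'func' on
 * every vcpu named in 'dest', with the calling vcpu participating when
 * 'vcpuid' is not -1.  Code that must run a function on all active vcpus
 * before continuing might look like:
 *
 *	static void
 *	example_rendezvous_func(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		// runs once on each target vcpu
 *	}
 *
 *	cpuset_t dest = vm_active_cpus(vm);
 *	error = vm_smp_rendezvous(vm, vcpuid, dest,
 *	    example_rendezvous_func, NULL);
 *
 * The call returns once every vcpu in 'dest' has executed the function.
 */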

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

struct vpmtmr *
vm_pmtmr(struct vm *vm)
{

	return (vm->vpmtmr);
}

struct vrtc *
vm_rtc(struct vm *vm)
{

	return (vm->vrtc);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo, int *fault)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
		    copyinfo[idx].len, prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}
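
/*
 * Illustrative sketch (hypothetical emulation path): the routines above
 * are meant to be used together.  vm_copy_setup() translates and holds
 * the guest linear range, vm_copyin()/vm_copyout() move the bytes, and
 * vm_copy_teardown() drops the page references:
 *
 *	struct vm_copyinfo copyinfo[2];		// range may span a page
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && !fault) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 *
 * If 'fault' is set, the guest-linear-to-physical translation faulted
 * and the fault has typically already been made pending for the guest,
 * so the caller simply resumes the guest to deliver it.
 */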

/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vcpu 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);

#ifdef BHYVE_SNAPSHOT
static int
vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int ret;
	int i;
	struct vcpu *vcpu;

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu = &vm->vcpu[i];

		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
		/*
		 * XXX we're cheating here: the value of tsc_offset saved
		 * here is actually the guest's TSC value.
		 *
		 * It will be turned back into an actual offset when the
		 * TSC restore function is called.
		 */
		SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done);
	}

done:
	return (ret);
}

static int
vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int ret;
	int i;
	uint64_t now;

	ret = 0;
	now = rdtsc();

	if (meta->op == VM_SNAPSHOT_SAVE) {
		/*
		 * XXX make tsc_offset take the value of the TSC proper as
		 * seen by the guest.
		 */
		for (i = 0; i < VM_MAXCPU; i++)
			vm->vcpu[i].tsc_offset += now;
	}

	ret = vm_snapshot_vcpus(vm, meta);
	if (ret != 0) {
		printf("%s: failed to copy vm data to user buffer", __func__);
		goto done;
	}

	if (meta->op == VM_SNAPSHOT_SAVE) {
		/*
		 * XXX turn tsc_offset back into an offset; the actual value
		 * is only required for restore and using it otherwise would
		 * be wrong.
		 */
		for (i = 0; i < VM_MAXCPU; i++)
			vm->vcpu[i].tsc_offset -= now;
	}

done:
	return (ret);
}

static int
vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int i, error;

	error = 0;

	for (i = 0; i < VM_MAXCPU; i++) {
		error = vmmops_vmcx_snapshot(vm->cookie, meta, i);
		if (error != 0) {
			printf("%s: failed to snapshot vmcs/vmcb data for "
			    "vCPU: %d; error: %d\n", __func__, i, error);
			goto done;
		}
	}

done:
	return (error);
}

/*
 * Save kernel-side structures to user-space for snapshotting.
 */
int
vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
{
	int ret = 0;

	switch (meta->dev_req) {
	case STRUCT_VMX:
		ret = vmmops_snapshot(vm->cookie, meta);
		break;
	case STRUCT_VMCX:
		ret = vm_snapshot_vmcx(vm, meta);
		break;
	case STRUCT_VM:
		ret = vm_snapshot_vm(vm, meta);
		break;
	case STRUCT_VIOAPIC:
		ret = vioapic_snapshot(vm_ioapic(vm), meta);
		break;
	case STRUCT_VLAPIC:
		ret = vlapic_snapshot(vm, meta);
		break;
	case STRUCT_VHPET:
		ret = vhpet_snapshot(vm_hpet(vm), meta);
		break;
	case STRUCT_VATPIC:
		ret = vatpic_snapshot(vm_atpic(vm), meta);
		break;
	case STRUCT_VATPIT:
		ret = vatpit_snapshot(vm_atpit(vm), meta);
		break;
	case STRUCT_VPMTMR:
		ret = vpmtmr_snapshot(vm_pmtmr(vm), meta);
		break;
	case STRUCT_VRTC:
		ret = vrtc_snapshot(vm_rtc(vm), meta);
		break;
	default:
		printf("%s: failed to find the requested type %#x\n",
		    __func__, meta->dev_req);
		ret = EINVAL;
	}
	return (ret);
}

int
vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	vcpu->tsc_offset = offset;

	return (0);
}

int
vm_restore_time(struct vm *vm)
{
	int error, i;
	uint64_t now;
	struct vcpu *vcpu;

	now = rdtsc();

	error = vhpet_restore_time(vm_hpet(vm));
	if (error)
		return (error);

	for (i = 0; i < nitems(vm->vcpu); i++) {
		vcpu = &vm->vcpu[i];

		error = vmmops_restore_tsc(vm->cookie, i, vcpu->tsc_offset -
		    now);
		if (error)
			return (error);
	}

	return (0);
}
#endif