1 /*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/kernel.h> 35 #include <sys/module.h> 36 #include <sys/sysctl.h> 37 #include <sys/malloc.h> 38 #include <sys/pcpu.h> 39 #include <sys/lock.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/rwlock.h> 43 #include <sys/sched.h> 44 #include <sys/smp.h> 45 #include <sys/systm.h> 46 47 #include <vm/vm.h> 48 #include <vm/vm_object.h> 49 #include <vm/vm_page.h> 50 #include <vm/pmap.h> 51 #include <vm/vm_map.h> 52 #include <vm/vm_extern.h> 53 #include <vm/vm_param.h> 54 55 #include <machine/cpu.h> 56 #include <machine/vm.h> 57 #include <machine/pcb.h> 58 #include <machine/smp.h> 59 #include <x86/psl.h> 60 #include <x86/apicreg.h> 61 #include <machine/vmparam.h> 62 63 #include <machine/vmm.h> 64 #include <machine/vmm_dev.h> 65 #include <machine/vmm_instruction_emul.h> 66 67 #include "vmm_ioport.h" 68 #include "vmm_ktr.h" 69 #include "vmm_host.h" 70 #include "vmm_mem.h" 71 #include "vmm_util.h" 72 #include "vatpic.h" 73 #include "vatpit.h" 74 #include "vhpet.h" 75 #include "vioapic.h" 76 #include "vlapic.h" 77 #include "vpmtmr.h" 78 #include "vrtc.h" 79 #include "vmm_stat.h" 80 #include "vmm_lapic.h" 81 82 #include "io/ppt.h" 83 #include "io/iommu.h" 84 85 struct vlapic; 86 87 /* 88 * Initialization: 89 * (a) allocated when vcpu is created 90 * (i) initialized when vcpu is created and when it is reinitialized 91 * (o) initialized the first time the vcpu is created 92 * (x) initialized before use 93 */ 94 struct vcpu { 95 struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ 96 enum vcpu_state state; /* (o) vcpu state */ 97 int hostcpu; /* (o) vcpu's host cpu */ 98 int reqidle; /* (i) request vcpu to idle */ 99 struct vlapic *vlapic; /* (i) APIC device model */ 100 enum x2apic_state x2apic_state; /* (i) APIC mode */ 101 uint64_t exitintinfo; /* (i) events pending at VM exit */ 102 int nmi_pending; /* (i) NMI pending */ 103 int extint_pending; /* (i) INTR pending */ 104 int exception_pending; /* (i) exception pending */ 105 int exc_vector; /* (x) exception collateral */ 106 int 
exc_errcode_valid; 107 uint32_t exc_errcode; 108 struct savefpu *guestfpu; /* (a,i) guest fpu state */ 109 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 110 void *stats; /* (a,i) statistics */ 111 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 112 uint64_t nextrip; /* (x) next instruction to execute */ 113 }; 114 115 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) 116 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) 117 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) 118 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) 119 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) 120 121 struct mem_seg { 122 size_t len; 123 bool sysmem; 124 struct vm_object *object; 125 }; 126 #define VM_MAX_MEMSEGS 2 127 128 struct mem_map { 129 vm_paddr_t gpa; 130 size_t len; 131 vm_ooffset_t segoff; 132 int segid; 133 int prot; 134 int flags; 135 }; 136 #define VM_MAX_MEMMAPS 4 137 138 /* 139 * Initialization: 140 * (o) initialized the first time the VM is created 141 * (i) initialized when VM is created and when it is reinitialized 142 * (x) initialized before use 143 */ 144 struct vm { 145 void *cookie; /* (i) cpu-specific data */ 146 void *iommu; /* (x) iommu-specific data */ 147 struct vhpet *vhpet; /* (i) virtual HPET */ 148 struct vioapic *vioapic; /* (i) virtual ioapic */ 149 struct vatpic *vatpic; /* (i) virtual atpic */ 150 struct vatpit *vatpit; /* (i) virtual atpit */ 151 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 152 struct vrtc *vrtc; /* (o) virtual RTC */ 153 volatile cpuset_t active_cpus; /* (i) active vcpus */ 154 int suspend; /* (i) stop VM execution */ 155 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 156 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 157 cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ 158 cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ 159 void *rendezvous_arg; /* (x) rendezvous func/arg */ 160 vm_rendezvous_func_t rendezvous_func; 161 struct mtx rendezvous_mtx; /* (o) rendezvous lock */ 162 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ 163 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ 164 struct vmspace *vmspace; /* (o) guest's address space */ 165 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 166 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 167 }; 168 169 static int vmm_initialized; 170 171 static struct vmm_ops *ops; 172 #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) 173 #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) 174 #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) 175 176 #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) 177 #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ 178 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) 179 #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) 180 #define VMSPACE_ALLOC(min, max) \ 181 (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) 182 #define VMSPACE_FREE(vmspace) \ 183 (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) 184 #define VMGETREG(vmi, vcpu, num, retval) \ 185 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) 186 #define VMSETREG(vmi, vcpu, num, val) \ 187 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) 188 #define VMGETDESC(vmi, vcpu, num, desc) \ 189 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) 190 #define VMSETDESC(vmi, vcpu, num, desc) \ 191 (ops != NULL ? 
(*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) 192 #define VMGETCAP(vmi, vcpu, num, retval) \ 193 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) 194 #define VMSETCAP(vmi, vcpu, num, val) \ 195 (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) 196 #define VLAPIC_INIT(vmi, vcpu) \ 197 (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) 198 #define VLAPIC_CLEANUP(vmi, vlapic) \ 199 (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) 200 201 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 202 #define fpu_stop_emulating() clts() 203 204 static MALLOC_DEFINE(M_VM, "vm", "vm"); 205 206 /* statistics */ 207 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); 208 209 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); 210 211 /* 212 * Halt the guest if all vcpus are executing a HLT instruction with 213 * interrupts disabled. 214 */ 215 static int halt_detection_enabled = 1; 216 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, 217 &halt_detection_enabled, 0, 218 "Halt VM if all vcpus execute HLT with interrupts disabled"); 219 220 static int vmm_ipinum; 221 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, 222 "IPI vector used for vcpu notifications"); 223 224 static int trace_guest_exceptions; 225 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, 226 &trace_guest_exceptions, 0, 227 "Trap into hypervisor on all guest exceptions and reflect them back"); 228 229 static int vmm_force_iommu = 0; 230 TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu); 231 SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0, 232 "Force use of I/O MMU even if no passthrough devices were found."); 233 234 static void vm_free_memmap(struct vm *vm, int ident); 235 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); 236 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); 237 238 #ifdef KTR 239 static const char * 240 vcpu_state2str(enum vcpu_state state) 241 { 242 243 switch (state) { 244 case VCPU_IDLE: 245 return ("idle"); 246 case VCPU_FROZEN: 247 return ("frozen"); 248 case VCPU_RUNNING: 249 return ("running"); 250 case VCPU_SLEEPING: 251 return ("sleeping"); 252 default: 253 return ("unknown"); 254 } 255 } 256 #endif 257 258 static void 259 vcpu_cleanup(struct vm *vm, int i, bool destroy) 260 { 261 struct vcpu *vcpu = &vm->vcpu[i]; 262 263 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 264 if (destroy) { 265 vmm_stat_free(vcpu->stats); 266 fpu_save_area_free(vcpu->guestfpu); 267 } 268 } 269 270 static void 271 vcpu_init(struct vm *vm, int vcpu_id, bool create) 272 { 273 struct vcpu *vcpu; 274 275 KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, 276 ("vcpu_init: invalid vcpu %d", vcpu_id)); 277 278 vcpu = &vm->vcpu[vcpu_id]; 279 280 if (create) { 281 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " 282 "initialized", vcpu_id)); 283 vcpu_lock_init(vcpu); 284 vcpu->state = VCPU_IDLE; 285 vcpu->hostcpu = NOCPU; 286 vcpu->guestfpu = fpu_save_area_alloc(); 287 vcpu->stats = vmm_stat_alloc(); 288 } 289 290 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); 291 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); 292 vcpu->reqidle = 0; 293 vcpu->exitintinfo = 0; 294 vcpu->nmi_pending = 0; 295 vcpu->extint_pending = 0; 296 vcpu->exception_pending = 0; 297 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 298 fpu_save_area_reset(vcpu->guestfpu); 299 vmm_stat_init(vcpu->stats); 300 } 301 302 int 303 vcpu_trace_exceptions(struct vm *vm, int vcpuid) 304 { 305 306 return 
(trace_guest_exceptions); 307 } 308 309 struct vm_exit * 310 vm_exitinfo(struct vm *vm, int cpuid) 311 { 312 struct vcpu *vcpu; 313 314 if (cpuid < 0 || cpuid >= VM_MAXCPU) 315 panic("vm_exitinfo: invalid cpuid %d", cpuid); 316 317 vcpu = &vm->vcpu[cpuid]; 318 319 return (&vcpu->exitinfo); 320 } 321 322 static void 323 vmm_resume(void) 324 { 325 VMM_RESUME(); 326 } 327 328 static int 329 vmm_init(void) 330 { 331 int error; 332 333 vmm_host_state_init(); 334 335 vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn)); 336 if (vmm_ipinum < 0) 337 vmm_ipinum = IPI_AST; 338 339 error = vmm_mem_init(); 340 if (error) 341 return (error); 342 343 if (vmm_is_intel()) 344 ops = &vmm_ops_intel; 345 else if (vmm_is_amd()) 346 ops = &vmm_ops_amd; 347 else 348 return (ENXIO); 349 350 vmm_resume_p = vmm_resume; 351 352 return (VMM_INIT(vmm_ipinum)); 353 } 354 355 static int 356 vmm_handler(module_t mod, int what, void *arg) 357 { 358 int error; 359 360 switch (what) { 361 case MOD_LOAD: 362 vmmdev_init(); 363 if (vmm_force_iommu || ppt_avail_devices() > 0) 364 iommu_init(); 365 error = vmm_init(); 366 if (error == 0) 367 vmm_initialized = 1; 368 break; 369 case MOD_UNLOAD: 370 error = vmmdev_cleanup(); 371 if (error == 0) { 372 vmm_resume_p = NULL; 373 iommu_cleanup(); 374 if (vmm_ipinum != IPI_AST) 375 lapic_ipi_free(vmm_ipinum); 376 error = VMM_CLEANUP(); 377 /* 378 * Something bad happened - prevent new 379 * VMs from being created 380 */ 381 if (error) 382 vmm_initialized = 0; 383 } 384 break; 385 default: 386 error = 0; 387 break; 388 } 389 return (error); 390 } 391 392 static moduledata_t vmm_kmod = { 393 "vmm", 394 vmm_handler, 395 NULL 396 }; 397 398 /* 399 * vmm initialization has the following dependencies: 400 * 401 * - iommu initialization must happen after the pci passthru driver has had 402 * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). 403 * 404 * - VT-x initialization requires smp_rendezvous() and therefore must happen 405 * after SMP is fully functional (after SI_SUB_SMP). 406 */ 407 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); 408 MODULE_VERSION(vmm, 1); 409 410 static void 411 vm_init(struct vm *vm, bool create) 412 { 413 int i; 414 415 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); 416 vm->iommu = NULL; 417 vm->vioapic = vioapic_init(vm); 418 vm->vhpet = vhpet_init(vm); 419 vm->vatpic = vatpic_init(vm); 420 vm->vatpit = vatpit_init(vm); 421 vm->vpmtmr = vpmtmr_init(vm); 422 if (create) 423 vm->vrtc = vrtc_init(vm); 424 425 CPU_ZERO(&vm->active_cpus); 426 427 vm->suspend = 0; 428 CPU_ZERO(&vm->suspended_cpus); 429 430 for (i = 0; i < VM_MAXCPU; i++) 431 vcpu_init(vm, i, create); 432 } 433 434 int 435 vm_create(const char *name, struct vm **retvm) 436 { 437 struct vm *vm; 438 struct vmspace *vmspace; 439 440 /* 441 * If vmm.ko could not be successfully initialized then don't attempt 442 * to create the virtual machine. 
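 *
 * A rough usage sketch, assuming the libvmmapi wrapper names
 * vm_create()/vm_open() (the error handling below is illustrative only):
 * a userspace consumer such as bhyve(8) normally reaches this function
 * through those wrappers rather than calling into the kernel directly.
 *
 *	#include <err.h>
 *	#include <vmmapi.h>
 *
 *	struct vmctx *ctx;
 *
 *	if (vm_create("freebsd-guest") != 0)
 *		err(1, "vm_create");
 *	ctx = vm_open("freebsd-guest");
 *	if (ctx == NULL)
 *		err(1, "vm_open");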
443 */ 444 if (!vmm_initialized) 445 return (ENXIO); 446 447 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) 448 return (EINVAL); 449 450 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); 451 if (vmspace == NULL) 452 return (ENOMEM); 453 454 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 455 strcpy(vm->name, name); 456 vm->vmspace = vmspace; 457 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); 458 459 vm_init(vm, true); 460 461 *retvm = vm; 462 return (0); 463 } 464 465 static void 466 vm_cleanup(struct vm *vm, bool destroy) 467 { 468 struct mem_map *mm; 469 int i; 470 471 ppt_unassign_all(vm); 472 473 if (vm->iommu != NULL) 474 iommu_destroy_domain(vm->iommu); 475 476 if (destroy) 477 vrtc_cleanup(vm->vrtc); 478 else 479 vrtc_reset(vm->vrtc); 480 vpmtmr_cleanup(vm->vpmtmr); 481 vatpit_cleanup(vm->vatpit); 482 vhpet_cleanup(vm->vhpet); 483 vatpic_cleanup(vm->vatpic); 484 vioapic_cleanup(vm->vioapic); 485 486 for (i = 0; i < VM_MAXCPU; i++) 487 vcpu_cleanup(vm, i, destroy); 488 489 VMCLEANUP(vm->cookie); 490 491 /* 492 * System memory is removed from the guest address space only when 493 * the VM is destroyed. This is because the mapping remains the same 494 * across VM reset. 495 * 496 * Device memory can be relocated by the guest (e.g. using PCI BARs) 497 * so those mappings are removed on a VM reset. 498 */ 499 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 500 mm = &vm->mem_maps[i]; 501 if (destroy || !sysmem_mapping(vm, mm)) 502 vm_free_memmap(vm, i); 503 } 504 505 if (destroy) { 506 for (i = 0; i < VM_MAX_MEMSEGS; i++) 507 vm_free_memseg(vm, i); 508 509 VMSPACE_FREE(vm->vmspace); 510 vm->vmspace = NULL; 511 } 512 } 513 514 void 515 vm_destroy(struct vm *vm) 516 { 517 vm_cleanup(vm, true); 518 free(vm, M_VM); 519 } 520 521 int 522 vm_reinit(struct vm *vm) 523 { 524 int error; 525 526 /* 527 * A virtual machine can be reset only if all vcpus are suspended. 528 */ 529 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 530 vm_cleanup(vm, false); 531 vm_init(vm, false); 532 error = 0; 533 } else { 534 error = EBUSY; 535 } 536 537 return (error); 538 } 539 540 const char * 541 vm_name(struct vm *vm) 542 { 543 return (vm->name); 544 } 545 546 int 547 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 548 { 549 vm_object_t obj; 550 551 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 552 return (ENOMEM); 553 else 554 return (0); 555 } 556 557 int 558 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 559 { 560 561 vmm_mmio_free(vm->vmspace, gpa, len); 562 return (0); 563 } 564 565 /* 566 * Return 'true' if 'gpa' is allocated in the guest address space. 567 * 568 * This function is called in the context of a running vcpu which acts as 569 * an implicit lock on 'vm->mem_maps[]'. 
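 *
 * A rough sketch of a typical call site, running on behalf of the faulting
 * vcpu (the helper names are made up for illustration):
 *
 *	if (vm_mem_allocated(vm, vcpuid, gpa))
 *		rv = handle_backed_gpa(vm, vcpuid, gpa);
 *	else
 *		rv = reflect_unbacked_gpa_to_userspace(vm, vcpuid, gpa);
 *
 * The INVARIANTS block below enforces the calling convention by asserting
 * that the vcpu is VCPU_RUNNING on the current host cpu.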
570 */ 571 bool 572 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) 573 { 574 struct mem_map *mm; 575 int i; 576 577 #ifdef INVARIANTS 578 int hostcpu, state; 579 state = vcpu_get_state(vm, vcpuid, &hostcpu); 580 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, 581 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); 582 #endif 583 584 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 585 mm = &vm->mem_maps[i]; 586 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) 587 return (true); /* 'gpa' is sysmem or devmem */ 588 } 589 590 if (ppt_is_mmio(vm, gpa)) 591 return (true); /* 'gpa' is pci passthru mmio */ 592 593 return (false); 594 } 595 596 int 597 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) 598 { 599 struct mem_seg *seg; 600 vm_object_t obj; 601 602 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 603 return (EINVAL); 604 605 if (len == 0 || (len & PAGE_MASK)) 606 return (EINVAL); 607 608 seg = &vm->mem_segs[ident]; 609 if (seg->object != NULL) { 610 if (seg->len == len && seg->sysmem == sysmem) 611 return (EEXIST); 612 else 613 return (EINVAL); 614 } 615 616 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); 617 if (obj == NULL) 618 return (ENOMEM); 619 620 seg->len = len; 621 seg->object = obj; 622 seg->sysmem = sysmem; 623 return (0); 624 } 625 626 int 627 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, 628 vm_object_t *objptr) 629 { 630 struct mem_seg *seg; 631 632 if (ident < 0 || ident >= VM_MAX_MEMSEGS) 633 return (EINVAL); 634 635 seg = &vm->mem_segs[ident]; 636 if (len) 637 *len = seg->len; 638 if (sysmem) 639 *sysmem = seg->sysmem; 640 if (objptr) 641 *objptr = seg->object; 642 return (0); 643 } 644 645 void 646 vm_free_memseg(struct vm *vm, int ident) 647 { 648 struct mem_seg *seg; 649 650 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, 651 ("%s: invalid memseg ident %d", __func__, ident)); 652 653 seg = &vm->mem_segs[ident]; 654 if (seg->object != NULL) { 655 vm_object_deallocate(seg->object); 656 bzero(seg, sizeof(struct mem_seg)); 657 } 658 } 659 660 int 661 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, 662 size_t len, int prot, int flags) 663 { 664 struct mem_seg *seg; 665 struct mem_map *m, *map; 666 vm_ooffset_t last; 667 int i, error; 668 669 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) 670 return (EINVAL); 671 672 if (flags & ~VM_MEMMAP_F_WIRED) 673 return (EINVAL); 674 675 if (segid < 0 || segid >= VM_MAX_MEMSEGS) 676 return (EINVAL); 677 678 seg = &vm->mem_segs[segid]; 679 if (seg->object == NULL) 680 return (EINVAL); 681 682 last = first + len; 683 if (first < 0 || first >= last || last > seg->len) 684 return (EINVAL); 685 686 if ((gpa | first | last) & PAGE_MASK) 687 return (EINVAL); 688 689 map = NULL; 690 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 691 m = &vm->mem_maps[i]; 692 if (m->len == 0) { 693 map = m; 694 break; 695 } 696 } 697 698 if (map == NULL) 699 return (ENOSPC); 700 701 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, 702 len, 0, VMFS_NO_SPACE, prot, prot, 0); 703 if (error != KERN_SUCCESS) 704 return (EFAULT); 705 706 vm_object_reference(seg->object); 707 708 if (flags & VM_MEMMAP_F_WIRED) { 709 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, 710 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 711 if (error != KERN_SUCCESS) { 712 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); 713 return (EFAULT); 714 } 715 } 716 717 map->gpa = gpa; 718 map->len = len; 719 map->segoff = first; 720 map->segid = segid; 721 map->prot = prot; 722 
map->flags = flags; 723 return (0); 724 } 725 726 int 727 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, 728 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 729 { 730 struct mem_map *mm, *mmnext; 731 int i; 732 733 mmnext = NULL; 734 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 735 mm = &vm->mem_maps[i]; 736 if (mm->len == 0 || mm->gpa < *gpa) 737 continue; 738 if (mmnext == NULL || mm->gpa < mmnext->gpa) 739 mmnext = mm; 740 } 741 742 if (mmnext != NULL) { 743 *gpa = mmnext->gpa; 744 if (segid) 745 *segid = mmnext->segid; 746 if (segoff) 747 *segoff = mmnext->segoff; 748 if (len) 749 *len = mmnext->len; 750 if (prot) 751 *prot = mmnext->prot; 752 if (flags) 753 *flags = mmnext->flags; 754 return (0); 755 } else { 756 return (ENOENT); 757 } 758 } 759 760 static void 761 vm_free_memmap(struct vm *vm, int ident) 762 { 763 struct mem_map *mm; 764 int error; 765 766 mm = &vm->mem_maps[ident]; 767 if (mm->len) { 768 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, 769 mm->gpa + mm->len); 770 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", 771 __func__, error)); 772 bzero(mm, sizeof(struct mem_map)); 773 } 774 } 775 776 static __inline bool 777 sysmem_mapping(struct vm *vm, struct mem_map *mm) 778 { 779 780 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) 781 return (true); 782 else 783 return (false); 784 } 785 786 static vm_paddr_t 787 sysmem_maxaddr(struct vm *vm) 788 { 789 struct mem_map *mm; 790 vm_paddr_t maxaddr; 791 int i; 792 793 maxaddr = 0; 794 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 795 mm = &vm->mem_maps[i]; 796 if (sysmem_mapping(vm, mm)) { 797 if (maxaddr < mm->gpa + mm->len) 798 maxaddr = mm->gpa + mm->len; 799 } 800 } 801 return (maxaddr); 802 } 803 804 static void 805 vm_iommu_modify(struct vm *vm, boolean_t map) 806 { 807 int i, sz; 808 vm_paddr_t gpa, hpa; 809 struct mem_map *mm; 810 void *vp, *cookie, *host_domain; 811 812 sz = PAGE_SIZE; 813 host_domain = iommu_host_domain(); 814 815 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 816 mm = &vm->mem_maps[i]; 817 if (!sysmem_mapping(vm, mm)) 818 continue; 819 820 if (map) { 821 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, 822 ("iommu map found invalid memmap %#lx/%#lx/%#x", 823 mm->gpa, mm->len, mm->flags)); 824 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) 825 continue; 826 mm->flags |= VM_MEMMAP_F_IOMMU; 827 } else { 828 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) 829 continue; 830 mm->flags &= ~VM_MEMMAP_F_IOMMU; 831 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, 832 ("iommu unmap found invalid memmap %#lx/%#lx/%#x", 833 mm->gpa, mm->len, mm->flags)); 834 } 835 836 gpa = mm->gpa; 837 while (gpa < mm->gpa + mm->len) { 838 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, 839 &cookie); 840 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", 841 vm_name(vm), gpa)); 842 843 vm_gpa_release(cookie); 844 845 hpa = DMAP_TO_PHYS((uintptr_t)vp); 846 if (map) { 847 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 848 iommu_remove_mapping(host_domain, hpa, sz); 849 } else { 850 iommu_remove_mapping(vm->iommu, gpa, sz); 851 iommu_create_mapping(host_domain, hpa, hpa, sz); 852 } 853 854 gpa += PAGE_SIZE; 855 } 856 } 857 858 /* 859 * Invalidate the cached translations associated with the domain 860 * from which pages were removed. 
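 *
 * In short, the per-page loop above keeps the two domains complementary:
 *
 *	map:	iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 *		iommu_remove_mapping(host_domain, hpa, sz);
 *		-> then flush the host domain's IOTLB
 *
 *	unmap:	iommu_remove_mapping(vm->iommu, gpa, sz);
 *		iommu_create_mapping(host_domain, hpa, hpa, sz);
 *		-> then flush the VM domain's IOTLB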
861 */ 862 if (map) 863 iommu_invalidate_tlb(host_domain); 864 else 865 iommu_invalidate_tlb(vm->iommu); 866 } 867 868 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) 869 #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) 870 871 int 872 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) 873 { 874 int error; 875 876 error = ppt_unassign_device(vm, bus, slot, func); 877 if (error) 878 return (error); 879 880 if (ppt_assigned_devices(vm) == 0) 881 vm_iommu_unmap(vm); 882 883 return (0); 884 } 885 886 int 887 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) 888 { 889 int error; 890 vm_paddr_t maxaddr; 891 892 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ 893 if (ppt_assigned_devices(vm) == 0) { 894 KASSERT(vm->iommu == NULL, 895 ("vm_assign_pptdev: iommu must be NULL")); 896 maxaddr = sysmem_maxaddr(vm); 897 vm->iommu = iommu_create_domain(maxaddr); 898 vm_iommu_map(vm); 899 } 900 901 error = ppt_assign_device(vm, bus, slot, func); 902 return (error); 903 } 904 905 void * 906 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, 907 void **cookie) 908 { 909 int i, count, pageoff; 910 struct mem_map *mm; 911 vm_page_t m; 912 #ifdef INVARIANTS 913 /* 914 * All vcpus are frozen by ioctls that modify the memory map 915 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is 916 * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 917 */ 918 int state; 919 KASSERT(vcpuid >= -1 || vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d", 920 __func__, vcpuid)); 921 for (i = 0; i < VM_MAXCPU; i++) { 922 if (vcpuid != -1 && vcpuid != i) 923 continue; 924 state = vcpu_get_state(vm, i, NULL); 925 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", 926 __func__, state)); 927 } 928 #endif 929 pageoff = gpa & PAGE_MASK; 930 if (len > PAGE_SIZE - pageoff) 931 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); 932 933 count = 0; 934 for (i = 0; i < VM_MAX_MEMMAPS; i++) { 935 mm = &vm->mem_maps[i]; 936 if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && 937 gpa < mm->gpa + mm->len) { 938 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, 939 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); 940 break; 941 } 942 } 943 944 if (count == 1) { 945 *cookie = m; 946 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); 947 } else { 948 *cookie = NULL; 949 return (NULL); 950 } 951 } 952 953 void 954 vm_gpa_release(void *cookie) 955 { 956 vm_page_t m = cookie; 957 958 vm_page_lock(m); 959 vm_page_unhold(m); 960 vm_page_unlock(m); 961 } 962 963 int 964 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 965 { 966 967 if (vcpu < 0 || vcpu >= VM_MAXCPU) 968 return (EINVAL); 969 970 if (reg >= VM_REG_LAST) 971 return (EINVAL); 972 973 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 974 } 975 976 int 977 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) 978 { 979 struct vcpu *vcpu; 980 int error; 981 982 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 983 return (EINVAL); 984 985 if (reg >= VM_REG_LAST) 986 return (EINVAL); 987 988 error = VMSETREG(vm->cookie, vcpuid, reg, val); 989 if (error || reg != VM_REG_GUEST_RIP) 990 return (error); 991 992 /* Set 'nextrip' to match the value of %rip */ 993 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); 994 vcpu = &vm->vcpu[vcpuid]; 995 vcpu->nextrip = val; 996 return (0); 997 } 998 999 static boolean_t 1000 is_descriptor_table(int reg) 1001 { 1002 1003 switch (reg) { 1004 case VM_REG_GUEST_IDTR: 1005 case VM_REG_GUEST_GDTR: 1006 return (TRUE); 
1007 default: 1008 return (FALSE); 1009 } 1010 } 1011 1012 static boolean_t 1013 is_segment_register(int reg) 1014 { 1015 1016 switch (reg) { 1017 case VM_REG_GUEST_ES: 1018 case VM_REG_GUEST_CS: 1019 case VM_REG_GUEST_SS: 1020 case VM_REG_GUEST_DS: 1021 case VM_REG_GUEST_FS: 1022 case VM_REG_GUEST_GS: 1023 case VM_REG_GUEST_TR: 1024 case VM_REG_GUEST_LDTR: 1025 return (TRUE); 1026 default: 1027 return (FALSE); 1028 } 1029 } 1030 1031 int 1032 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, 1033 struct seg_desc *desc) 1034 { 1035 1036 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1037 return (EINVAL); 1038 1039 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1040 return (EINVAL); 1041 1042 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 1043 } 1044 1045 int 1046 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, 1047 struct seg_desc *desc) 1048 { 1049 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1050 return (EINVAL); 1051 1052 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 1053 return (EINVAL); 1054 1055 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 1056 } 1057 1058 static void 1059 restore_guest_fpustate(struct vcpu *vcpu) 1060 { 1061 1062 /* flush host state to the pcb */ 1063 fpuexit(curthread); 1064 1065 /* restore guest FPU state */ 1066 fpu_stop_emulating(); 1067 fpurestore(vcpu->guestfpu); 1068 1069 /* restore guest XCR0 if XSAVE is enabled in the host */ 1070 if (rcr4() & CR4_XSAVE) 1071 load_xcr(0, vcpu->guest_xcr0); 1072 1073 /* 1074 * The FPU is now "dirty" with the guest's state so turn on emulation 1075 * to trap any access to the FPU by the host. 1076 */ 1077 fpu_start_emulating(); 1078 } 1079 1080 static void 1081 save_guest_fpustate(struct vcpu *vcpu) 1082 { 1083 1084 if ((rcr0() & CR0_TS) == 0) 1085 panic("fpu emulation not enabled in host!"); 1086 1087 /* save guest XCR0 and restore host XCR0 */ 1088 if (rcr4() & CR4_XSAVE) { 1089 vcpu->guest_xcr0 = rxcr(0); 1090 load_xcr(0, vmm_get_host_xcr0()); 1091 } 1092 1093 /* save guest FPU state */ 1094 fpu_stop_emulating(); 1095 fpusave(vcpu->guestfpu); 1096 fpu_start_emulating(); 1097 } 1098 1099 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 1100 1101 static int 1102 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1103 bool from_idle) 1104 { 1105 struct vcpu *vcpu; 1106 int error; 1107 1108 vcpu = &vm->vcpu[vcpuid]; 1109 vcpu_assert_locked(vcpu); 1110 1111 /* 1112 * State transitions from the vmmdev_ioctl() must always begin from 1113 * the VCPU_IDLE state. This guarantees that there is only a single 1114 * ioctl() operating on a vcpu at any point. 
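 *
 * A rough sketch of the calling pattern on the vmmdev_ioctl() side:
 *
 *	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true);
 *	if (error == 0) {
 *		... operate on the frozen vcpu ...
 *		(void)vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);
 *	}
 *
 * Because leaving VCPU_IDLE is serialized here, a second ioctl on the same
 * vcpu sleeps in the loop below until the first one returns the vcpu to
 * VCPU_IDLE and wakes it up.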
1115 */ 1116 if (from_idle) { 1117 while (vcpu->state != VCPU_IDLE) { 1118 vcpu->reqidle = 1; 1119 vcpu_notify_event_locked(vcpu, false); 1120 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " 1121 "idle requested", vcpu_state2str(vcpu->state)); 1122 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); 1123 } 1124 } else { 1125 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 1126 "vcpu idle state")); 1127 } 1128 1129 if (vcpu->state == VCPU_RUNNING) { 1130 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 1131 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 1132 } else { 1133 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 1134 "vcpu that is not running", vcpu->hostcpu)); 1135 } 1136 1137 /* 1138 * The following state transitions are allowed: 1139 * IDLE -> FROZEN -> IDLE 1140 * FROZEN -> RUNNING -> FROZEN 1141 * FROZEN -> SLEEPING -> FROZEN 1142 */ 1143 switch (vcpu->state) { 1144 case VCPU_IDLE: 1145 case VCPU_RUNNING: 1146 case VCPU_SLEEPING: 1147 error = (newstate != VCPU_FROZEN); 1148 break; 1149 case VCPU_FROZEN: 1150 error = (newstate == VCPU_FROZEN); 1151 break; 1152 default: 1153 error = 1; 1154 break; 1155 } 1156 1157 if (error) 1158 return (EBUSY); 1159 1160 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", 1161 vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); 1162 1163 vcpu->state = newstate; 1164 if (newstate == VCPU_RUNNING) 1165 vcpu->hostcpu = curcpu; 1166 else 1167 vcpu->hostcpu = NOCPU; 1168 1169 if (newstate == VCPU_IDLE) 1170 wakeup(&vcpu->state); 1171 1172 return (0); 1173 } 1174 1175 static void 1176 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1177 { 1178 int error; 1179 1180 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1181 panic("Error %d setting state to %d\n", error, newstate); 1182 } 1183 1184 static void 1185 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1186 { 1187 int error; 1188 1189 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) 1190 panic("Error %d setting state to %d", error, newstate); 1191 } 1192 1193 static void 1194 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) 1195 { 1196 1197 KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); 1198 1199 /* 1200 * Update 'rendezvous_func' and execute a write memory barrier to 1201 * ensure that it is visible across all host cpus. This is not needed 1202 * for correctness but it does ensure that all the vcpus will notice 1203 * that the rendezvous is requested immediately. 
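 *
 * For context, the initiating side (vm_smp_rendezvous(), elsewhere in this
 * file; the outline below is approximate) does roughly the following before
 * the vcpus converge in vm_handle_rendezvous():
 *
 *	mtx_lock(&vm->rendezvous_mtx);
 *	... wait out any rendezvous already in progress ...
 *	vm->rendezvous_req_cpus = dest;
 *	CPU_ZERO(&vm->rendezvous_done_cpus);
 *	vm->rendezvous_arg = arg;
 *	vm_set_rendezvous_func(vm, func);
 *	mtx_unlock(&vm->rendezvous_mtx);
 *	... notify the target vcpus so they notice rendezvous_func != NULL ...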
1204 */ 1205 vm->rendezvous_func = func; 1206 wmb(); 1207 } 1208 1209 #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ 1210 do { \ 1211 if (vcpuid >= 0) \ 1212 VCPU_CTR0(vm, vcpuid, fmt); \ 1213 else \ 1214 VM_CTR0(vm, fmt); \ 1215 } while (0) 1216 1217 static void 1218 vm_handle_rendezvous(struct vm *vm, int vcpuid) 1219 { 1220 1221 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), 1222 ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); 1223 1224 mtx_lock(&vm->rendezvous_mtx); 1225 while (vm->rendezvous_func != NULL) { 1226 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ 1227 CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); 1228 1229 if (vcpuid != -1 && 1230 CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && 1231 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { 1232 VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); 1233 (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); 1234 CPU_SET(vcpuid, &vm->rendezvous_done_cpus); 1235 } 1236 if (CPU_CMP(&vm->rendezvous_req_cpus, 1237 &vm->rendezvous_done_cpus) == 0) { 1238 VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); 1239 vm_set_rendezvous_func(vm, NULL); 1240 wakeup(&vm->rendezvous_func); 1241 break; 1242 } 1243 RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); 1244 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, 1245 "vmrndv", 0); 1246 } 1247 mtx_unlock(&vm->rendezvous_mtx); 1248 } 1249 1250 /* 1251 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1252 */ 1253 static int 1254 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) 1255 { 1256 struct vcpu *vcpu; 1257 const char *wmesg; 1258 int t, vcpu_halted, vm_halted; 1259 1260 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1261 1262 vcpu = &vm->vcpu[vcpuid]; 1263 vcpu_halted = 0; 1264 vm_halted = 0; 1265 1266 vcpu_lock(vcpu); 1267 while (1) { 1268 /* 1269 * Do a final check for pending NMI or interrupts before 1270 * really putting this thread to sleep. Also check for 1271 * software events that would cause this vcpu to wakeup. 1272 * 1273 * These interrupts/events could have happened after the 1274 * vcpu returned from VMRUN() and before it acquired the 1275 * vcpu lock above. 1276 */ 1277 if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle) 1278 break; 1279 if (vm_nmi_pending(vm, vcpuid)) 1280 break; 1281 if (!intr_disabled) { 1282 if (vm_extint_pending(vm, vcpuid) || 1283 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1284 break; 1285 } 1286 } 1287 1288 /* Don't go to sleep if the vcpu thread needs to yield */ 1289 if (vcpu_should_yield(vm, vcpuid)) 1290 break; 1291 1292 /* 1293 * Some Linux guests implement "halt" by having all vcpus 1294 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1295 * track of the vcpus that have entered this state. When all 1296 * vcpus enter the halted state the virtual machine is halted. 1297 */ 1298 if (intr_disabled) { 1299 wmesg = "vmhalt"; 1300 VCPU_CTR0(vm, vcpuid, "Halted"); 1301 if (!vcpu_halted && halt_detection_enabled) { 1302 vcpu_halted = 1; 1303 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1304 } 1305 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1306 vm_halted = 1; 1307 break; 1308 } 1309 } else { 1310 wmesg = "vmidle"; 1311 } 1312 1313 t = ticks; 1314 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1315 /* 1316 * XXX msleep_spin() cannot be interrupted by signals so 1317 * wake up periodically to check pending signals. 
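 *
 * The wakeup side of this sleep is vcpu_notify_event_locked() (declared
 * near the top of the file); in rough outline it does
 *
 *	if (vcpu->state == VCPU_SLEEPING)
 *		wakeup_one(vcpu);
 *
 * for a vcpu that is not currently running, so it shares the 'vcpu' wait
 * channel with the msleep_spin() below.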
1318 */ 1319 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); 1320 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1321 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); 1322 } 1323 1324 if (vcpu_halted) 1325 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1326 1327 vcpu_unlock(vcpu); 1328 1329 if (vm_halted) 1330 vm_suspend(vm, VM_SUSPEND_HALT); 1331 1332 return (0); 1333 } 1334 1335 static int 1336 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) 1337 { 1338 int rv, ftype; 1339 struct vm_map *map; 1340 struct vcpu *vcpu; 1341 struct vm_exit *vme; 1342 1343 vcpu = &vm->vcpu[vcpuid]; 1344 vme = &vcpu->exitinfo; 1345 1346 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1347 __func__, vme->inst_length)); 1348 1349 ftype = vme->u.paging.fault_type; 1350 KASSERT(ftype == VM_PROT_READ || 1351 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, 1352 ("vm_handle_paging: invalid fault_type %d", ftype)); 1353 1354 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { 1355 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), 1356 vme->u.paging.gpa, ftype); 1357 if (rv == 0) { 1358 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", 1359 ftype == VM_PROT_READ ? "accessed" : "dirty", 1360 vme->u.paging.gpa); 1361 goto done; 1362 } 1363 } 1364 1365 map = &vm->vmspace->vm_map; 1366 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); 1367 1368 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " 1369 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1370 1371 if (rv != KERN_SUCCESS) 1372 return (EFAULT); 1373 done: 1374 return (0); 1375 } 1376 1377 static int 1378 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) 1379 { 1380 struct vie *vie; 1381 struct vcpu *vcpu; 1382 struct vm_exit *vme; 1383 uint64_t gla, gpa, cs_base; 1384 struct vm_guest_paging *paging; 1385 mem_region_read_t mread; 1386 mem_region_write_t mwrite; 1387 enum vm_cpu_mode cpu_mode; 1388 int cs_d, error, fault; 1389 1390 vcpu = &vm->vcpu[vcpuid]; 1391 vme = &vcpu->exitinfo; 1392 1393 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", 1394 __func__, vme->inst_length)); 1395 1396 gla = vme->u.inst_emul.gla; 1397 gpa = vme->u.inst_emul.gpa; 1398 cs_base = vme->u.inst_emul.cs_base; 1399 cs_d = vme->u.inst_emul.cs_d; 1400 vie = &vme->u.inst_emul.vie; 1401 paging = &vme->u.inst_emul.paging; 1402 cpu_mode = paging->cpu_mode; 1403 1404 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); 1405 1406 /* Fetch, decode and emulate the faulting instruction */ 1407 if (vie->num_valid == 0) { 1408 error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + 1409 cs_base, VIE_INST_SIZE, vie, &fault); 1410 } else { 1411 /* 1412 * The instruction bytes have already been copied into 'vie' 1413 */ 1414 error = fault = 0; 1415 } 1416 if (error || fault) 1417 return (error); 1418 1419 if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { 1420 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", 1421 vme->rip + cs_base); 1422 *retu = true; /* dump instruction bytes in userspace */ 1423 return (0); 1424 } 1425 1426 /* 1427 * Update 'nextrip' based on the length of the emulated instruction. 
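 *
 * Worked example: on an instruction-emulation exit 'inst_length' is zero
 * (see the KASSERT above), so vm_run() left 'nextrip' equal to vme->rip.
 * If the access at rip 0x1000 decodes to a 3-byte instruction:
 *
 *	vme->inst_length = vie->num_processed;		(3)
 *	vcpu->nextrip += vie->num_processed;		(0x1000 -> 0x1003)
 *
 * and the next VMRUN() resumes the guest past the emulated instruction.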
1428 */ 1429 vme->inst_length = vie->num_processed; 1430 vcpu->nextrip += vie->num_processed; 1431 VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " 1432 "decoding", vcpu->nextrip); 1433 1434 /* return to userland unless this is an in-kernel emulated device */ 1435 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { 1436 mread = lapic_mmio_read; 1437 mwrite = lapic_mmio_write; 1438 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { 1439 mread = vioapic_mmio_read; 1440 mwrite = vioapic_mmio_write; 1441 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { 1442 mread = vhpet_mmio_read; 1443 mwrite = vhpet_mmio_write; 1444 } else { 1445 *retu = true; 1446 return (0); 1447 } 1448 1449 error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, 1450 mread, mwrite, retu); 1451 1452 return (error); 1453 } 1454 1455 static int 1456 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) 1457 { 1458 int i, done; 1459 struct vcpu *vcpu; 1460 1461 done = 0; 1462 vcpu = &vm->vcpu[vcpuid]; 1463 1464 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 1465 1466 /* 1467 * Wait until all 'active_cpus' have suspended themselves. 1468 * 1469 * Since a VM may be suspended at any time including when one or 1470 * more vcpus are doing a rendezvous we need to call the rendezvous 1471 * handler while we are waiting to prevent a deadlock. 1472 */ 1473 vcpu_lock(vcpu); 1474 while (1) { 1475 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 1476 VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); 1477 break; 1478 } 1479 1480 if (vm->rendezvous_func == NULL) { 1481 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); 1482 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); 1483 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); 1484 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); 1485 } else { 1486 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); 1487 vcpu_unlock(vcpu); 1488 vm_handle_rendezvous(vm, vcpuid); 1489 vcpu_lock(vcpu); 1490 } 1491 } 1492 vcpu_unlock(vcpu); 1493 1494 /* 1495 * Wakeup the other sleeping vcpus and return to userspace. 1496 */ 1497 for (i = 0; i < VM_MAXCPU; i++) { 1498 if (CPU_ISSET(i, &vm->suspended_cpus)) { 1499 vcpu_notify_event(vm, i, false); 1500 } 1501 } 1502 1503 *retu = true; 1504 return (0); 1505 } 1506 1507 static int 1508 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) 1509 { 1510 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 1511 1512 vcpu_lock(vcpu); 1513 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); 1514 vcpu->reqidle = 0; 1515 vcpu_unlock(vcpu); 1516 *retu = true; 1517 return (0); 1518 } 1519 1520 int 1521 vm_suspend(struct vm *vm, enum vm_suspend_how how) 1522 { 1523 int i; 1524 1525 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) 1526 return (EINVAL); 1527 1528 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { 1529 VM_CTR2(vm, "virtual machine already suspended %d/%d", 1530 vm->suspend, how); 1531 return (EALREADY); 1532 } 1533 1534 VM_CTR1(vm, "virtual machine successfully suspended %d", how); 1535 1536 /* 1537 * Notify all active vcpus that they are now suspended. 
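 *
 * A rough outline of how a suspend request propagates from here:
 *
 *	vm_suspend(vm, how)
 *	  -> vm->suspend = how; every active vcpu is notified
 *	  -> each vcpu sees '*evinfo.sptr' set and leaves VMRUN() via
 *	     vm_exit_suspended()
 *	  -> vm_handle_suspend() parks each vcpu until 'suspended_cpus'
 *	     covers all of 'active_cpus'
 *	  -> userspace receives VM_EXITCODE_SUSPENDED with u.suspended.how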
1538 */ 1539 for (i = 0; i < VM_MAXCPU; i++) { 1540 if (CPU_ISSET(i, &vm->active_cpus)) 1541 vcpu_notify_event(vm, i, false); 1542 } 1543 1544 return (0); 1545 } 1546 1547 void 1548 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) 1549 { 1550 struct vm_exit *vmexit; 1551 1552 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, 1553 ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); 1554 1555 vmexit = vm_exitinfo(vm, vcpuid); 1556 vmexit->rip = rip; 1557 vmexit->inst_length = 0; 1558 vmexit->exitcode = VM_EXITCODE_SUSPENDED; 1559 vmexit->u.suspended.how = vm->suspend; 1560 } 1561 1562 void 1563 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) 1564 { 1565 struct vm_exit *vmexit; 1566 1567 KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); 1568 1569 vmexit = vm_exitinfo(vm, vcpuid); 1570 vmexit->rip = rip; 1571 vmexit->inst_length = 0; 1572 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 1573 vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); 1574 } 1575 1576 void 1577 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) 1578 { 1579 struct vm_exit *vmexit; 1580 1581 vmexit = vm_exitinfo(vm, vcpuid); 1582 vmexit->rip = rip; 1583 vmexit->inst_length = 0; 1584 vmexit->exitcode = VM_EXITCODE_REQIDLE; 1585 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 1586 } 1587 1588 void 1589 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) 1590 { 1591 struct vm_exit *vmexit; 1592 1593 vmexit = vm_exitinfo(vm, vcpuid); 1594 vmexit->rip = rip; 1595 vmexit->inst_length = 0; 1596 vmexit->exitcode = VM_EXITCODE_BOGUS; 1597 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 1598 } 1599 1600 int 1601 vm_run(struct vm *vm, struct vm_run *vmrun) 1602 { 1603 struct vm_eventinfo evinfo; 1604 int error, vcpuid; 1605 struct vcpu *vcpu; 1606 struct pcb *pcb; 1607 uint64_t tscval; 1608 struct vm_exit *vme; 1609 bool retu, intr_disabled; 1610 pmap_t pmap; 1611 1612 vcpuid = vmrun->cpuid; 1613 1614 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1615 return (EINVAL); 1616 1617 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 1618 return (EINVAL); 1619 1620 if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) 1621 return (EINVAL); 1622 1623 pmap = vmspace_pmap(vm->vmspace); 1624 vcpu = &vm->vcpu[vcpuid]; 1625 vme = &vcpu->exitinfo; 1626 evinfo.rptr = &vm->rendezvous_func; 1627 evinfo.sptr = &vm->suspend; 1628 evinfo.iptr = &vcpu->reqidle; 1629 restart: 1630 critical_enter(); 1631 1632 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1633 ("vm_run: absurd pm_active")); 1634 1635 tscval = rdtsc(); 1636 1637 pcb = PCPU_GET(curpcb); 1638 set_pcb_flags(pcb, PCB_FULL_IRET); 1639 1640 restore_guest_fpustate(vcpu); 1641 1642 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 1643 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); 1644 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 1645 1646 save_guest_fpustate(vcpu); 1647 1648 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 1649 1650 critical_exit(); 1651 1652 if (error == 0) { 1653 retu = false; 1654 vcpu->nextrip = vme->rip + vme->inst_length; 1655 switch (vme->exitcode) { 1656 case VM_EXITCODE_REQIDLE: 1657 error = vm_handle_reqidle(vm, vcpuid, &retu); 1658 break; 1659 case VM_EXITCODE_SUSPENDED: 1660 error = vm_handle_suspend(vm, vcpuid, &retu); 1661 break; 1662 case VM_EXITCODE_IOAPIC_EOI: 1663 vioapic_process_eoi(vm, vcpuid, 1664 vme->u.ioapic_eoi.vector); 1665 break; 1666 case VM_EXITCODE_RENDEZVOUS: 1667 vm_handle_rendezvous(vm, vcpuid); 1668 error = 0; 1669 break; 1670 case VM_EXITCODE_HLT: 1671 
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 1672 error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); 1673 break; 1674 case VM_EXITCODE_PAGING: 1675 error = vm_handle_paging(vm, vcpuid, &retu); 1676 break; 1677 case VM_EXITCODE_INST_EMUL: 1678 error = vm_handle_inst_emul(vm, vcpuid, &retu); 1679 break; 1680 case VM_EXITCODE_INOUT: 1681 case VM_EXITCODE_INOUT_STR: 1682 error = vm_handle_inout(vm, vcpuid, vme, &retu); 1683 break; 1684 case VM_EXITCODE_MONITOR: 1685 case VM_EXITCODE_MWAIT: 1686 vm_inject_ud(vm, vcpuid); 1687 break; 1688 default: 1689 retu = true; /* handled in userland */ 1690 break; 1691 } 1692 } 1693 1694 if (error == 0 && retu == false) 1695 goto restart; 1696 1697 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); 1698 1699 /* copy the exit information */ 1700 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); 1701 return (error); 1702 } 1703 1704 int 1705 vm_restart_instruction(void *arg, int vcpuid) 1706 { 1707 struct vm *vm; 1708 struct vcpu *vcpu; 1709 enum vcpu_state state; 1710 uint64_t rip; 1711 int error; 1712 1713 vm = arg; 1714 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1715 return (EINVAL); 1716 1717 vcpu = &vm->vcpu[vcpuid]; 1718 state = vcpu_get_state(vm, vcpuid, NULL); 1719 if (state == VCPU_RUNNING) { 1720 /* 1721 * When a vcpu is "running" the next instruction is determined 1722 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 1723 * Thus setting 'inst_length' to zero will cause the current 1724 * instruction to be restarted. 1725 */ 1726 vcpu->exitinfo.inst_length = 0; 1727 VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " 1728 "setting inst_length to zero", vcpu->exitinfo.rip); 1729 } else if (state == VCPU_FROZEN) { 1730 /* 1731 * When a vcpu is "frozen" it is outside the critical section 1732 * around VMRUN() and 'nextrip' points to the next instruction. 1733 * Thus instruction restart is achieved by setting 'nextrip' 1734 * to the vcpu's %rip. 
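 *
 * Worked example: a vcpu exits at %rip 0x2000 on a 2-byte instruction and
 * is then frozen, so vm_run() has set 'nextrip' to 0x2002.  Calling
 * vm_restart_instruction() in this state performs, in effect:
 *
 *	vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);	(rip == 0x2000)
 *	vcpu->nextrip = rip;
 *
 * so the same instruction is executed again on the next VMRUN().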
1735 */ 1736 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 1737 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 1738 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " 1739 "nextrip from %#lx to %#lx", vcpu->nextrip, rip); 1740 vcpu->nextrip = rip; 1741 } else { 1742 panic("%s: invalid state %d", __func__, state); 1743 } 1744 return (0); 1745 } 1746 1747 int 1748 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 1749 { 1750 struct vcpu *vcpu; 1751 int type, vector; 1752 1753 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1754 return (EINVAL); 1755 1756 vcpu = &vm->vcpu[vcpuid]; 1757 1758 if (info & VM_INTINFO_VALID) { 1759 type = info & VM_INTINFO_TYPE; 1760 vector = info & 0xff; 1761 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 1762 return (EINVAL); 1763 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 1764 return (EINVAL); 1765 if (info & VM_INTINFO_RSVD) 1766 return (EINVAL); 1767 } else { 1768 info = 0; 1769 } 1770 VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); 1771 vcpu->exitintinfo = info; 1772 return (0); 1773 } 1774 1775 enum exc_class { 1776 EXC_BENIGN, 1777 EXC_CONTRIBUTORY, 1778 EXC_PAGEFAULT 1779 }; 1780 1781 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 1782 1783 static enum exc_class 1784 exception_class(uint64_t info) 1785 { 1786 int type, vector; 1787 1788 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); 1789 type = info & VM_INTINFO_TYPE; 1790 vector = info & 0xff; 1791 1792 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 1793 switch (type) { 1794 case VM_INTINFO_HWINTR: 1795 case VM_INTINFO_SWINTR: 1796 case VM_INTINFO_NMI: 1797 return (EXC_BENIGN); 1798 default: 1799 /* 1800 * Hardware exception. 1801 * 1802 * SVM and VT-x use identical type values to represent NMI, 1803 * hardware interrupt and software interrupt. 1804 * 1805 * SVM uses type '3' for all exceptions. VT-x uses type '3' 1806 * for exceptions except #BP and #OF. #BP and #OF use a type 1807 * value of '5' or '6'. Therefore we don't check for explicit 1808 * values of 'type' to classify 'intinfo' into a hardware 1809 * exception. 1810 */ 1811 break; 1812 } 1813 1814 switch (vector) { 1815 case IDT_PF: 1816 case IDT_VE: 1817 return (EXC_PAGEFAULT); 1818 case IDT_DE: 1819 case IDT_TS: 1820 case IDT_NP: 1821 case IDT_SS: 1822 case IDT_GP: 1823 return (EXC_CONTRIBUTORY); 1824 default: 1825 return (EXC_BENIGN); 1826 } 1827 } 1828 1829 static int 1830 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 1831 uint64_t *retinfo) 1832 { 1833 enum exc_class exc1, exc2; 1834 int type1, vector1; 1835 1836 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); 1837 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); 1838 1839 /* 1840 * If an exception occurs while attempting to call the double-fault 1841 * handler the processor enters shutdown mode (aka triple fault). 
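 *
 * Worked example of the classification below: a #GP (info2) raised while
 * delivering a #PF (info1) gives
 *
 *	exception_class(info1) == EXC_PAGEFAULT
 *	exception_class(info2) == EXC_CONTRIBUTORY	(not EXC_BENIGN)
 *
 * which matches the second clause of the test, so the two are folded into
 * a #DF with a zero error code.  The reverse order (a #PF raised while
 * delivering a #GP) falls through to the "handle serially" case.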
1842 */ 1843 type1 = info1 & VM_INTINFO_TYPE; 1844 vector1 = info1 & 0xff; 1845 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 1846 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", 1847 info1, info2); 1848 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 1849 *retinfo = 0; 1850 return (0); 1851 } 1852 1853 /* 1854 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 1855 */ 1856 exc1 = exception_class(info1); 1857 exc2 = exception_class(info2); 1858 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 1859 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 1860 /* Convert nested fault into a double fault. */ 1861 *retinfo = IDT_DF; 1862 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1863 *retinfo |= VM_INTINFO_DEL_ERRCODE; 1864 } else { 1865 /* Handle exceptions serially */ 1866 *retinfo = info2; 1867 } 1868 return (1); 1869 } 1870 1871 static uint64_t 1872 vcpu_exception_intinfo(struct vcpu *vcpu) 1873 { 1874 uint64_t info = 0; 1875 1876 if (vcpu->exception_pending) { 1877 info = vcpu->exc_vector & 0xff; 1878 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1879 if (vcpu->exc_errcode_valid) { 1880 info |= VM_INTINFO_DEL_ERRCODE; 1881 info |= (uint64_t)vcpu->exc_errcode << 32; 1882 } 1883 } 1884 return (info); 1885 } 1886 1887 int 1888 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 1889 { 1890 struct vcpu *vcpu; 1891 uint64_t info1, info2; 1892 int valid; 1893 1894 KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); 1895 1896 vcpu = &vm->vcpu[vcpuid]; 1897 1898 info1 = vcpu->exitintinfo; 1899 vcpu->exitintinfo = 0; 1900 1901 info2 = 0; 1902 if (vcpu->exception_pending) { 1903 info2 = vcpu_exception_intinfo(vcpu); 1904 vcpu->exception_pending = 0; 1905 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", 1906 vcpu->exc_vector, info2); 1907 } 1908 1909 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 1910 valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 1911 } else if (info1 & VM_INTINFO_VALID) { 1912 *retinfo = info1; 1913 valid = 1; 1914 } else if (info2 & VM_INTINFO_VALID) { 1915 *retinfo = info2; 1916 valid = 1; 1917 } else { 1918 valid = 0; 1919 } 1920 1921 if (valid) { 1922 VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " 1923 "retinfo(%#lx)", __func__, info1, info2, *retinfo); 1924 } 1925 1926 return (valid); 1927 } 1928 1929 int 1930 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 1931 { 1932 struct vcpu *vcpu; 1933 1934 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1935 return (EINVAL); 1936 1937 vcpu = &vm->vcpu[vcpuid]; 1938 *info1 = vcpu->exitintinfo; 1939 *info2 = vcpu_exception_intinfo(vcpu); 1940 return (0); 1941 } 1942 1943 int 1944 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, 1945 uint32_t errcode, int restart_instruction) 1946 { 1947 struct vcpu *vcpu; 1948 uint64_t regval; 1949 int error; 1950 1951 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1952 return (EINVAL); 1953 1954 if (vector < 0 || vector >= 32) 1955 return (EINVAL); 1956 1957 /* 1958 * A double fault exception should never be injected directly into 1959 * the guest. It is a derived exception that results from specific 1960 * combinations of nested faults. 
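 *
 * In-kernel callers usually go through the static inline helpers in
 * <machine/vmm.h> (vm_inject_ud(), vm_inject_gp() and friends), which
 * funnel into vm_inject_fault() below roughly as in this sketch (the
 * exact argument values are an assumption):
 *
 *	static __inline void
 *	vm_inject_gp(void *vm, int vcpuid)
 *	{
 *		vm_inject_fault(vm, vcpuid, IDT_GP,
 *		    1, 0);		(errcode_valid = 1, errcode = 0)
 *	}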
1961 */ 1962 if (vector == IDT_DF) 1963 return (EINVAL); 1964 1965 vcpu = &vm->vcpu[vcpuid]; 1966 1967 if (vcpu->exception_pending) { 1968 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 1969 "pending exception %d", vector, vcpu->exc_vector); 1970 return (EBUSY); 1971 } 1972 1973 if (errcode_valid) { 1974 /* 1975 * Exceptions don't deliver an error code in real mode. 1976 */ 1977 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); 1978 KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); 1979 if (!(regval & CR0_PE)) 1980 errcode_valid = 0; 1981 } 1982 1983 /* 1984 * From section 26.6.1 "Interruptibility State" in Intel SDM: 1985 * 1986 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 1987 * one instruction or incurs an exception. 1988 */ 1989 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 1990 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 1991 __func__, error)); 1992 1993 if (restart_instruction) 1994 vm_restart_instruction(vm, vcpuid); 1995 1996 vcpu->exception_pending = 1; 1997 vcpu->exc_vector = vector; 1998 vcpu->exc_errcode = errcode; 1999 vcpu->exc_errcode_valid = errcode_valid; 2000 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); 2001 return (0); 2002 } 2003 2004 void 2005 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, 2006 int errcode) 2007 { 2008 struct vm *vm; 2009 int error, restart_instruction; 2010 2011 vm = vmarg; 2012 restart_instruction = 1; 2013 2014 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, 2015 errcode, restart_instruction); 2016 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 2017 } 2018 2019 void 2020 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) 2021 { 2022 struct vm *vm; 2023 int error; 2024 2025 vm = vmarg; 2026 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", 2027 error_code, cr2); 2028 2029 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 2030 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 2031 2032 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 2033 } 2034 2035 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2036 2037 int 2038 vm_inject_nmi(struct vm *vm, int vcpuid) 2039 { 2040 struct vcpu *vcpu; 2041 2042 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2043 return (EINVAL); 2044 2045 vcpu = &vm->vcpu[vcpuid]; 2046 2047 vcpu->nmi_pending = 1; 2048 vcpu_notify_event(vm, vcpuid, false); 2049 return (0); 2050 } 2051 2052 int 2053 vm_nmi_pending(struct vm *vm, int vcpuid) 2054 { 2055 struct vcpu *vcpu; 2056 2057 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2058 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2059 2060 vcpu = &vm->vcpu[vcpuid]; 2061 2062 return (vcpu->nmi_pending); 2063 } 2064 2065 void 2066 vm_nmi_clear(struct vm *vm, int vcpuid) 2067 { 2068 struct vcpu *vcpu; 2069 2070 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2071 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2072 2073 vcpu = &vm->vcpu[vcpuid]; 2074 2075 if (vcpu->nmi_pending == 0) 2076 panic("vm_nmi_clear: inconsistent nmi_pending state"); 2077 2078 vcpu->nmi_pending = 0; 2079 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2080 } 2081 2082 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2083 2084 int 2085 vm_inject_extint(struct vm *vm, int vcpuid) 2086 { 2087 struct vcpu *vcpu; 2088 2089 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2090 return (EINVAL); 2091 2092 vcpu = &vm->vcpu[vcpuid]; 2093 2094 vcpu->extint_pending = 1; 2095
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}
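/*
 * For illustration, a caller that wants VM exits on guest HLT would
 * typically go through the capability interface above rather than the
 * backend directly.  A minimal sketch, assuming VM_CAP_HALT_EXIT (one of
 * the vm_cap_type values from vmm.h) is the capability of interest:
 *
 *	if (vm_set_capability(vm, vcpuid, VM_CAP_HALT_EXIT, 1) != 0)
 *		printf("HLT exiting not supported by this backend\n");
 *
 * vm_get_capability() can be used beforehand to query the current setting.
 */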
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = kern_getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}
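/*
 * The tunables scanned above are normally set from loader.conf(5), one
 * space-separated bus/slot/func selector per device.  For example (the
 * selectors below are hypothetical):
 *
 *	pptdevs="2/0/0 5/0/0"
 *	pptdevs2="6/0/0"
 */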
void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, lapic_intr);
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
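/*
 * A rendezvous is initiated by publishing the target cpuset, the callback
 * and its argument under 'rendezvous_mtx' and then kicking every target
 * vcpu with vcpu_notify_event() so that it drops into the rendezvous
 * handler.  The initiator (a vcpu, or a host thread when vcpuid is -1)
 * participates as well by calling vm_handle_rendezvous() itself before
 * returning.
 */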
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

struct vpmtmr *
vm_pmtmr(struct vm *vm)
{

	return (vm->vpmtmr);
}

struct vrtc *
vm_rtc(struct vm *vm)
{

	return (vm->vrtc);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo, int *fault)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
		    copyinfo[idx].len, prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}
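/*
 * A minimal sketch of how the copy helpers above fit together, assuming
 * the guest access spans at most two pages and read access is wanted:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, VM_PROT_READ,
 *	    copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && !fault) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 *
 * A non-zero 'error' or 'fault' means the guest linear address could not
 * be translated and no copyinfo entries are left held.
 */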
/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
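/*
 * Note: stats registered with VMM_STAT_FUNC, such as the two above, are
 * expected to be refreshed on demand by the stat framework invoking the
 * supplied callback when statistics are copied out, so the resident and
 * wired counts are sampled at read time rather than maintained
 * incrementally.
 */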