/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	uint64_t	guest_xcr0;
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
	int		extint_pending;
	struct vm_exception exception;
	int		exception_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

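/*
 * Per virtual machine state; one instance is allocated in vm_create() for
 * each virtual machine.
 */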
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vatpic	*vatpic;	/* virtual atpic */
	struct vatpit	*vatpit;	/* virtual atpit */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	volatile cpuset_t active_cpus;

	struct mtx	rendezvous_mtx;
	cpuset_t	rendezvous_req_cpus;
	cpuset_t	rendezvous_done_cpus;
	void		*rendezvous_arg;
	vm_rendezvous_func_t rendezvous_func;

	int		suspend;
	volatile cpuset_t suspended_cpus;

	volatile cpuset_t halted_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
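 * Setting the 'hw.vmm.halt_detection' tunable below to 0 disables this
 * detection.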
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static void
vcpu_cleanup(struct vm *vm, int i)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		if (ppt_avail_devices() > 0)
			iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
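	 * Transitions that do not originate from an ioctl (from_idle == false)
	 * are required to find the vcpu already out of the idle state.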
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if (vcpuid >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)

static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		if (vcpuid != -1 &&
		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		}
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
			break;
		}
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vcpu *vcpu;
	const char *wmesg;
	int t, vcpu_halted, vm_halted;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending NMI or interrupts before
		 * really putting this thread to sleep. Also check for
		 * software events that would cause this vcpu to wakeup.
		 *
		 * These interrupts/events could have happened after the
		 * vcpu returned from VMRUN() and before it acquired the
		 * vcpu lock above.
		 */
		if (vm->rendezvous_func != NULL || vm->suspend)
			break;
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
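		 * The halt is effected below by suspending the VM with
		 * VM_SUSPEND_HALT.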
		 */
		if (intr_disabled) {
			wmesg = "vmhalt";
			VCPU_CTR0(vm, vcpuid, "Halted");
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		} else {
			wmesg = "vmidle";
		}

		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted)
		vm_suspend(vm, VM_SUSPEND_HALT);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	enum vie_cpu_mode cpu_mode;
	enum vie_paging_mode paging_mode;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	cpu_mode = vme->u.inst_emul.cpu_mode;
	paging_mode = vme->u.inst_emul.paging_mode;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3,
	    paging_mode, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}
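
/*
 * Handle an exit because the virtual machine is being suspended: record this
 * vcpu in 'suspended_cpus', wait until every active vcpu has done the same
 * (servicing any rendezvous in the meantime), then return to userspace.
 */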
static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
	int i, done;
	struct vcpu *vcpu;

	done = 0;
	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (1) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		if (vm->rendezvous_func == NULL) {
			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		} else {
			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
			vcpu_unlock(vcpu);
			vm_handle_rendezvous(vm, vcpuid);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i, false);
		}
	}

	*retu = true;
	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
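	 * Each vcpu is expected to exit with VM_EXITCODE_SUSPENDED (see
	 * vm_exit_suspended() below) and be handled by vm_handle_suspend().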
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm, i, false);
	}

	return (0);
}

void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;
	void *rptr, *sptr;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	rptr = &vm->rendezvous_func;
	sptr = &vm->suspend;
	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_SUSPENDED:
			error = vm_handle_suspend(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (exception->vector < 0 || exception->vector >= 32)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", exception->vector,
		    vcpu->exception.vector);
		return (EBUSY);
	}

	vcpu->exception_pending = 1;
	vcpu->exception = *exception;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
	return (0);
}

int
vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vcpu *vcpu;
	int pending;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];
	pending = vcpu->exception_pending;
	if (pending) {
		vcpu->exception_pending = 0;
		*exception = vcpu->exception;
		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
		    exception->vector);
	}
	return (pending);
}

static void
vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
	struct vm_exit *vmexit;
	int error;

	error = vm_inject_exception(vm, vcpuid, exception);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));

	/*
	 * A fault-like exception allows the instruction to be restarted
	 * after the exception handler returns.
	 *
	 * By setting the inst_length to 0 we ensure that the instruction
	 * pointer remains at the faulting instruction.
	 */
	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->inst_length = 0;
}

void
vm_inject_gp(struct vm *vm, int vcpuid)
{
	struct vm_exception gpf = {
		.vector = IDT_GP,
		.error_code_valid = 1,
		.error_code = 0
	};

	vm_inject_fault(vm, vcpuid, &gpf);
}

void
vm_inject_ud(struct vm *vm, int vcpuid)
{
	struct vm_exception udf = {
		.vector = IDT_UD,
		.error_code_valid = 0
	};

	vm_inject_fault(vm, vcpuid, &udf);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
invalid vcpuid %d", vcpuid); 1539 1540 vcpu = &vm->vcpu[vcpuid]; 1541 1542 if (vcpu->extint_pending == 0) 1543 panic("vm_extint_clear: inconsistent extint_pending state"); 1544 1545 vcpu->extint_pending = 0; 1546 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 1547 } 1548 1549 int 1550 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 1551 { 1552 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1553 return (EINVAL); 1554 1555 if (type < 0 || type >= VM_CAP_MAX) 1556 return (EINVAL); 1557 1558 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 1559 } 1560 1561 int 1562 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 1563 { 1564 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1565 return (EINVAL); 1566 1567 if (type < 0 || type >= VM_CAP_MAX) 1568 return (EINVAL); 1569 1570 return (VMSETCAP(vm->cookie, vcpu, type, val)); 1571 } 1572 1573 uint64_t * 1574 vm_guest_msrs(struct vm *vm, int cpu) 1575 { 1576 return (vm->vcpu[cpu].guest_msrs); 1577 } 1578 1579 struct vlapic * 1580 vm_lapic(struct vm *vm, int cpu) 1581 { 1582 return (vm->vcpu[cpu].vlapic); 1583 } 1584 1585 struct vioapic * 1586 vm_ioapic(struct vm *vm) 1587 { 1588 1589 return (vm->vioapic); 1590 } 1591 1592 struct vhpet * 1593 vm_hpet(struct vm *vm) 1594 { 1595 1596 return (vm->vhpet); 1597 } 1598 1599 boolean_t 1600 vmm_is_pptdev(int bus, int slot, int func) 1601 { 1602 int found, i, n; 1603 int b, s, f; 1604 char *val, *cp, *cp2; 1605 1606 /* 1607 * XXX 1608 * The length of an environment variable is limited to 128 bytes which 1609 * puts an upper limit on the number of passthru devices that may be 1610 * specified using a single environment variable. 1611 * 1612 * Work around this by scanning multiple environment variable 1613 * names instead of a single one - yuck! 1614 */ 1615 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; 1616 1617 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ 1618 found = 0; 1619 for (i = 0; names[i] != NULL && !found; i++) { 1620 cp = val = getenv(names[i]); 1621 while (cp != NULL && *cp != '\0') { 1622 if ((cp2 = strchr(cp, ' ')) != NULL) 1623 *cp2 = '\0'; 1624 1625 n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 1626 if (n == 3 && bus == b && slot == s && func == f) { 1627 found = 1; 1628 break; 1629 } 1630 1631 if (cp2 != NULL) 1632 *cp2++ = ' '; 1633 1634 cp = cp2; 1635 } 1636 freeenv(val); 1637 } 1638 return (found); 1639 } 1640 1641 void * 1642 vm_iommu_domain(struct vm *vm) 1643 { 1644 1645 return (vm->iommu); 1646 } 1647 1648 int 1649 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 1650 bool from_idle) 1651 { 1652 int error; 1653 struct vcpu *vcpu; 1654 1655 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1656 panic("vm_set_run_state: invalid vcpuid %d", vcpuid); 1657 1658 vcpu = &vm->vcpu[vcpuid]; 1659 1660 vcpu_lock(vcpu); 1661 error = vcpu_set_state_locked(vcpu, newstate, from_idle); 1662 vcpu_unlock(vcpu); 1663 1664 return (error); 1665 } 1666 1667 enum vcpu_state 1668 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 1669 { 1670 struct vcpu *vcpu; 1671 enum vcpu_state state; 1672 1673 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1674 panic("vm_get_run_state: invalid vcpuid %d", vcpuid); 1675 1676 vcpu = &vm->vcpu[vcpuid]; 1677 1678 vcpu_lock(vcpu); 1679 state = vcpu->state; 1680 if (hostcpu != NULL) 1681 *hostcpu = vcpu->hostcpu; 1682 vcpu_unlock(vcpu); 1683 1684 return (state); 1685 } 1686 1687 void 1688 vm_activate_cpu(struct vm *vm, int vcpuid) 1689 { 1690 1691 KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, 1692 ("vm_activate_cpu: invalid vcpuid %d", vcpuid)); 1693 
	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}

void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}