/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ?	\
	    (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vm_exit *vmexit;
	struct vcpu *vcpu;
	int t, timo;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			timo = hz;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
		} else {
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
			*retu = true;
			vmexit = vm_exitinfo(vm, vcpuid);
			vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
			VCPU_CTR0(vm, vcpuid, "spinning down cpu");
		}
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");
nmi_pending state"); 1133 1134 vcpu->nmi_pending = 0; 1135 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 1136 } 1137 1138 int 1139 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 1140 { 1141 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1142 return (EINVAL); 1143 1144 if (type < 0 || type >= VM_CAP_MAX) 1145 return (EINVAL); 1146 1147 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 1148 } 1149 1150 int 1151 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 1152 { 1153 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1154 return (EINVAL); 1155 1156 if (type < 0 || type >= VM_CAP_MAX) 1157 return (EINVAL); 1158 1159 return (VMSETCAP(vm->cookie, vcpu, type, val)); 1160 } 1161 1162 uint64_t * 1163 vm_guest_msrs(struct vm *vm, int cpu) 1164 { 1165 return (vm->vcpu[cpu].guest_msrs); 1166 } 1167 1168 struct vlapic * 1169 vm_lapic(struct vm *vm, int cpu) 1170 { 1171 return (vm->vcpu[cpu].vlapic); 1172 } 1173 1174 struct vioapic * 1175 vm_ioapic(struct vm *vm) 1176 { 1177 1178 return (vm->vioapic); 1179 } 1180 1181 struct vhpet * 1182 vm_hpet(struct vm *vm) 1183 { 1184 1185 return (vm->vhpet); 1186 } 1187 1188 boolean_t 1189 vmm_is_pptdev(int bus, int slot, int func) 1190 { 1191 int found, i, n; 1192 int b, s, f; 1193 char *val, *cp, *cp2; 1194 1195 /* 1196 * XXX 1197 * The length of an environment variable is limited to 128 bytes which 1198 * puts an upper limit on the number of passthru devices that may be 1199 * specified using a single environment variable. 1200 * 1201 * Work around this by scanning multiple environment variable 1202 * names instead of a single one - yuck! 1203 */ 1204 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; 1205 1206 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ 1207 found = 0; 1208 for (i = 0; names[i] != NULL && !found; i++) { 1209 cp = val = getenv(names[i]); 1210 while (cp != NULL && *cp != '\0') { 1211 if ((cp2 = strchr(cp, ' ')) != NULL) 1212 *cp2 = '\0'; 1213 1214 n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 1215 if (n == 3 && bus == b && slot == s && func == f) { 1216 found = 1; 1217 break; 1218 } 1219 1220 if (cp2 != NULL) 1221 *cp2++ = ' '; 1222 1223 cp = cp2; 1224 } 1225 freeenv(val); 1226 } 1227 return (found); 1228 } 1229 1230 void * 1231 vm_iommu_domain(struct vm *vm) 1232 { 1233 1234 return (vm->iommu); 1235 } 1236 1237 int 1238 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1239 { 1240 int error; 1241 struct vcpu *vcpu; 1242 1243 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1244 panic("vm_set_run_state: invalid vcpuid %d", vcpuid); 1245 1246 vcpu = &vm->vcpu[vcpuid]; 1247 1248 vcpu_lock(vcpu); 1249 error = vcpu_set_state_locked(vcpu, newstate); 1250 vcpu_unlock(vcpu); 1251 1252 return (error); 1253 } 1254 1255 enum vcpu_state 1256 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 1257 { 1258 struct vcpu *vcpu; 1259 enum vcpu_state state; 1260 1261 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1262 panic("vm_get_run_state: invalid vcpuid %d", vcpuid); 1263 1264 vcpu = &vm->vcpu[vcpuid]; 1265 1266 vcpu_lock(vcpu); 1267 state = vcpu->state; 1268 if (hostcpu != NULL) 1269 *hostcpu = vcpu->hostcpu; 1270 vcpu_unlock(vcpu); 1271 1272 return (state); 1273 } 1274 1275 void 1276 vm_activate_cpu(struct vm *vm, int vcpuid) 1277 { 1278 1279 if (vcpuid >= 0 && vcpuid < VM_MAXCPU) 1280 CPU_SET(vcpuid, &vm->active_cpus); 1281 } 1282 1283 cpuset_t 1284 vm_active_cpus(struct vm *vm) 1285 { 1286 1287 return (vm->active_cpus); 1288 } 1289 1290 void * 1291 vcpu_stats(struct vm *vm, int vcpuid) 1292 { 1293 1294 return 
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
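
/*
 * Illustrative sketch (editor's note, not part of the driver): the expected
 * order of operations on the interfaces above, as driven in practice by the
 * vmm device ioctl path on behalf of a userland hypervisor.  The helper and
 * its constants (the "example" name, the 256MB segment size, the reset %rip)
 * are hypothetical and shown only to clarify the intended call sequence;
 * error handling is abbreviated.
 *
 *	static int
 *	example_vm_lifecycle(void)
 *	{
 *		struct vm *vm;
 *		struct vm_run vmrun;
 *		int error;
 *
 *		// Create the VM: allocates the vmspace and initializes vcpus.
 *		error = vm_create("example", &vm);
 *		if (error)
 *			return (error);
 *
 *		// Back guest physical addresses [0, 256MB) with a memory
 *		// segment (gpa and len must be page aligned).
 *		error = vm_malloc(vm, 0, 256 * 1024 * 1024);
 *
 *		// Run the BSP.  vm_run() loops internally on exits it can
 *		// service in the kernel (e.g. VM_EXITCODE_PAGING and the
 *		// lapic/ioapic/hpet MMIO ranges) and returns when an exit
 *		// must be handled in userland (retu == true) or on error.
 *		vmrun.cpuid = 0;
 *		vmrun.rip = 0xfff0;
 *		if (error == 0)
 *			error = vm_run(vm, &vmrun);
 *		// vmrun.vm_exit now describes why control returned.
 *
 *		vm_destroy(vm);
 *		return (error);
 *	}
 */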