/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm, pmap)	(ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
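
/*
 * Illustrative sketch only (not part of the original file, kept disabled):
 * a minimal example of how a caller, such as the vmm device ioctl handler
 * or a userland monitor like bhyve(8), might drive a virtual CPU with
 * vm_run().  The helper name and the error handling are hypothetical; the
 * cpuid/rip/vm_exit fields of struct vm_run are the ones vm_run() above
 * actually consumes and fills in.
 */
#if 0
static int
example_run_vcpu(struct vm *vm, int vcpuid, uint64_t start_rip)
{
	struct vm_run vmrun;
	int error;

	vmrun.cpuid = vcpuid;		/* vcpu to run */
	vmrun.rip = start_rip;		/* guest %rip to start/resume at */

	/*
	 * vm_run() loops in the kernel until it encounters an error or an
	 * exit it cannot handle itself (retu == TRUE above); the exit
	 * details are copied into vmrun.vm_exit for the caller to inspect.
	 */
	error = vm_run(vm, &vmrun);
	if (error == 0) {
		/* dispatch on vmrun.vm_exit.exitcode here */
	}
	return (error);
}
#endif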