/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include <machine/vmm_dev.h>
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}
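
/*
 * Allocate 'len' bytes of guest physical memory starting at 'gpa' and back
 * the range with a new memory segment.  The range must be page-aligned and
 * must not partially overlap a segment that has already been allocated.
 */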
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}
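
/*
 * The segment descriptor accessors below only apply to registers that have
 * a 'struct seg_desc' (base/limit/access rights) associated with them, i.e.
 * the segment registers and the descriptor-table registers.
 */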
static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
	    rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is a local apic access */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
	    lapic_mmio_read, lapic_mmio_write, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
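	/*
	 * Exits that can be handled entirely in the kernel (hlt, nested
	 * page faults, instruction emulation for local apic accesses) loop
	 * back to 'restart'; all other exit reasons are returned to
	 * userland via 'retu'.
	 */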
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}
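
/*
 * Per-vcpu accessors for the guest MSR save area and the virtual local apic.
 */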
uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}
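
/*
 * Nudge a vcpu so that it notices a newly posted event: wake it up if it is
 * sleeping in vm_handle_hlt(), or send an IPI to the host cpu it is
 * currently running on.
 */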
void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}