/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vm *vm, int i)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vm_exit *vmexit;
	struct vcpu *vcpu;
	int t, timo;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			timo = hz;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
		} else {
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
			*retu = true;
			vmexit = vm_exitinfo(vm, vcpuid);
			vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
			VCPU_CTR0(vm, vcpuid, "spinning down cpu");
		}
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}

int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

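	/*
	 * Switch from the host to the guest MSR and FPU context before
	 * entering the guest via VMRUN(); the host context is restored
	 * immediately after VMRUN() returns below.
	 */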
	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu) {
			if (lapic_intr)
				vlapic_post_intr(vcpu->vlapic, hostcpu);
			else
				ipi_cpu(hostcpu, vmm_ipinum);
		}
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}