/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include <machine/vmm_dev.h>
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

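/*
 * Top-level state for a single virtual machine: the per-vcpu state, the
 * guest physical memory segments, the associated vmspace/pmap and, for
 * VMs with pci passthru devices, the iommu domain.
 */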
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

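/*
 * Module event handler: MOD_LOAD initializes the vmm device node, the iommu
 * and the hardware backend; MOD_UNLOAD tears them down again and, if the
 * backend cleanup fails, clears 'vmm_initialized' so that no new VMs can be
 * created.
 */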
static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

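/*
 * Allocate 'len' bytes of guest physical memory starting at 'gpa'.  Both
 * 'gpa' and 'len' must be page aligned.  The range must either be entirely
 * unallocated (a new memory segment is created) or entirely allocated
 * already (the call is a no-op); a partial overlap is an error.
 */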
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

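/*
 * When the last passthru device is detached from the VM the iommu mappings
 * for its guest memory are removed and the guest physical pages are unwired.
 */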
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

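/*
 * Handle an exit due to a nested page fault.  Accessed/dirty bit updates are
 * emulated directly on the nested pmap; anything else is resolved by calling
 * vm_fault() on the guest's vmspace so the page is faulted in.
 */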
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is a local apic access */
	if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
	    lapic_mmio_read, lapic_mmio_write, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}

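/*
 * Run a vcpu in the guest until it exits.  Guest FPU and MSR state is loaded
 * around VMRUN() inside a critical section.  Exits that can be handled in the
 * kernel (hlt, nested page faults, local apic emulation) restart the guest;
 * everything else is passed back to userland in 'vmrun->vm_exit'.
 */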
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

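/*
 * Notify a vcpu that an event (e.g. an interrupt or NMI) is pending: wake it
 * up if it is sleeping in a hlt emulation, or send an IPI to the host cpu it
 * is currently running on so that it exits the guest.
 */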
void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}