/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	struct mtx	mtx;		/* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state	state;		/* (o) vcpu state */
	int		hostcpu;	/* (o) vcpu's host cpu */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int		exception_pending; /* (i) exception pending */
	int		exc_vector;	/* (x) exception collateral */
	int		exc_errcode_valid;
	uint32_t
exc_errcode; 108 struct savefpu *guestfpu; /* (a,i) guest fpu state */ 109 uint64_t guest_xcr0; /* (i) guest %xcr0 register */ 110 void *stats; /* (a,i) statistics */ 111 struct vm_exit exitinfo; /* (x) exit reason and collateral */ 112 }; 113 114 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) 115 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) 116 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) 117 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) 118 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) 119 120 struct mem_seg { 121 vm_paddr_t gpa; 122 size_t len; 123 boolean_t wired; 124 vm_object_t object; 125 }; 126 #define VM_MAX_MEMORY_SEGMENTS 2 127 128 /* 129 * Initialization: 130 * (o) initialized the first time the VM is created 131 * (i) initialized when VM is created and when it is reinitialized 132 * (x) initialized before use 133 */ 134 struct vm { 135 void *cookie; /* (i) cpu-specific data */ 136 void *iommu; /* (x) iommu-specific data */ 137 struct vhpet *vhpet; /* (i) virtual HPET */ 138 struct vioapic *vioapic; /* (i) virtual ioapic */ 139 struct vatpic *vatpic; /* (i) virtual atpic */ 140 struct vatpit *vatpit; /* (i) virtual atpit */ 141 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ 142 struct vrtc *vrtc; /* (o) virtual RTC */ 143 volatile cpuset_t active_cpus; /* (i) active vcpus */ 144 int suspend; /* (i) stop VM execution */ 145 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ 146 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ 147 cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ 148 cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ 149 void *rendezvous_arg; /* (x) rendezvous func/arg */ 150 vm_rendezvous_func_t rendezvous_func; 151 struct mtx rendezvous_mtx; /* (o) rendezvous lock */ 152 int num_mem_segs; /* (o) guest memory segments */ 153 struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; 154 struct vmspace *vmspace; /* (o) guest's address space */ 155 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ 156 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ 157 }; 158 159 static int vmm_initialized; 160 161 static struct vmm_ops *ops; 162 #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) 163 #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) 164 #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) 165 166 #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) 167 #define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \ 168 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO) 169 #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) 170 #define VMSPACE_ALLOC(min, max) \ 171 (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) 172 #define VMSPACE_FREE(vmspace) \ 173 (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) 174 #define VMGETREG(vmi, vcpu, num, retval) \ 175 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) 176 #define VMSETREG(vmi, vcpu, num, val) \ 177 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) 178 #define VMGETDESC(vmi, vcpu, num, desc) \ 179 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) 180 #define VMSETDESC(vmi, vcpu, num, desc) \ 181 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) 182 #define VMGETCAP(vmi, vcpu, num, retval) \ 183 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) 184 #define VMSETCAP(vmi, vcpu, num, val) \ 185 (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) 186 #define VLAPIC_INIT(vmi, vcpu) \ 187 (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) 188 #define VLAPIC_CLEANUP(vmi, vlapic) \ 189 (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) 190 191 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) 192 #define fpu_stop_emulating() clts() 193 194 static MALLOC_DEFINE(M_VM, "vm", "vm"); 195 196 /* statistics */ 197 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); 198 199 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); 200 201 /* 202 * Halt the guest if all vcpus are executing a HLT instruction with 203 * interrupts disabled. 204 */ 205 static int halt_detection_enabled = 1; 206 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, 207 &halt_detection_enabled, 0, 208 "Halt VM if all vcpus execute HLT with interrupts disabled"); 209 210 static int vmm_ipinum; 211 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, 212 "IPI vector used for vcpu notifications"); 213 214 static int trace_guest_exceptions; 215 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, 216 &trace_guest_exceptions, 0, 217 "Trap into hypervisor on all guest exceptions and reflect them back"); 218 219 static void 220 vcpu_cleanup(struct vm *vm, int i, bool destroy) 221 { 222 struct vcpu *vcpu = &vm->vcpu[i]; 223 224 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); 225 if (destroy) { 226 vmm_stat_free(vcpu->stats); 227 fpu_save_area_free(vcpu->guestfpu); 228 } 229 } 230 231 static void 232 vcpu_init(struct vm *vm, int vcpu_id, bool create) 233 { 234 struct vcpu *vcpu; 235 236 KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, 237 ("vcpu_init: invalid vcpu %d", vcpu_id)); 238 239 vcpu = &vm->vcpu[vcpu_id]; 240 241 if (create) { 242 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " 243 "initialized", vcpu_id)); 244 vcpu_lock_init(vcpu); 245 vcpu->state = VCPU_IDLE; 246 vcpu->hostcpu = NOCPU; 247 vcpu->guestfpu = fpu_save_area_alloc(); 248 vcpu->stats = vmm_stat_alloc(); 249 } 250 251 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); 252 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); 253 vcpu->exitintinfo = 0; 254 vcpu->nmi_pending = 0; 255 vcpu->extint_pending = 0; 256 vcpu->exception_pending = 0; 257 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 258 fpu_save_area_reset(vcpu->guestfpu); 259 vmm_stat_init(vcpu->stats); 260 } 261 262 int 263 vcpu_trace_exceptions(struct vm *vm, int vcpuid) 264 { 265 266 return (trace_guest_exceptions); 267 } 268 269 struct vm_exit * 270 vm_exitinfo(struct vm *vm, int cpuid) 271 { 272 struct vcpu *vcpu; 273 274 if (cpuid < 0 || cpuid >= VM_MAXCPU) 275 panic("vm_exitinfo: invalid cpuid %d", cpuid); 276 277 vcpu = &vm->vcpu[cpuid]; 278 279 return (&vcpu->exitinfo); 280 } 281 282 static void 283 vmm_resume(void) 284 { 285 VMM_RESUME(); 286 } 287 288 static int 289 vmm_init(void) 290 { 291 int error; 292 293 vmm_host_state_init(); 294 295 vmm_ipinum = vmm_ipi_alloc(); 296 if (vmm_ipinum == 0) 297 vmm_ipinum = IPI_AST; 298 299 error = vmm_mem_init(); 300 if (error) 301 return (error); 302 303 if (vmm_is_intel()) 304 ops = &vmm_ops_intel; 305 else if (vmm_is_amd()) 306 ops = &vmm_ops_amd; 307 else 308 return (ENXIO); 309 310 vmm_resume_p = vmm_resume; 311 312 return (VMM_INIT(vmm_ipinum)); 313 } 314 315 static int 316 vmm_handler(module_t mod, int what, void *arg) 317 { 318 int error; 319 320 switch (what) { 321 case MOD_LOAD: 322 vmmdev_init(); 323 if (ppt_avail_devices() > 0) 324 iommu_init(); 325 error = vmm_init(); 326 if (error == 0) 327 
vmm_initialized = 1; 328 break; 329 case MOD_UNLOAD: 330 error = vmmdev_cleanup(); 331 if (error == 0) { 332 vmm_resume_p = NULL; 333 iommu_cleanup(); 334 if (vmm_ipinum != IPI_AST) 335 vmm_ipi_free(vmm_ipinum); 336 error = VMM_CLEANUP(); 337 /* 338 * Something bad happened - prevent new 339 * VMs from being created 340 */ 341 if (error) 342 vmm_initialized = 0; 343 } 344 break; 345 default: 346 error = 0; 347 break; 348 } 349 return (error); 350 } 351 352 static moduledata_t vmm_kmod = { 353 "vmm", 354 vmm_handler, 355 NULL 356 }; 357 358 /* 359 * vmm initialization has the following dependencies: 360 * 361 * - iommu initialization must happen after the pci passthru driver has had 362 * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). 363 * 364 * - VT-x initialization requires smp_rendezvous() and therefore must happen 365 * after SMP is fully functional (after SI_SUB_SMP). 366 */ 367 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); 368 MODULE_VERSION(vmm, 1); 369 370 static void 371 vm_init(struct vm *vm, bool create) 372 { 373 int i; 374 375 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); 376 vm->iommu = NULL; 377 vm->vioapic = vioapic_init(vm); 378 vm->vhpet = vhpet_init(vm); 379 vm->vatpic = vatpic_init(vm); 380 vm->vatpit = vatpit_init(vm); 381 vm->vpmtmr = vpmtmr_init(vm); 382 if (create) 383 vm->vrtc = vrtc_init(vm); 384 385 CPU_ZERO(&vm->active_cpus); 386 387 vm->suspend = 0; 388 CPU_ZERO(&vm->suspended_cpus); 389 390 for (i = 0; i < VM_MAXCPU; i++) 391 vcpu_init(vm, i, create); 392 } 393 394 int 395 vm_create(const char *name, struct vm **retvm) 396 { 397 struct vm *vm; 398 struct vmspace *vmspace; 399 400 /* 401 * If vmm.ko could not be successfully initialized then don't attempt 402 * to create the virtual machine. 403 */ 404 if (!vmm_initialized) 405 return (ENXIO); 406 407 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) 408 return (EINVAL); 409 410 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); 411 if (vmspace == NULL) 412 return (ENOMEM); 413 414 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); 415 strcpy(vm->name, name); 416 vm->num_mem_segs = 0; 417 vm->vmspace = vmspace; 418 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); 419 420 vm_init(vm, true); 421 422 *retvm = vm; 423 return (0); 424 } 425 426 static void 427 vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) 428 { 429 430 if (seg->object != NULL) 431 vmm_mem_free(vm->vmspace, seg->gpa, seg->len); 432 433 bzero(seg, sizeof(*seg)); 434 } 435 436 static void 437 vm_cleanup(struct vm *vm, bool destroy) 438 { 439 int i; 440 441 ppt_unassign_all(vm); 442 443 if (vm->iommu != NULL) 444 iommu_destroy_domain(vm->iommu); 445 446 if (destroy) 447 vrtc_cleanup(vm->vrtc); 448 else 449 vrtc_reset(vm->vrtc); 450 vpmtmr_cleanup(vm->vpmtmr); 451 vatpit_cleanup(vm->vatpit); 452 vhpet_cleanup(vm->vhpet); 453 vatpic_cleanup(vm->vatpic); 454 vioapic_cleanup(vm->vioapic); 455 456 for (i = 0; i < VM_MAXCPU; i++) 457 vcpu_cleanup(vm, i, destroy); 458 459 VMCLEANUP(vm->cookie); 460 461 if (destroy) { 462 for (i = 0; i < vm->num_mem_segs; i++) 463 vm_free_mem_seg(vm, &vm->mem_segs[i]); 464 465 vm->num_mem_segs = 0; 466 467 VMSPACE_FREE(vm->vmspace); 468 vm->vmspace = NULL; 469 } 470 } 471 472 void 473 vm_destroy(struct vm *vm) 474 { 475 vm_cleanup(vm, true); 476 free(vm, M_VM); 477 } 478 479 int 480 vm_reinit(struct vm *vm) 481 { 482 int error; 483 484 /* 485 * A virtual machine can be reset only if all vcpus are suspended. 
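 * "All vcpus suspended" means that 'suspended_cpus' exactly matches
 * 'active_cpus'; until every active vcpu has parked itself in
 * vm_handle_suspend() below, vm_reinit() fails with EBUSY.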
486 */ 487 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { 488 vm_cleanup(vm, false); 489 vm_init(vm, false); 490 error = 0; 491 } else { 492 error = EBUSY; 493 } 494 495 return (error); 496 } 497 498 const char * 499 vm_name(struct vm *vm) 500 { 501 return (vm->name); 502 } 503 504 int 505 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) 506 { 507 vm_object_t obj; 508 509 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) 510 return (ENOMEM); 511 else 512 return (0); 513 } 514 515 int 516 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) 517 { 518 519 vmm_mmio_free(vm->vmspace, gpa, len); 520 return (0); 521 } 522 523 boolean_t 524 vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) 525 { 526 int i; 527 vm_paddr_t gpabase, gpalimit; 528 529 for (i = 0; i < vm->num_mem_segs; i++) { 530 gpabase = vm->mem_segs[i].gpa; 531 gpalimit = gpabase + vm->mem_segs[i].len; 532 if (gpa >= gpabase && gpa < gpalimit) 533 return (TRUE); /* 'gpa' is regular memory */ 534 } 535 536 if (ppt_is_mmio(vm, gpa)) 537 return (TRUE); /* 'gpa' is pci passthru mmio */ 538 539 return (FALSE); 540 } 541 542 int 543 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) 544 { 545 int available, allocated; 546 struct mem_seg *seg; 547 vm_object_t object; 548 vm_paddr_t g; 549 550 if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) 551 return (EINVAL); 552 553 available = allocated = 0; 554 g = gpa; 555 while (g < gpa + len) { 556 if (vm_mem_allocated(vm, g)) 557 allocated++; 558 else 559 available++; 560 561 g += PAGE_SIZE; 562 } 563 564 /* 565 * If there are some allocated and some available pages in the address 566 * range then it is an error. 567 */ 568 if (allocated && available) 569 return (EINVAL); 570 571 /* 572 * If the entire address range being requested has already been 573 * allocated then there isn't anything more to do. 574 */ 575 if (allocated && available == 0) 576 return (0); 577 578 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) 579 return (E2BIG); 580 581 seg = &vm->mem_segs[vm->num_mem_segs]; 582 583 if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) 584 return (ENOMEM); 585 586 seg->gpa = gpa; 587 seg->len = len; 588 seg->object = object; 589 seg->wired = FALSE; 590 591 vm->num_mem_segs++; 592 593 return (0); 594 } 595 596 static vm_paddr_t 597 vm_maxmem(struct vm *vm) 598 { 599 int i; 600 vm_paddr_t gpa, maxmem; 601 602 maxmem = 0; 603 for (i = 0; i < vm->num_mem_segs; i++) { 604 gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; 605 if (gpa > maxmem) 606 maxmem = gpa; 607 } 608 return (maxmem); 609 } 610 611 static void 612 vm_gpa_unwire(struct vm *vm) 613 { 614 int i, rv; 615 struct mem_seg *seg; 616 617 for (i = 0; i < vm->num_mem_segs; i++) { 618 seg = &vm->mem_segs[i]; 619 if (!seg->wired) 620 continue; 621 622 rv = vm_map_unwire(&vm->vmspace->vm_map, 623 seg->gpa, seg->gpa + seg->len, 624 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 625 KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " 626 "%#lx/%ld could not be unwired: %d", 627 vm_name(vm), seg->gpa, seg->len, rv)); 628 629 seg->wired = FALSE; 630 } 631 } 632 633 static int 634 vm_gpa_wire(struct vm *vm) 635 { 636 int i, rv; 637 struct mem_seg *seg; 638 639 for (i = 0; i < vm->num_mem_segs; i++) { 640 seg = &vm->mem_segs[i]; 641 if (seg->wired) 642 continue; 643 644 /* XXX rlimits? 
*/ 645 rv = vm_map_wire(&vm->vmspace->vm_map, 646 seg->gpa, seg->gpa + seg->len, 647 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 648 if (rv != KERN_SUCCESS) 649 break; 650 651 seg->wired = TRUE; 652 } 653 654 if (i < vm->num_mem_segs) { 655 /* 656 * Undo the wiring before returning an error. 657 */ 658 vm_gpa_unwire(vm); 659 return (EAGAIN); 660 } 661 662 return (0); 663 } 664 665 static void 666 vm_iommu_modify(struct vm *vm, boolean_t map) 667 { 668 int i, sz; 669 vm_paddr_t gpa, hpa; 670 struct mem_seg *seg; 671 void *vp, *cookie, *host_domain; 672 673 sz = PAGE_SIZE; 674 host_domain = iommu_host_domain(); 675 676 for (i = 0; i < vm->num_mem_segs; i++) { 677 seg = &vm->mem_segs[i]; 678 KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", 679 vm_name(vm), seg->gpa, seg->len)); 680 681 gpa = seg->gpa; 682 while (gpa < seg->gpa + seg->len) { 683 vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, 684 &cookie); 685 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", 686 vm_name(vm), gpa)); 687 688 vm_gpa_release(cookie); 689 690 hpa = DMAP_TO_PHYS((uintptr_t)vp); 691 if (map) { 692 iommu_create_mapping(vm->iommu, gpa, hpa, sz); 693 iommu_remove_mapping(host_domain, hpa, sz); 694 } else { 695 iommu_remove_mapping(vm->iommu, gpa, sz); 696 iommu_create_mapping(host_domain, hpa, hpa, sz); 697 } 698 699 gpa += PAGE_SIZE; 700 } 701 } 702 703 /* 704 * Invalidate the cached translations associated with the domain 705 * from which pages were removed. 706 */ 707 if (map) 708 iommu_invalidate_tlb(host_domain); 709 else 710 iommu_invalidate_tlb(vm->iommu); 711 } 712 713 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) 714 #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) 715 716 int 717 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) 718 { 719 int error; 720 721 error = ppt_unassign_device(vm, bus, slot, func); 722 if (error) 723 return (error); 724 725 if (ppt_assigned_devices(vm) == 0) { 726 vm_iommu_unmap(vm); 727 vm_gpa_unwire(vm); 728 } 729 return (0); 730 } 731 732 int 733 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) 734 { 735 int error; 736 vm_paddr_t maxaddr; 737 738 /* 739 * Virtual machines with pci passthru devices get special treatment: 740 * - the guest physical memory is wired 741 * - the iommu is programmed to do the 'gpa' to 'hpa' translation 742 * 743 * We need to do this before the first pci passthru device is attached. 
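 * The inverse is done in vm_unassign_pptdev() above: once the last
 * passthru device is detached the iommu mappings are torn down and the
 * guest memory is unwired again.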
744 */ 745 if (ppt_assigned_devices(vm) == 0) { 746 KASSERT(vm->iommu == NULL, 747 ("vm_assign_pptdev: iommu must be NULL")); 748 maxaddr = vm_maxmem(vm); 749 vm->iommu = iommu_create_domain(maxaddr); 750 751 error = vm_gpa_wire(vm); 752 if (error) 753 return (error); 754 755 vm_iommu_map(vm); 756 } 757 758 error = ppt_assign_device(vm, bus, slot, func); 759 return (error); 760 } 761 762 void * 763 vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, 764 void **cookie) 765 { 766 int count, pageoff; 767 vm_page_t m; 768 769 pageoff = gpa & PAGE_MASK; 770 if (len > PAGE_SIZE - pageoff) 771 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); 772 773 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, 774 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); 775 776 if (count == 1) { 777 *cookie = m; 778 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); 779 } else { 780 *cookie = NULL; 781 return (NULL); 782 } 783 } 784 785 void 786 vm_gpa_release(void *cookie) 787 { 788 vm_page_t m = cookie; 789 790 vm_page_lock(m); 791 vm_page_unhold(m); 792 vm_page_unlock(m); 793 } 794 795 int 796 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, 797 struct vm_memory_segment *seg) 798 { 799 int i; 800 801 for (i = 0; i < vm->num_mem_segs; i++) { 802 if (gpabase == vm->mem_segs[i].gpa) { 803 seg->gpa = vm->mem_segs[i].gpa; 804 seg->len = vm->mem_segs[i].len; 805 seg->wired = vm->mem_segs[i].wired; 806 return (0); 807 } 808 } 809 return (-1); 810 } 811 812 int 813 vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, 814 vm_offset_t *offset, struct vm_object **object) 815 { 816 int i; 817 size_t seg_len; 818 vm_paddr_t seg_gpa; 819 vm_object_t seg_obj; 820 821 for (i = 0; i < vm->num_mem_segs; i++) { 822 if ((seg_obj = vm->mem_segs[i].object) == NULL) 823 continue; 824 825 seg_gpa = vm->mem_segs[i].gpa; 826 seg_len = vm->mem_segs[i].len; 827 828 if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { 829 *offset = gpa - seg_gpa; 830 *object = seg_obj; 831 vm_object_reference(seg_obj); 832 return (0); 833 } 834 } 835 836 return (EINVAL); 837 } 838 839 int 840 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) 841 { 842 843 if (vcpu < 0 || vcpu >= VM_MAXCPU) 844 return (EINVAL); 845 846 if (reg >= VM_REG_LAST) 847 return (EINVAL); 848 849 return (VMGETREG(vm->cookie, vcpu, reg, retval)); 850 } 851 852 int 853 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) 854 { 855 856 if (vcpu < 0 || vcpu >= VM_MAXCPU) 857 return (EINVAL); 858 859 if (reg >= VM_REG_LAST) 860 return (EINVAL); 861 862 return (VMSETREG(vm->cookie, vcpu, reg, val)); 863 } 864 865 static boolean_t 866 is_descriptor_table(int reg) 867 { 868 869 switch (reg) { 870 case VM_REG_GUEST_IDTR: 871 case VM_REG_GUEST_GDTR: 872 return (TRUE); 873 default: 874 return (FALSE); 875 } 876 } 877 878 static boolean_t 879 is_segment_register(int reg) 880 { 881 882 switch (reg) { 883 case VM_REG_GUEST_ES: 884 case VM_REG_GUEST_CS: 885 case VM_REG_GUEST_SS: 886 case VM_REG_GUEST_DS: 887 case VM_REG_GUEST_FS: 888 case VM_REG_GUEST_GS: 889 case VM_REG_GUEST_TR: 890 case VM_REG_GUEST_LDTR: 891 return (TRUE); 892 default: 893 return (FALSE); 894 } 895 } 896 897 int 898 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, 899 struct seg_desc *desc) 900 { 901 902 if (vcpu < 0 || vcpu >= VM_MAXCPU) 903 return (EINVAL); 904 905 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 906 return (EINVAL); 907 908 return (VMGETDESC(vm->cookie, vcpu, reg, desc)); 909 } 910 911 int 912 vm_set_seg_desc(struct vm 
*vm, int vcpu, int reg, 913 struct seg_desc *desc) 914 { 915 if (vcpu < 0 || vcpu >= VM_MAXCPU) 916 return (EINVAL); 917 918 if (!is_segment_register(reg) && !is_descriptor_table(reg)) 919 return (EINVAL); 920 921 return (VMSETDESC(vm->cookie, vcpu, reg, desc)); 922 } 923 924 static void 925 restore_guest_fpustate(struct vcpu *vcpu) 926 { 927 928 /* flush host state to the pcb */ 929 fpuexit(curthread); 930 931 /* restore guest FPU state */ 932 fpu_stop_emulating(); 933 fpurestore(vcpu->guestfpu); 934 935 /* restore guest XCR0 if XSAVE is enabled in the host */ 936 if (rcr4() & CR4_XSAVE) 937 load_xcr(0, vcpu->guest_xcr0); 938 939 /* 940 * The FPU is now "dirty" with the guest's state so turn on emulation 941 * to trap any access to the FPU by the host. 942 */ 943 fpu_start_emulating(); 944 } 945 946 static void 947 save_guest_fpustate(struct vcpu *vcpu) 948 { 949 950 if ((rcr0() & CR0_TS) == 0) 951 panic("fpu emulation not enabled in host!"); 952 953 /* save guest XCR0 and restore host XCR0 */ 954 if (rcr4() & CR4_XSAVE) { 955 vcpu->guest_xcr0 = rxcr(0); 956 load_xcr(0, vmm_get_host_xcr0()); 957 } 958 959 /* save guest FPU state */ 960 fpu_stop_emulating(); 961 fpusave(vcpu->guestfpu); 962 fpu_start_emulating(); 963 } 964 965 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); 966 967 static int 968 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, 969 bool from_idle) 970 { 971 int error; 972 973 vcpu_assert_locked(vcpu); 974 975 /* 976 * State transitions from the vmmdev_ioctl() must always begin from 977 * the VCPU_IDLE state. This guarantees that there is only a single 978 * ioctl() operating on a vcpu at any point. 979 */ 980 if (from_idle) { 981 while (vcpu->state != VCPU_IDLE) 982 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); 983 } else { 984 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " 985 "vcpu idle state")); 986 } 987 988 if (vcpu->state == VCPU_RUNNING) { 989 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " 990 "mismatch for running vcpu", curcpu, vcpu->hostcpu)); 991 } else { 992 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " 993 "vcpu that is not running", vcpu->hostcpu)); 994 } 995 996 /* 997 * The following state transitions are allowed: 998 * IDLE -> FROZEN -> IDLE 999 * FROZEN -> RUNNING -> FROZEN 1000 * FROZEN -> SLEEPING -> FROZEN 1001 */ 1002 switch (vcpu->state) { 1003 case VCPU_IDLE: 1004 case VCPU_RUNNING: 1005 case VCPU_SLEEPING: 1006 error = (newstate != VCPU_FROZEN); 1007 break; 1008 case VCPU_FROZEN: 1009 error = (newstate == VCPU_FROZEN); 1010 break; 1011 default: 1012 error = 1; 1013 break; 1014 } 1015 1016 if (error) 1017 return (EBUSY); 1018 1019 vcpu->state = newstate; 1020 if (newstate == VCPU_RUNNING) 1021 vcpu->hostcpu = curcpu; 1022 else 1023 vcpu->hostcpu = NOCPU; 1024 1025 if (newstate == VCPU_IDLE) 1026 wakeup(&vcpu->state); 1027 1028 return (0); 1029 } 1030 1031 static void 1032 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) 1033 { 1034 int error; 1035 1036 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) 1037 panic("Error %d setting state to %d\n", error, newstate); 1038 } 1039 1040 static void 1041 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) 1042 { 1043 int error; 1044 1045 if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) 1046 panic("Error %d setting state to %d", error, newstate); 1047 } 1048 1049 static void 1050 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) 1051 { 
1052 1053 KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); 1054 1055 /* 1056 * Update 'rendezvous_func' and execute a write memory barrier to 1057 * ensure that it is visible across all host cpus. This is not needed 1058 * for correctness but it does ensure that all the vcpus will notice 1059 * that the rendezvous is requested immediately. 1060 */ 1061 vm->rendezvous_func = func; 1062 wmb(); 1063 } 1064 1065 #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ 1066 do { \ 1067 if (vcpuid >= 0) \ 1068 VCPU_CTR0(vm, vcpuid, fmt); \ 1069 else \ 1070 VM_CTR0(vm, fmt); \ 1071 } while (0) 1072 1073 static void 1074 vm_handle_rendezvous(struct vm *vm, int vcpuid) 1075 { 1076 1077 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), 1078 ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); 1079 1080 mtx_lock(&vm->rendezvous_mtx); 1081 while (vm->rendezvous_func != NULL) { 1082 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ 1083 CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); 1084 1085 if (vcpuid != -1 && 1086 CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && 1087 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { 1088 VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); 1089 (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); 1090 CPU_SET(vcpuid, &vm->rendezvous_done_cpus); 1091 } 1092 if (CPU_CMP(&vm->rendezvous_req_cpus, 1093 &vm->rendezvous_done_cpus) == 0) { 1094 VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); 1095 vm_set_rendezvous_func(vm, NULL); 1096 wakeup(&vm->rendezvous_func); 1097 break; 1098 } 1099 RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); 1100 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, 1101 "vmrndv", 0); 1102 } 1103 mtx_unlock(&vm->rendezvous_mtx); 1104 } 1105 1106 /* 1107 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. 1108 */ 1109 static int 1110 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) 1111 { 1112 struct vcpu *vcpu; 1113 const char *wmesg; 1114 int t, vcpu_halted, vm_halted; 1115 1116 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); 1117 1118 vcpu = &vm->vcpu[vcpuid]; 1119 vcpu_halted = 0; 1120 vm_halted = 0; 1121 1122 vcpu_lock(vcpu); 1123 while (1) { 1124 /* 1125 * Do a final check for pending NMI or interrupts before 1126 * really putting this thread to sleep. Also check for 1127 * software events that would cause this vcpu to wakeup. 1128 * 1129 * These interrupts/events could have happened after the 1130 * vcpu returned from VMRUN() and before it acquired the 1131 * vcpu lock above. 1132 */ 1133 if (vm->rendezvous_func != NULL || vm->suspend) 1134 break; 1135 if (vm_nmi_pending(vm, vcpuid)) 1136 break; 1137 if (!intr_disabled) { 1138 if (vm_extint_pending(vm, vcpuid) || 1139 vlapic_pending_intr(vcpu->vlapic, NULL)) { 1140 break; 1141 } 1142 } 1143 1144 /* Don't go to sleep if the vcpu thread needs to yield */ 1145 if (vcpu_should_yield(vm, vcpuid)) 1146 break; 1147 1148 /* 1149 * Some Linux guests implement "halt" by having all vcpus 1150 * execute HLT with interrupts disabled. 'halted_cpus' keeps 1151 * track of the vcpus that have entered this state. When all 1152 * vcpus enter the halted state the virtual machine is halted. 
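 * This behaviour is gated by the 'hw.vmm.halt_detection' tunable defined
 * near the top of this file; when it is disabled a vcpu that executes HLT
 * with interrupts disabled simply sleeps here without ever suspending the
 * virtual machine.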
1153 */ 1154 if (intr_disabled) { 1155 wmesg = "vmhalt"; 1156 VCPU_CTR0(vm, vcpuid, "Halted"); 1157 if (!vcpu_halted && halt_detection_enabled) { 1158 vcpu_halted = 1; 1159 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); 1160 } 1161 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { 1162 vm_halted = 1; 1163 break; 1164 } 1165 } else { 1166 wmesg = "vmidle"; 1167 } 1168 1169 t = ticks; 1170 vcpu_require_state_locked(vcpu, VCPU_SLEEPING); 1171 /* 1172 * XXX msleep_spin() cannot be interrupted by signals so 1173 * wake up periodically to check pending signals. 1174 */ 1175 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); 1176 vcpu_require_state_locked(vcpu, VCPU_FROZEN); 1177 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); 1178 } 1179 1180 if (vcpu_halted) 1181 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); 1182 1183 vcpu_unlock(vcpu); 1184 1185 if (vm_halted) 1186 vm_suspend(vm, VM_SUSPEND_HALT); 1187 1188 return (0); 1189 } 1190 1191 static int 1192 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) 1193 { 1194 int rv, ftype; 1195 struct vm_map *map; 1196 struct vcpu *vcpu; 1197 struct vm_exit *vme; 1198 1199 vcpu = &vm->vcpu[vcpuid]; 1200 vme = &vcpu->exitinfo; 1201 1202 ftype = vme->u.paging.fault_type; 1203 KASSERT(ftype == VM_PROT_READ || 1204 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, 1205 ("vm_handle_paging: invalid fault_type %d", ftype)); 1206 1207 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { 1208 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), 1209 vme->u.paging.gpa, ftype); 1210 if (rv == 0) { 1211 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", 1212 ftype == VM_PROT_READ ? "accessed" : "dirty", 1213 vme->u.paging.gpa); 1214 goto done; 1215 } 1216 } 1217 1218 map = &vm->vmspace->vm_map; 1219 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); 1220 1221 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " 1222 "ftype = %d", rv, vme->u.paging.gpa, ftype); 1223 1224 if (rv != KERN_SUCCESS) 1225 return (EFAULT); 1226 done: 1227 /* restart execution at the faulting instruction */ 1228 vm_restart_instruction(vm, vcpuid); 1229 1230 return (0); 1231 } 1232 1233 static int 1234 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) 1235 { 1236 struct vie *vie; 1237 struct vcpu *vcpu; 1238 struct vm_exit *vme; 1239 uint64_t gla, gpa; 1240 struct vm_guest_paging *paging; 1241 mem_region_read_t mread; 1242 mem_region_write_t mwrite; 1243 enum vm_cpu_mode cpu_mode; 1244 int cs_d, error, length; 1245 1246 vcpu = &vm->vcpu[vcpuid]; 1247 vme = &vcpu->exitinfo; 1248 1249 gla = vme->u.inst_emul.gla; 1250 gpa = vme->u.inst_emul.gpa; 1251 cs_d = vme->u.inst_emul.cs_d; 1252 vie = &vme->u.inst_emul.vie; 1253 paging = &vme->u.inst_emul.paging; 1254 cpu_mode = paging->cpu_mode; 1255 1256 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); 1257 1258 /* Fetch, decode and emulate the faulting instruction */ 1259 if (vie->num_valid == 0) { 1260 /* 1261 * If the instruction length is not known then assume a 1262 * maximum size instruction. 1263 */ 1264 length = vme->inst_length ? 
vme->inst_length : VIE_INST_SIZE;
		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
		    length, vie);
	} else {
		/*
		 * The instruction bytes have already been copied into 'vie'
		 */
		error = 0;
	}
	if (error == 1)
		return (0);		/* Resume guest to handle page fault */
	else if (error == -1)
		return (EFAULT);
	else if (error != 0)
		panic("%s: vmm_fetch_instruction error %d", __func__, error);

	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
		return (EFAULT);

	/*
	 * If the instruction length is not specified then update it now.
	 */
	if (vme->inst_length == 0)
		vme->inst_length = vie->num_processed;

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
	    mread, mwrite, retu);

	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
	int i, done;
	struct vcpu *vcpu;

	done = 0;
	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (1) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		if (vm->rendezvous_func == NULL) {
			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		} else {
			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
			vcpu_unlock(vcpu);
			vm_handle_rendezvous(vm, vcpuid);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i, false);
		}
	}

	*retu = true;
	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
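	 * Each one observes the non-zero 'vm->suspend' (through the 'sptr'
	 * pointer handed to VMRUN in vm_run()), exits to the hypervisor with
	 * VM_EXITCODE_SUSPENDED and then waits in vm_handle_suspend() until
	 * the remaining vcpus have done the same.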
1380 */ 1381 for (i = 0; i < VM_MAXCPU; i++) { 1382 if (CPU_ISSET(i, &vm->active_cpus)) 1383 vcpu_notify_event(vm, i, false); 1384 } 1385 1386 return (0); 1387 } 1388 1389 void 1390 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) 1391 { 1392 struct vm_exit *vmexit; 1393 1394 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, 1395 ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); 1396 1397 vmexit = vm_exitinfo(vm, vcpuid); 1398 vmexit->rip = rip; 1399 vmexit->inst_length = 0; 1400 vmexit->exitcode = VM_EXITCODE_SUSPENDED; 1401 vmexit->u.suspended.how = vm->suspend; 1402 } 1403 1404 void 1405 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) 1406 { 1407 struct vm_exit *vmexit; 1408 1409 KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); 1410 1411 vmexit = vm_exitinfo(vm, vcpuid); 1412 vmexit->rip = rip; 1413 vmexit->inst_length = 0; 1414 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; 1415 vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); 1416 } 1417 1418 void 1419 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) 1420 { 1421 struct vm_exit *vmexit; 1422 1423 vmexit = vm_exitinfo(vm, vcpuid); 1424 vmexit->rip = rip; 1425 vmexit->inst_length = 0; 1426 vmexit->exitcode = VM_EXITCODE_BOGUS; 1427 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 1428 } 1429 1430 int 1431 vm_run(struct vm *vm, struct vm_run *vmrun) 1432 { 1433 int error, vcpuid; 1434 struct vcpu *vcpu; 1435 struct pcb *pcb; 1436 uint64_t tscval, rip; 1437 struct vm_exit *vme; 1438 bool retu, intr_disabled; 1439 pmap_t pmap; 1440 void *rptr, *sptr; 1441 1442 vcpuid = vmrun->cpuid; 1443 1444 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1445 return (EINVAL); 1446 1447 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 1448 return (EINVAL); 1449 1450 if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) 1451 return (EINVAL); 1452 1453 rptr = &vm->rendezvous_func; 1454 sptr = &vm->suspend; 1455 pmap = vmspace_pmap(vm->vmspace); 1456 vcpu = &vm->vcpu[vcpuid]; 1457 vme = &vcpu->exitinfo; 1458 rip = vmrun->rip; 1459 restart: 1460 critical_enter(); 1461 1462 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1463 ("vm_run: absurd pm_active")); 1464 1465 tscval = rdtsc(); 1466 1467 pcb = PCPU_GET(curpcb); 1468 set_pcb_flags(pcb, PCB_FULL_IRET); 1469 1470 restore_guest_fpustate(vcpu); 1471 1472 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 1473 error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr); 1474 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 1475 1476 save_guest_fpustate(vcpu); 1477 1478 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); 1479 1480 critical_exit(); 1481 1482 if (error == 0) { 1483 retu = false; 1484 switch (vme->exitcode) { 1485 case VM_EXITCODE_SUSPENDED: 1486 error = vm_handle_suspend(vm, vcpuid, &retu); 1487 break; 1488 case VM_EXITCODE_IOAPIC_EOI: 1489 vioapic_process_eoi(vm, vcpuid, 1490 vme->u.ioapic_eoi.vector); 1491 break; 1492 case VM_EXITCODE_RENDEZVOUS: 1493 vm_handle_rendezvous(vm, vcpuid); 1494 error = 0; 1495 break; 1496 case VM_EXITCODE_HLT: 1497 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 1498 error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); 1499 break; 1500 case VM_EXITCODE_PAGING: 1501 error = vm_handle_paging(vm, vcpuid, &retu); 1502 break; 1503 case VM_EXITCODE_INST_EMUL: 1504 error = vm_handle_inst_emul(vm, vcpuid, &retu); 1505 break; 1506 case VM_EXITCODE_INOUT: 1507 case VM_EXITCODE_INOUT_STR: 1508 error = vm_handle_inout(vm, vcpuid, vme, &retu); 1509 break; 1510 case VM_EXITCODE_MONITOR: 1511 case VM_EXITCODE_MWAIT: 
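			/*
			 * MONITOR/MWAIT are not emulated; reflect #UD into
			 * the guest.
			 */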
1512 vm_inject_ud(vm, vcpuid); 1513 break; 1514 default: 1515 retu = true; /* handled in userland */ 1516 break; 1517 } 1518 } 1519 1520 if (error == 0 && retu == false) { 1521 rip = vme->rip + vme->inst_length; 1522 goto restart; 1523 } 1524 1525 /* copy the exit information */ 1526 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); 1527 return (error); 1528 } 1529 1530 int 1531 vm_restart_instruction(void *arg, int vcpuid) 1532 { 1533 struct vcpu *vcpu; 1534 struct vm *vm = arg; 1535 1536 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1537 return (EINVAL); 1538 1539 vcpu = &vm->vcpu[vcpuid]; 1540 vcpu->exitinfo.inst_length = 0; 1541 return (0); 1542 } 1543 1544 int 1545 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 1546 { 1547 struct vcpu *vcpu; 1548 int type, vector; 1549 1550 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1551 return (EINVAL); 1552 1553 vcpu = &vm->vcpu[vcpuid]; 1554 1555 if (info & VM_INTINFO_VALID) { 1556 type = info & VM_INTINFO_TYPE; 1557 vector = info & 0xff; 1558 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 1559 return (EINVAL); 1560 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 1561 return (EINVAL); 1562 if (info & VM_INTINFO_RSVD) 1563 return (EINVAL); 1564 } else { 1565 info = 0; 1566 } 1567 VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); 1568 vcpu->exitintinfo = info; 1569 return (0); 1570 } 1571 1572 enum exc_class { 1573 EXC_BENIGN, 1574 EXC_CONTRIBUTORY, 1575 EXC_PAGEFAULT 1576 }; 1577 1578 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 1579 1580 static enum exc_class 1581 exception_class(uint64_t info) 1582 { 1583 int type, vector; 1584 1585 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); 1586 type = info & VM_INTINFO_TYPE; 1587 vector = info & 0xff; 1588 1589 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 1590 switch (type) { 1591 case VM_INTINFO_HWINTR: 1592 case VM_INTINFO_SWINTR: 1593 case VM_INTINFO_NMI: 1594 return (EXC_BENIGN); 1595 default: 1596 /* 1597 * Hardware exception. 1598 * 1599 * SVM and VT-x use identical type values to represent NMI, 1600 * hardware interrupt and software interrupt. 1601 * 1602 * SVM uses type '3' for all exceptions. VT-x uses type '3' 1603 * for exceptions except #BP and #OF. #BP and #OF use a type 1604 * value of '5' or '6'. Therefore we don't check for explicit 1605 * values of 'type' to classify 'intinfo' into a hardware 1606 * exception. 1607 */ 1608 break; 1609 } 1610 1611 switch (vector) { 1612 case IDT_PF: 1613 case IDT_VE: 1614 return (EXC_PAGEFAULT); 1615 case IDT_DE: 1616 case IDT_TS: 1617 case IDT_NP: 1618 case IDT_SS: 1619 case IDT_GP: 1620 return (EXC_CONTRIBUTORY); 1621 default: 1622 return (EXC_BENIGN); 1623 } 1624 } 1625 1626 static int 1627 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 1628 uint64_t *retinfo) 1629 { 1630 enum exc_class exc1, exc2; 1631 int type1, vector1; 1632 1633 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); 1634 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); 1635 1636 /* 1637 * If an exception occurs while attempting to call the double-fault 1638 * handler the processor enters shutdown mode (aka triple fault). 
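 * For example, a #GP raised while delivering a #PF is converted into a
 * #DF by the classification rules below, and any further fault while
 * delivering that #DF suspends the VM with VM_SUSPEND_TRIPLEFAULT.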
1639 */ 1640 type1 = info1 & VM_INTINFO_TYPE; 1641 vector1 = info1 & 0xff; 1642 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 1643 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", 1644 info1, info2); 1645 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 1646 *retinfo = 0; 1647 return (0); 1648 } 1649 1650 /* 1651 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 1652 */ 1653 exc1 = exception_class(info1); 1654 exc2 = exception_class(info2); 1655 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 1656 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 1657 /* Convert nested fault into a double fault. */ 1658 *retinfo = IDT_DF; 1659 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1660 *retinfo |= VM_INTINFO_DEL_ERRCODE; 1661 } else { 1662 /* Handle exceptions serially */ 1663 *retinfo = info2; 1664 } 1665 return (1); 1666 } 1667 1668 static uint64_t 1669 vcpu_exception_intinfo(struct vcpu *vcpu) 1670 { 1671 uint64_t info = 0; 1672 1673 if (vcpu->exception_pending) { 1674 info = vcpu->exc_vector & 0xff; 1675 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 1676 if (vcpu->exc_errcode_valid) { 1677 info |= VM_INTINFO_DEL_ERRCODE; 1678 info |= (uint64_t)vcpu->exc_errcode << 32; 1679 } 1680 } 1681 return (info); 1682 } 1683 1684 int 1685 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 1686 { 1687 struct vcpu *vcpu; 1688 uint64_t info1, info2; 1689 int valid; 1690 1691 KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); 1692 1693 vcpu = &vm->vcpu[vcpuid]; 1694 1695 info1 = vcpu->exitintinfo; 1696 vcpu->exitintinfo = 0; 1697 1698 info2 = 0; 1699 if (vcpu->exception_pending) { 1700 info2 = vcpu_exception_intinfo(vcpu); 1701 vcpu->exception_pending = 0; 1702 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", 1703 vcpu->exc_vector, info2); 1704 } 1705 1706 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 1707 valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 1708 } else if (info1 & VM_INTINFO_VALID) { 1709 *retinfo = info1; 1710 valid = 1; 1711 } else if (info2 & VM_INTINFO_VALID) { 1712 *retinfo = info2; 1713 valid = 1; 1714 } else { 1715 valid = 0; 1716 } 1717 1718 if (valid) { 1719 VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " 1720 "retinfo(%#lx)", __func__, info1, info2, *retinfo); 1721 } 1722 1723 return (valid); 1724 } 1725 1726 int 1727 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 1728 { 1729 struct vcpu *vcpu; 1730 1731 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1732 return (EINVAL); 1733 1734 vcpu = &vm->vcpu[vcpuid]; 1735 *info1 = vcpu->exitintinfo; 1736 *info2 = vcpu_exception_intinfo(vcpu); 1737 return (0); 1738 } 1739 1740 int 1741 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, 1742 uint32_t errcode, int restart_instruction) 1743 { 1744 struct vcpu *vcpu; 1745 int error; 1746 1747 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1748 return (EINVAL); 1749 1750 if (vector < 0 || vector >= 32) 1751 return (EINVAL); 1752 1753 /* 1754 * A double fault exception should never be injected directly into 1755 * the guest. It is a derived exception that results from specific 1756 * combinations of nested faults. 
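 * (#DF is instead synthesized by nested_fault() above when two pending
 * events combine according to the rules in the Intel SDM.)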
1757 */ 1758 if (vector == IDT_DF) 1759 return (EINVAL); 1760 1761 vcpu = &vm->vcpu[vcpuid]; 1762 1763 if (vcpu->exception_pending) { 1764 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 1765 "pending exception %d", vector, vcpu->exc_vector); 1766 return (EBUSY); 1767 } 1768 1769 /* 1770 * From section 26.6.1 "Interruptibility State" in Intel SDM: 1771 * 1772 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 1773 * one instruction or incurs an exception. 1774 */ 1775 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 1776 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 1777 __func__, error)); 1778 1779 if (restart_instruction) 1780 vm_restart_instruction(vm, vcpuid); 1781 1782 vcpu->exception_pending = 1; 1783 vcpu->exc_vector = vector; 1784 vcpu->exc_errcode = errcode; 1785 vcpu->exc_errcode_valid = errcode_valid; 1786 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); 1787 return (0); 1788 } 1789 1790 void 1791 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, 1792 int errcode) 1793 { 1794 struct vm *vm; 1795 int error, restart_instruction; 1796 1797 vm = vmarg; 1798 restart_instruction = 1; 1799 1800 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, 1801 errcode, restart_instruction); 1802 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 1803 } 1804 1805 void 1806 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) 1807 { 1808 struct vm *vm; 1809 int error; 1810 1811 vm = vmarg; 1812 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", 1813 error_code, cr2); 1814 1815 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 1816 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 1817 1818 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 1819 } 1820 1821 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 1822 1823 int 1824 vm_inject_nmi(struct vm *vm, int vcpuid) 1825 { 1826 struct vcpu *vcpu; 1827 1828 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1829 return (EINVAL); 1830 1831 vcpu = &vm->vcpu[vcpuid]; 1832 1833 vcpu->nmi_pending = 1; 1834 vcpu_notify_event(vm, vcpuid, false); 1835 return (0); 1836 } 1837 1838 int 1839 vm_nmi_pending(struct vm *vm, int vcpuid) 1840 { 1841 struct vcpu *vcpu; 1842 1843 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1844 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 1845 1846 vcpu = &vm->vcpu[vcpuid]; 1847 1848 return (vcpu->nmi_pending); 1849 } 1850 1851 void 1852 vm_nmi_clear(struct vm *vm, int vcpuid) 1853 { 1854 struct vcpu *vcpu; 1855 1856 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1857 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 1858 1859 vcpu = &vm->vcpu[vcpuid]; 1860 1861 if (vcpu->nmi_pending == 0) 1862 panic("vm_nmi_clear: inconsistent nmi_pending state"); 1863 1864 vcpu->nmi_pending = 0; 1865 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 1866 } 1867 1868 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 1869 1870 int 1871 vm_inject_extint(struct vm *vm, int vcpuid) 1872 { 1873 struct vcpu *vcpu; 1874 1875 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1876 return (EINVAL); 1877 1878 vcpu = &vm->vcpu[vcpuid]; 1879 1880 vcpu->extint_pending = 1; 1881 vcpu_notify_event(vm, vcpuid, false); 1882 return (0); 1883 } 1884 1885 int 1886 vm_extint_pending(struct vm *vm, int vcpuid) 1887 { 1888 struct vcpu *vcpu; 1889 1890 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1891 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 1892 1893 vcpu = &vm->vcpu[vcpuid]; 
1894 1895 return (vcpu->extint_pending); 1896 } 1897 1898 void 1899 vm_extint_clear(struct vm *vm, int vcpuid) 1900 { 1901 struct vcpu *vcpu; 1902 1903 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 1904 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 1905 1906 vcpu = &vm->vcpu[vcpuid]; 1907 1908 if (vcpu->extint_pending == 0) 1909 panic("vm_extint_clear: inconsistent extint_pending state"); 1910 1911 vcpu->extint_pending = 0; 1912 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 1913 } 1914 1915 int 1916 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 1917 { 1918 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1919 return (EINVAL); 1920 1921 if (type < 0 || type >= VM_CAP_MAX) 1922 return (EINVAL); 1923 1924 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 1925 } 1926 1927 int 1928 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 1929 { 1930 if (vcpu < 0 || vcpu >= VM_MAXCPU) 1931 return (EINVAL); 1932 1933 if (type < 0 || type >= VM_CAP_MAX) 1934 return (EINVAL); 1935 1936 return (VMSETCAP(vm->cookie, vcpu, type, val)); 1937 } 1938 1939 struct vlapic * 1940 vm_lapic(struct vm *vm, int cpu) 1941 { 1942 return (vm->vcpu[cpu].vlapic); 1943 } 1944 1945 struct vioapic * 1946 vm_ioapic(struct vm *vm) 1947 { 1948 1949 return (vm->vioapic); 1950 } 1951 1952 struct vhpet * 1953 vm_hpet(struct vm *vm) 1954 { 1955 1956 return (vm->vhpet); 1957 } 1958 1959 boolean_t 1960 vmm_is_pptdev(int bus, int slot, int func) 1961 { 1962 int found, i, n; 1963 int b, s, f; 1964 char *val, *cp, *cp2; 1965 1966 /* 1967 * XXX 1968 * The length of an environment variable is limited to 128 bytes which 1969 * puts an upper limit on the number of passthru devices that may be 1970 * specified using a single environment variable. 1971 * 1972 * Work around this by scanning multiple environment variable 1973 * names instead of a single one - yuck! 
1974 */ 1975 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; 1976 1977 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ 1978 found = 0; 1979 for (i = 0; names[i] != NULL && !found; i++) { 1980 cp = val = kern_getenv(names[i]); 1981 while (cp != NULL && *cp != '\0') { 1982 if ((cp2 = strchr(cp, ' ')) != NULL) 1983 *cp2 = '\0'; 1984 1985 n = sscanf(cp, "%d/%d/%d", &b, &s, &f); 1986 if (n == 3 && bus == b && slot == s && func == f) { 1987 found = 1; 1988 break; 1989 } 1990 1991 if (cp2 != NULL) 1992 *cp2++ = ' '; 1993 1994 cp = cp2; 1995 } 1996 freeenv(val); 1997 } 1998 return (found); 1999 } 2000 2001 void * 2002 vm_iommu_domain(struct vm *vm) 2003 { 2004 2005 return (vm->iommu); 2006 } 2007 2008 int 2009 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 2010 bool from_idle) 2011 { 2012 int error; 2013 struct vcpu *vcpu; 2014 2015 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2016 panic("vm_set_run_state: invalid vcpuid %d", vcpuid); 2017 2018 vcpu = &vm->vcpu[vcpuid]; 2019 2020 vcpu_lock(vcpu); 2021 error = vcpu_set_state_locked(vcpu, newstate, from_idle); 2022 vcpu_unlock(vcpu); 2023 2024 return (error); 2025 } 2026 2027 enum vcpu_state 2028 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 2029 { 2030 struct vcpu *vcpu; 2031 enum vcpu_state state; 2032 2033 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2034 panic("vm_get_run_state: invalid vcpuid %d", vcpuid); 2035 2036 vcpu = &vm->vcpu[vcpuid]; 2037 2038 vcpu_lock(vcpu); 2039 state = vcpu->state; 2040 if (hostcpu != NULL) 2041 *hostcpu = vcpu->hostcpu; 2042 vcpu_unlock(vcpu); 2043 2044 return (state); 2045 } 2046 2047 int 2048 vm_activate_cpu(struct vm *vm, int vcpuid) 2049 { 2050 2051 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2052 return (EINVAL); 2053 2054 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 2055 return (EBUSY); 2056 2057 VCPU_CTR0(vm, vcpuid, "activated"); 2058 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 2059 return (0); 2060 } 2061 2062 cpuset_t 2063 vm_active_cpus(struct vm *vm) 2064 { 2065 2066 return (vm->active_cpus); 2067 } 2068 2069 cpuset_t 2070 vm_suspended_cpus(struct vm *vm) 2071 { 2072 2073 return (vm->suspended_cpus); 2074 } 2075 2076 void * 2077 vcpu_stats(struct vm *vm, int vcpuid) 2078 { 2079 2080 return (vm->vcpu[vcpuid].stats); 2081 } 2082 2083 int 2084 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 2085 { 2086 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2087 return (EINVAL); 2088 2089 *state = vm->vcpu[vcpuid].x2apic_state; 2090 2091 return (0); 2092 } 2093 2094 int 2095 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 2096 { 2097 if (vcpuid < 0 || vcpuid >= VM_MAXCPU) 2098 return (EINVAL); 2099 2100 if (state >= X2APIC_STATE_LAST) 2101 return (EINVAL); 2102 2103 vm->vcpu[vcpuid].x2apic_state = state; 2104 2105 vlapic_set_x2apic_state(vm, vcpuid, state); 2106 2107 return (0); 2108 } 2109 2110 /* 2111 * This function is called to ensure that a vcpu "sees" a pending event 2112 * as soon as possible: 2113 * - If the vcpu thread is sleeping then it is woken up. 2114 * - If the vcpu is running on a different host_cpu then an IPI will be directed 2115 * to the host_cpu to cause the vcpu to trap into the hypervisor. 
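 * - If 'lapic_intr' is true the notification is posted through the vcpu's
 *   vlapic (vlapic_post_intr() with 'vmm_ipinum') so that hardware-assisted
 *   interrupt delivery can be used when it is available.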
2116 */ 2117 void 2118 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) 2119 { 2120 int hostcpu; 2121 struct vcpu *vcpu; 2122 2123 vcpu = &vm->vcpu[vcpuid]; 2124 2125 vcpu_lock(vcpu); 2126 hostcpu = vcpu->hostcpu; 2127 if (vcpu->state == VCPU_RUNNING) { 2128 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 2129 if (hostcpu != curcpu) { 2130 if (lapic_intr) { 2131 vlapic_post_intr(vcpu->vlapic, hostcpu, 2132 vmm_ipinum); 2133 } else { 2134 ipi_cpu(hostcpu, vmm_ipinum); 2135 } 2136 } else { 2137 /* 2138 * If the 'vcpu' is running on 'curcpu' then it must 2139 * be sending a notification to itself (e.g. SELF_IPI). 2140 * The pending event will be picked up when the vcpu 2141 * transitions back to guest context. 2142 */ 2143 } 2144 } else { 2145 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 2146 "with hostcpu %d", vcpu->state, hostcpu)); 2147 if (vcpu->state == VCPU_SLEEPING) 2148 wakeup_one(vcpu); 2149 } 2150 vcpu_unlock(vcpu); 2151 } 2152 2153 struct vmspace * 2154 vm_get_vmspace(struct vm *vm) 2155 { 2156 2157 return (vm->vmspace); 2158 } 2159 2160 int 2161 vm_apicid2vcpuid(struct vm *vm, int apicid) 2162 { 2163 /* 2164 * XXX apic id is assumed to be numerically identical to vcpu id 2165 */ 2166 return (apicid); 2167 } 2168 2169 void 2170 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, 2171 vm_rendezvous_func_t func, void *arg) 2172 { 2173 int i; 2174 2175 /* 2176 * Enforce that this function is called without any locks 2177 */ 2178 WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); 2179 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), 2180 ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); 2181 2182 restart: 2183 mtx_lock(&vm->rendezvous_mtx); 2184 if (vm->rendezvous_func != NULL) { 2185 /* 2186 * If a rendezvous is already in progress then we need to 2187 * call the rendezvous handler in case this 'vcpuid' is one 2188 * of the targets of the rendezvous. 2189 */ 2190 RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); 2191 mtx_unlock(&vm->rendezvous_mtx); 2192 vm_handle_rendezvous(vm, vcpuid); 2193 goto restart; 2194 } 2195 KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " 2196 "rendezvous is still in progress")); 2197 2198 RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); 2199 vm->rendezvous_req_cpus = dest; 2200 CPU_ZERO(&vm->rendezvous_done_cpus); 2201 vm->rendezvous_arg = arg; 2202 vm_set_rendezvous_func(vm, func); 2203 mtx_unlock(&vm->rendezvous_mtx); 2204 2205 /* 2206 * Wake up any sleeping vcpus and trigger a VM-exit in any running 2207 * vcpus so they handle the rendezvous as soon as possible. 
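 *
 * A hypothetical caller looks like this (sketch only; 'example_cb' is a
 * made-up name, its signature follows vm_rendezvous_func_t):
 *
 *	static void
 *	example_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		(per-vcpu work; runs once for every vcpu in 'dest')
 *	}
 *
 *	vm_smp_rendezvous(vm, vcpuid, vm_active_cpus(vm), example_cb, NULL);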
2208 */ 2209 for (i = 0; i < VM_MAXCPU; i++) { 2210 if (CPU_ISSET(i, &dest)) 2211 vcpu_notify_event(vm, i, false); 2212 } 2213 2214 vm_handle_rendezvous(vm, vcpuid); 2215 } 2216 2217 struct vatpic * 2218 vm_atpic(struct vm *vm) 2219 { 2220 return (vm->vatpic); 2221 } 2222 2223 struct vatpit * 2224 vm_atpit(struct vm *vm) 2225 { 2226 return (vm->vatpit); 2227 } 2228 2229 struct vpmtmr * 2230 vm_pmtmr(struct vm *vm) 2231 { 2232 2233 return (vm->vpmtmr); 2234 } 2235 2236 struct vrtc * 2237 vm_rtc(struct vm *vm) 2238 { 2239 2240 return (vm->vrtc); 2241 } 2242 2243 enum vm_reg_name 2244 vm_segment_name(int seg) 2245 { 2246 static enum vm_reg_name seg_names[] = { 2247 VM_REG_GUEST_ES, 2248 VM_REG_GUEST_CS, 2249 VM_REG_GUEST_SS, 2250 VM_REG_GUEST_DS, 2251 VM_REG_GUEST_FS, 2252 VM_REG_GUEST_GS 2253 }; 2254 2255 KASSERT(seg >= 0 && seg < nitems(seg_names), 2256 ("%s: invalid segment encoding %d", __func__, seg)); 2257 return (seg_names[seg]); 2258 } 2259 2260 void 2261 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 2262 int num_copyinfo) 2263 { 2264 int idx; 2265 2266 for (idx = 0; idx < num_copyinfo; idx++) { 2267 if (copyinfo[idx].cookie != NULL) 2268 vm_gpa_release(copyinfo[idx].cookie); 2269 } 2270 bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); 2271 } 2272 2273 int 2274 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 2275 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 2276 int num_copyinfo) 2277 { 2278 int error, idx, nused; 2279 size_t n, off, remaining; 2280 void *hva, *cookie; 2281 uint64_t gpa; 2282 2283 bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); 2284 2285 nused = 0; 2286 remaining = len; 2287 while (remaining > 0) { 2288 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 2289 error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); 2290 if (error) 2291 return (error); 2292 off = gpa & PAGE_MASK; 2293 n = min(remaining, PAGE_SIZE - off); 2294 copyinfo[nused].gpa = gpa; 2295 copyinfo[nused].len = n; 2296 remaining -= n; 2297 gla += n; 2298 nused++; 2299 } 2300 2301 for (idx = 0; idx < nused; idx++) { 2302 hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, 2303 prot, &cookie); 2304 if (hva == NULL) 2305 break; 2306 copyinfo[idx].hva = hva; 2307 copyinfo[idx].cookie = cookie; 2308 } 2309 2310 if (idx != nused) { 2311 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 2312 return (-1); 2313 } else { 2314 return (0); 2315 } 2316 } 2317 2318 void 2319 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 2320 size_t len) 2321 { 2322 char *dst; 2323 int idx; 2324 2325 dst = kaddr; 2326 idx = 0; 2327 while (len > 0) { 2328 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 2329 len -= copyinfo[idx].len; 2330 dst += copyinfo[idx].len; 2331 idx++; 2332 } 2333 } 2334 2335 void 2336 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 2337 struct vm_copyinfo *copyinfo, size_t len) 2338 { 2339 const char *src; 2340 int idx; 2341 2342 src = kaddr; 2343 idx = 0; 2344 while (len > 0) { 2345 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 2346 len -= copyinfo[idx].len; 2347 src += copyinfo[idx].len; 2348 idx++; 2349 } 2350 } 2351 2352 /* 2353 * Return the amount of in-use and wired memory for the VM. 
Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
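/*
 * Usage sketch for the vm_copy_*() helpers above (hypothetical caller, not
 * part of this file; 'vm', 'vcpuid', 'paging' and 'gla' come from the
 * caller's context): copy a guest buffer named by a linear address into
 * the kernel and release the underlying pages again.
 *
 *	struct vm_copyinfo copyinfo[2];
 *	char buf[64];
 *	int error;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, sizeof(buf),
 *	    VM_PROT_READ, copyinfo, nitems(copyinfo));
 *	if (error == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, sizeof(buf));
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */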