1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2021 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. 
Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

/*
 * Driver-wide state (vmmdev_*).  Per the locking notes, this group is
 * protected by vmmdev_mtx.
 */
static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

/*
 * Instance tracking state (vmm_*).  Per the locking notes, this group is
 * protected by vmm_mtx.
 */
static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static list_t		vmm_destroy_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

/* NOTE(review): presumably the name handed to HMA registration -- confirm. */
static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	/* List linkage; the owning list is not visible in this section. */
	list_node_t	vmh_node;
	/* Instance this hold refers to. */
	vmm_softc_t	*vmh_sc;
	/* NOTE(review): presumably set once release is requested -- confirm. */
	boolean_t	vmh_release_req;
	/* NOTE(review): presumably a count of ioport hooks -- confirm. */
	uint_t		vmh_ioport_hook_cnt;
};

/*
 * Lease state for a driver consumer.
 * NOTE(review): field meanings below are inferred from names and from the
 * "Resource Locking and Exclusion" notes in this file; verify against the
 * lease functions, which are not visible in this section.
 */
struct vmm_lease {
	/* List linkage; the owning list is not visible in this section. */
	list_node_t		vml_node;
	struct vm		*vml_vm;
	vm_client_t		*vml_vmclient;
	boolean_t		vml_expired;
	boolean_t		vml_break_deferred;
	/* Callback and its argument; presumably invoked on lease expiry. */
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	/* Hold with which this lease is associated. */
	struct vmm_hold		*vml_hold;
};

/* Forward declarations of file-local helpers referenced before definition. */
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.
Direct access to the underlying vm memory 126 * segments is exposed by placing them in a range of offsets beyond the normal 127 * guest memory space. Userspace can query the appropriate offset to mmap() 128 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 129 */ 130 131 static vmm_devmem_entry_t * 132 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 133 { 134 vmm_devmem_entry_t *ent = NULL; 135 list_t *dl = &sc->vmm_devmem_list; 136 137 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 138 if (ent->vde_segid == segid) { 139 return (ent); 140 } 141 } 142 return (NULL); 143 } 144 145 static int 146 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 147 { 148 int error; 149 bool sysmem; 150 151 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 152 NULL); 153 if (error || mseg->len == 0) 154 return (error); 155 156 if (!sysmem) { 157 vmm_devmem_entry_t *de; 158 159 de = vmmdev_devmem_find(sc, mseg->segid); 160 if (de != NULL) { 161 (void) strlcpy(mseg->name, de->vde_name, 162 sizeof (mseg->name)); 163 } 164 } else { 165 bzero(mseg->name, sizeof (mseg->name)); 166 } 167 168 return (error); 169 } 170 171 static int 172 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 173 { 174 off_t map_offset; 175 vmm_devmem_entry_t *entry; 176 177 if (list_is_empty(&sc->vmm_devmem_list)) { 178 map_offset = VM_DEVMEM_START; 179 } else { 180 entry = list_tail(&sc->vmm_devmem_list); 181 map_offset = entry->vde_off + entry->vde_len; 182 if (map_offset < entry->vde_off) { 183 /* Do not tolerate overflow */ 184 return (ERANGE); 185 } 186 /* 187 * XXXJOY: We could choose to search the list for duplicate 188 * names and toss an error. Since we're using the offset 189 * method for now, it does not make much of a difference. 
190 */ 191 } 192 193 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 194 entry->vde_segid = mseg->segid; 195 entry->vde_len = mseg->len; 196 entry->vde_off = map_offset; 197 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 198 list_insert_tail(&sc->vmm_devmem_list, entry); 199 200 return (0); 201 } 202 203 static boolean_t 204 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 205 off_t *map_offp) 206 { 207 list_t *dl = &sc->vmm_devmem_list; 208 vmm_devmem_entry_t *de = NULL; 209 const off_t map_end = off + len; 210 211 VERIFY(off >= VM_DEVMEM_START); 212 213 if (map_end < off) { 214 /* No match on overflow */ 215 return (B_FALSE); 216 } 217 218 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 219 const off_t item_end = de->vde_off + de->vde_len; 220 221 if (de->vde_off <= off && item_end >= map_end) { 222 *segidp = de->vde_segid; 223 *map_offp = off - de->vde_off; 224 return (B_TRUE); 225 } 226 } 227 return (B_FALSE); 228 } 229 230 static void 231 vmmdev_devmem_purge(vmm_softc_t *sc) 232 { 233 vmm_devmem_entry_t *entry; 234 235 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 236 kmem_free(entry, sizeof (*entry)); 237 } 238 } 239 240 static int 241 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 242 { 243 int error; 244 bool sysmem = true; 245 246 if (VM_MEMSEG_NAME(mseg)) { 247 sysmem = false; 248 } 249 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 250 251 if (error == 0) { 252 /* 253 * Rather than create a whole fresh device from which userspace 254 * can mmap this segment, instead make it available at an 255 * offset above where the main guest memory resides. 
256 */ 257 error = vmmdev_devmem_create(sc, mseg, mseg->name); 258 if (error != 0) { 259 vm_free_memseg(sc->vmm_vm, mseg->segid); 260 } 261 } 262 return (error); 263 } 264 265 /* 266 * Resource Locking and Exclusion 267 * 268 * Much of bhyve depends on key portions of VM state, such as the guest memory 269 * map, to remain unchanged while the guest is running. As ported from 270 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 271 * access to the instance vCPUs. Threads acting on a single vCPU, like those 272 * performing the work of actually running the guest in VMX/SVM, would lock 273 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 274 * state, all of the vCPUs would be first locked, ensuring that the 275 * operation(s) could complete without any other threads stumbling into 276 * intermediate states. 277 * 278 * This approach is largely effective for bhyve. Common operations, such as 279 * running the vCPUs, steer clear of lock contention. The model begins to 280 * break down for operations which do not occur in the context of a specific 281 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 282 * thread in the bhyve process. In order to properly protect those vCPU-less 283 * operations from encountering invalid states, additional locking is required. 284 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 285 * It does mean that class of operations will be serialized on locking the 286 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 287 * undue contention on the VM_MAXCPU-1 vCPU. 288 * 289 * In order to address the shortcomings of this model, the concept of a 290 * read/write lock has been added to bhyve. Operations which change 291 * fundamental aspects of a VM (such as the memory map) must acquire the write 292 * lock, which also implies locking all of the vCPUs and waiting for all read 293 * lock holders to release. 
While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time. Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */

/*
 * Lock a single vCPU by moving it into the VCPU_FROZEN state.
 *
 * 'vcpu' must be in the range [0, VM_MAXCPU); this is only checked by ASSERT.
 */
static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition is utilizing from_idle=true, it should
	 * not fail, but rather block until it can be successful.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

/*
 * Release a vCPU previously locked with vcpu_lock_one(), returning it to the
 * VCPU_IDLE state.  The vCPU is verified to be in VCPU_FROZEN beforehand.
 */
static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
}

/* Acquire the instance rwlock as a reader. */
static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}

/* Drop the instance rwlock taken by vmm_read_lock(). */
static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}

/*
 * Acquire the VM write lock.
 *
 * The sequence is: lock every vCPU, block vmm_drv leases, then take the
 * instance rwlock as a writer.  vmm_write_unlock() undoes these steps.
 */
static void
vmm_write_lock(vmm_softc_t *sc)
{
	int maxcpus;

	/* First lock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
	}

	/*
	 * Block vmm_drv leases from being acquired or held while the VM write
	 * lock is held.
	 */
	vmm_lease_block(sc);

	rw_enter(&sc->vmm_rwlock, RW_WRITER);
	/*
	 * For now, the 'maxcpus' value for an instance is fixed at the
	 * compile-time constant of VM_MAXCPU at creation. If this changes in
	 * the future, allowing for dynamic vCPU resource sizing, acquisition
	 * of the write lock will need to be wary of such changes.
	 */
	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

/*
 * Release the VM write lock taken by vmm_write_lock().
 *
 * Leases are unblocked first, then the rwlock is dropped (after verifying the
 * caller holds it as a writer), and finally every vCPU is unlocked.
 */
static void
vmm_write_unlock(vmm_softc_t *sc)
{
	int maxcpus;

	/* Allow vmm_drv leases to be acquired once write lock is dropped */
	vmm_lease_unblock(sc);

	/*
	 * The VM write lock _must_ be released from the same thread it was
	 * acquired in, unlike the read lock.
	 */
	VERIFY(rw_write_held(&sc->vmm_rwlock));
	rw_exit(&sc->vmm_rwlock);

	/* Unlock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_unlock_one(sc, vcpu);
	}
}

static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
	int error = 0, vcpu = -1;
	void *datap = (void *)arg;
	enum vm_lock_type {
		LOCK_NONE = 0,
		LOCK_VCPU,
		LOCK_READ_HOLD,
		LOCK_WRITE_HOLD
	} lock_type = LOCK_NONE;

	/* Acquire any exclusion resources needed for the operation.
*/ 401 switch (cmd) { 402 case VM_RUN: 403 case VM_GET_REGISTER: 404 case VM_SET_REGISTER: 405 case VM_GET_SEGMENT_DESCRIPTOR: 406 case VM_SET_SEGMENT_DESCRIPTOR: 407 case VM_GET_REGISTER_SET: 408 case VM_SET_REGISTER_SET: 409 case VM_INJECT_EXCEPTION: 410 case VM_GET_CAPABILITY: 411 case VM_SET_CAPABILITY: 412 case VM_PPTDEV_MSI: 413 case VM_PPTDEV_MSIX: 414 case VM_SET_X2APIC_STATE: 415 case VM_GLA2GPA: 416 case VM_GLA2GPA_NOFAULT: 417 case VM_ACTIVATE_CPU: 418 case VM_SET_INTINFO: 419 case VM_GET_INTINFO: 420 case VM_RESTART_INSTRUCTION: 421 case VM_SET_KERNEMU_DEV: 422 case VM_GET_KERNEMU_DEV: 423 case VM_RESET_CPU: 424 case VM_GET_RUN_STATE: 425 case VM_SET_RUN_STATE: 426 case VM_GET_FPU: 427 case VM_SET_FPU: 428 /* 429 * Copy in the ID of the vCPU chosen for this operation. 430 * Since a nefarious caller could update their struct between 431 * this locking and when the rest of the ioctl data is copied 432 * in, it is _critical_ that this local 'vcpu' variable be used 433 * rather than the in-struct one when performing the ioctl. 
434 */ 435 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 436 return (EFAULT); 437 } 438 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 439 return (EINVAL); 440 } 441 vcpu_lock_one(sc, vcpu); 442 lock_type = LOCK_VCPU; 443 break; 444 445 case VM_REINIT: 446 case VM_BIND_PPTDEV: 447 case VM_UNBIND_PPTDEV: 448 case VM_MAP_PPTDEV_MMIO: 449 case VM_UNMAP_PPTDEV_MMIO: 450 case VM_ALLOC_MEMSEG: 451 case VM_MMAP_MEMSEG: 452 case VM_MUNMAP_MEMSEG: 453 case VM_WRLOCK_CYCLE: 454 case VM_PMTMR_LOCATE: 455 vmm_write_lock(sc); 456 lock_type = LOCK_WRITE_HOLD; 457 break; 458 459 case VM_GET_MEMSEG: 460 case VM_MMAP_GETNEXT: 461 case VM_LAPIC_IRQ: 462 case VM_INJECT_NMI: 463 case VM_IOAPIC_ASSERT_IRQ: 464 case VM_IOAPIC_DEASSERT_IRQ: 465 case VM_IOAPIC_PULSE_IRQ: 466 case VM_LAPIC_MSI: 467 case VM_LAPIC_LOCAL_IRQ: 468 case VM_GET_X2APIC_STATE: 469 case VM_RTC_READ: 470 case VM_RTC_WRITE: 471 case VM_RTC_SETTIME: 472 case VM_RTC_GETTIME: 473 case VM_PPTDEV_DISABLE_MSIX: 474 case VM_DEVMEM_GETOFFSET: 475 case VM_TRACK_DIRTY_PAGES: 476 vmm_read_lock(sc); 477 lock_type = LOCK_READ_HOLD; 478 break; 479 480 case VM_GET_GPA_PMAP: 481 case VM_IOAPIC_PINCOUNT: 482 case VM_SUSPEND: 483 case VM_DESC_FPU_AREA: 484 default: 485 break; 486 } 487 488 /* Execute the primary logic for the ioctl. */ 489 switch (cmd) { 490 case VM_RUN: { 491 struct vm_entry entry; 492 493 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 494 error = EFAULT; 495 break; 496 } 497 498 if (!(curthread->t_schedflag & TS_VCPU)) 499 smt_mark_as_vcpu(); 500 501 error = vm_run(sc->vmm_vm, vcpu, &entry); 502 503 /* 504 * Unexpected states in vm_run() are expressed through positive 505 * errno-oriented return values. VM states which expect further 506 * processing in userspace (necessary context via exitinfo) are 507 * expressed through negative return values. For the time being 508 * a return value of 0 is not expected from vm_run(). 
509 */ 510 ASSERT(error != 0); 511 if (error < 0) { 512 const struct vm_exit *vme; 513 void *outp = entry.exit_data; 514 515 error = 0; 516 vme = vm_exitinfo(sc->vmm_vm, vcpu); 517 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 518 error = EFAULT; 519 } 520 } 521 break; 522 } 523 case VM_SUSPEND: { 524 struct vm_suspend vmsuspend; 525 526 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 527 error = EFAULT; 528 break; 529 } 530 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 531 break; 532 } 533 case VM_REINIT: { 534 struct vm_reinit reinit; 535 536 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 537 error = EFAULT; 538 break; 539 } 540 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 541 /* 542 * The VM instance should be free of driver-attached 543 * hooks during the reinitialization process. 544 */ 545 break; 546 } 547 error = vm_reinit(sc->vmm_vm, reinit.flags); 548 (void) vmm_drv_block_hook(sc, B_FALSE); 549 break; 550 } 551 case VM_STAT_DESC: { 552 struct vm_stat_desc statdesc; 553 554 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 555 error = EFAULT; 556 break; 557 } 558 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 559 sizeof (statdesc.desc)); 560 if (error == 0 && 561 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 562 error = EFAULT; 563 break; 564 } 565 break; 566 } 567 case VM_STATS_IOC: { 568 struct vm_stats vmstats; 569 570 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 571 error = EFAULT; 572 break; 573 } 574 hrt2tv(gethrtime(), &vmstats.tv); 575 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 576 nitems(vmstats.statbuf), 577 &vmstats.num_entries, vmstats.statbuf); 578 if (error == 0 && 579 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 580 error = EFAULT; 581 break; 582 } 583 break; 584 } 585 586 case VM_PPTDEV_MSI: { 587 struct vm_pptdev_msi pptmsi; 588 589 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 590 error = EFAULT; 591 break; 592 } 593 error = 
ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 594 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 595 break; 596 } 597 case VM_PPTDEV_MSIX: { 598 struct vm_pptdev_msix pptmsix; 599 600 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 601 error = EFAULT; 602 break; 603 } 604 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 605 pptmsix.idx, pptmsix.addr, pptmsix.msg, 606 pptmsix.vector_control); 607 break; 608 } 609 case VM_PPTDEV_DISABLE_MSIX: { 610 struct vm_pptdev pptdev; 611 612 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 613 error = EFAULT; 614 break; 615 } 616 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 617 break; 618 } 619 case VM_MAP_PPTDEV_MMIO: { 620 struct vm_pptdev_mmio pptmmio; 621 622 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 623 error = EFAULT; 624 break; 625 } 626 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 627 pptmmio.len, pptmmio.hpa); 628 break; 629 } 630 case VM_UNMAP_PPTDEV_MMIO: { 631 struct vm_pptdev_mmio pptmmio; 632 633 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 634 error = EFAULT; 635 break; 636 } 637 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 638 pptmmio.len); 639 break; 640 } 641 case VM_BIND_PPTDEV: { 642 struct vm_pptdev pptdev; 643 644 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 645 error = EFAULT; 646 break; 647 } 648 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 649 break; 650 } 651 case VM_UNBIND_PPTDEV: { 652 struct vm_pptdev pptdev; 653 654 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 655 error = EFAULT; 656 break; 657 } 658 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 659 break; 660 } 661 case VM_GET_PPTDEV_LIMITS: { 662 struct vm_pptdev_limits pptlimits; 663 664 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 665 error = EFAULT; 666 break; 667 } 668 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 669 &pptlimits.msi_limit, &pptlimits.msix_limit); 670 if (error == 0 && 671 
ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 672 error = EFAULT; 673 break; 674 } 675 break; 676 } 677 case VM_INJECT_EXCEPTION: { 678 struct vm_exception vmexc; 679 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 680 error = EFAULT; 681 break; 682 } 683 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 684 vmexc.error_code_valid, vmexc.error_code, 685 vmexc.restart_instruction); 686 break; 687 } 688 case VM_INJECT_NMI: { 689 struct vm_nmi vmnmi; 690 691 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 692 error = EFAULT; 693 break; 694 } 695 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 696 break; 697 } 698 case VM_LAPIC_IRQ: { 699 struct vm_lapic_irq vmirq; 700 701 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 702 error = EFAULT; 703 break; 704 } 705 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 706 break; 707 } 708 case VM_LAPIC_LOCAL_IRQ: { 709 struct vm_lapic_irq vmirq; 710 711 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 712 error = EFAULT; 713 break; 714 } 715 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 716 vmirq.vector); 717 break; 718 } 719 case VM_LAPIC_MSI: { 720 struct vm_lapic_msi vmmsi; 721 722 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 723 error = EFAULT; 724 break; 725 } 726 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 727 break; 728 } 729 730 case VM_IOAPIC_ASSERT_IRQ: { 731 struct vm_ioapic_irq ioapic_irq; 732 733 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 734 error = EFAULT; 735 break; 736 } 737 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 738 break; 739 } 740 case VM_IOAPIC_DEASSERT_IRQ: { 741 struct vm_ioapic_irq ioapic_irq; 742 743 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 748 break; 749 } 750 case VM_IOAPIC_PULSE_IRQ: { 751 struct vm_ioapic_irq ioapic_irq; 752 753 if (ddi_copyin(datap, 
&ioapic_irq, sizeof (ioapic_irq), md)) { 754 error = EFAULT; 755 break; 756 } 757 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 758 break; 759 } 760 case VM_IOAPIC_PINCOUNT: { 761 int pincount; 762 763 pincount = vioapic_pincount(sc->vmm_vm); 764 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 765 error = EFAULT; 766 break; 767 } 768 break; 769 } 770 case VM_DESC_FPU_AREA: { 771 struct vm_fpu_desc desc; 772 void *buf = NULL; 773 774 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 775 error = EFAULT; 776 break; 777 } 778 if (desc.vfd_num_entries > 64) { 779 error = EINVAL; 780 break; 781 } 782 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 783 desc.vfd_num_entries; 784 if (buf_sz != 0) { 785 buf = kmem_zalloc(buf_sz, KM_SLEEP); 786 } 787 788 /* 789 * For now, we are depending on vm_fpu_desc_entry and 790 * hma_xsave_state_desc_t having the same format. 791 */ 792 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 793 sizeof (hma_xsave_state_desc_t)); 794 795 size_t req_size; 796 const uint_t max_entries = hma_fpu_describe_xsave_state( 797 (hma_xsave_state_desc_t *)buf, 798 desc.vfd_num_entries, 799 &req_size); 800 801 desc.vfd_req_size = req_size; 802 desc.vfd_num_entries = max_entries; 803 if (buf_sz != 0) { 804 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 805 error = EFAULT; 806 } 807 kmem_free(buf, buf_sz); 808 } 809 810 if (error == 0) { 811 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 812 error = EFAULT; 813 } 814 } 815 break; 816 } 817 818 case VM_ISA_ASSERT_IRQ: { 819 struct vm_isa_irq isa_irq; 820 821 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 822 error = EFAULT; 823 break; 824 } 825 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 826 if (error == 0 && isa_irq.ioapic_irq != -1) { 827 error = vioapic_assert_irq(sc->vmm_vm, 828 isa_irq.ioapic_irq); 829 } 830 break; 831 } 832 case VM_ISA_DEASSERT_IRQ: { 833 struct vm_isa_irq isa_irq; 834 835 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), 
md)) { 836 error = EFAULT; 837 break; 838 } 839 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 840 if (error == 0 && isa_irq.ioapic_irq != -1) { 841 error = vioapic_deassert_irq(sc->vmm_vm, 842 isa_irq.ioapic_irq); 843 } 844 break; 845 } 846 case VM_ISA_PULSE_IRQ: { 847 struct vm_isa_irq isa_irq; 848 849 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 850 error = EFAULT; 851 break; 852 } 853 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 854 if (error == 0 && isa_irq.ioapic_irq != -1) { 855 error = vioapic_pulse_irq(sc->vmm_vm, 856 isa_irq.ioapic_irq); 857 } 858 break; 859 } 860 case VM_ISA_SET_IRQ_TRIGGER: { 861 struct vm_isa_irq_trigger isa_irq_trigger; 862 863 if (ddi_copyin(datap, &isa_irq_trigger, 864 sizeof (isa_irq_trigger), md)) { 865 error = EFAULT; 866 break; 867 } 868 error = vatpic_set_irq_trigger(sc->vmm_vm, 869 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 870 break; 871 } 872 873 case VM_MMAP_GETNEXT: { 874 struct vm_memmap mm; 875 876 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 877 error = EFAULT; 878 break; 879 } 880 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 881 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 882 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 883 error = EFAULT; 884 break; 885 } 886 break; 887 } 888 case VM_MMAP_MEMSEG: { 889 struct vm_memmap mm; 890 891 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 892 error = EFAULT; 893 break; 894 } 895 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 896 mm.len, mm.prot, mm.flags); 897 break; 898 } 899 case VM_MUNMAP_MEMSEG: { 900 struct vm_munmap mu; 901 902 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 903 error = EFAULT; 904 break; 905 } 906 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 907 break; 908 } 909 case VM_ALLOC_MEMSEG: { 910 struct vm_memseg vmseg; 911 912 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 913 error = EFAULT; 914 break; 915 } 916 error = vmmdev_alloc_memseg(sc, &vmseg); 917 
break; 918 } 919 case VM_GET_MEMSEG: { 920 struct vm_memseg vmseg; 921 922 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 923 error = EFAULT; 924 break; 925 } 926 error = vmmdev_get_memseg(sc, &vmseg); 927 if (error == 0 && 928 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 929 error = EFAULT; 930 break; 931 } 932 break; 933 } 934 case VM_GET_REGISTER: { 935 struct vm_register vmreg; 936 937 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 938 error = EFAULT; 939 break; 940 } 941 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 942 &vmreg.regval); 943 if (error == 0 && 944 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 945 error = EFAULT; 946 break; 947 } 948 break; 949 } 950 case VM_SET_REGISTER: { 951 struct vm_register vmreg; 952 953 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 954 error = EFAULT; 955 break; 956 } 957 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 958 vmreg.regval); 959 break; 960 } 961 case VM_SET_SEGMENT_DESCRIPTOR: { 962 struct vm_seg_desc vmsegd; 963 964 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 965 error = EFAULT; 966 break; 967 } 968 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 969 &vmsegd.desc); 970 break; 971 } 972 case VM_GET_SEGMENT_DESCRIPTOR: { 973 struct vm_seg_desc vmsegd; 974 975 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 976 error = EFAULT; 977 break; 978 } 979 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 980 &vmsegd.desc); 981 if (error == 0 && 982 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 983 error = EFAULT; 984 break; 985 } 986 break; 987 } 988 case VM_GET_REGISTER_SET: { 989 struct vm_register_set vrs; 990 int regnums[VM_REG_LAST]; 991 uint64_t regvals[VM_REG_LAST]; 992 993 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 994 error = EFAULT; 995 break; 996 } 997 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 998 error = EINVAL; 999 break; 1000 } 1001 if (ddi_copyin(vrs.regnums, regnums, 1002 sizeof (int) * vrs.count, md)) { 
1003 error = EFAULT; 1004 break; 1005 } 1006 1007 error = 0; 1008 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1009 if (regnums[i] < 0) { 1010 error = EINVAL; 1011 break; 1012 } 1013 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1014 ®vals[i]); 1015 } 1016 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1017 sizeof (uint64_t) * vrs.count, md)) { 1018 error = EFAULT; 1019 } 1020 break; 1021 } 1022 case VM_SET_REGISTER_SET: { 1023 struct vm_register_set vrs; 1024 int regnums[VM_REG_LAST]; 1025 uint64_t regvals[VM_REG_LAST]; 1026 1027 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1028 error = EFAULT; 1029 break; 1030 } 1031 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1032 error = EINVAL; 1033 break; 1034 } 1035 if (ddi_copyin(vrs.regnums, regnums, 1036 sizeof (int) * vrs.count, md)) { 1037 error = EFAULT; 1038 break; 1039 } 1040 if (ddi_copyin(vrs.regvals, regvals, 1041 sizeof (uint64_t) * vrs.count, md)) { 1042 error = EFAULT; 1043 break; 1044 } 1045 1046 error = 0; 1047 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1048 /* 1049 * Setting registers in a set is not atomic, since a 1050 * failure in the middle of the set will cause a 1051 * bail-out and inconsistent register state. Callers 1052 * should be wary of this. 
1053 */ 1054 if (regnums[i] < 0) { 1055 error = EINVAL; 1056 break; 1057 } 1058 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1059 regvals[i]); 1060 } 1061 break; 1062 } 1063 case VM_RESET_CPU: { 1064 struct vm_vcpu_reset vvr; 1065 1066 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1067 error = EFAULT; 1068 break; 1069 } 1070 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1071 error = EINVAL; 1072 } 1073 1074 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1075 break; 1076 } 1077 case VM_GET_RUN_STATE: { 1078 struct vm_run_state vrs; 1079 1080 bzero(&vrs, sizeof (vrs)); 1081 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1082 &vrs.sipi_vector); 1083 if (error == 0) { 1084 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1085 error = EFAULT; 1086 break; 1087 } 1088 } 1089 break; 1090 } 1091 case VM_SET_RUN_STATE: { 1092 struct vm_run_state vrs; 1093 1094 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1095 error = EFAULT; 1096 break; 1097 } 1098 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1099 vrs.sipi_vector); 1100 break; 1101 } 1102 case VM_GET_FPU: { 1103 struct vm_fpu_state req; 1104 const size_t max_len = (PAGESIZE * 2); 1105 void *kbuf; 1106 1107 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1108 error = EFAULT; 1109 break; 1110 } 1111 if (req.len > max_len || req.len == 0) { 1112 error = EINVAL; 1113 break; 1114 } 1115 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1116 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1117 if (error == 0) { 1118 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1119 error = EFAULT; 1120 } 1121 } 1122 kmem_free(kbuf, req.len); 1123 break; 1124 } 1125 case VM_SET_FPU: { 1126 struct vm_fpu_state req; 1127 const size_t max_len = (PAGESIZE * 2); 1128 void *kbuf; 1129 1130 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1131 error = EFAULT; 1132 break; 1133 } 1134 if (req.len > max_len || req.len == 0) { 1135 error = EINVAL; 1136 break; 1137 } 1138 kbuf = kmem_alloc(req.len, 
KM_SLEEP); 1139 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1140 error = EFAULT; 1141 } else { 1142 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1143 } 1144 kmem_free(kbuf, req.len); 1145 break; 1146 } 1147 1148 case VM_SET_KERNEMU_DEV: 1149 case VM_GET_KERNEMU_DEV: { 1150 struct vm_readwrite_kernemu_device kemu; 1151 size_t size = 0; 1152 1153 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1154 error = EFAULT; 1155 break; 1156 } 1157 1158 if (kemu.access_width > 3) { 1159 error = EINVAL; 1160 break; 1161 } 1162 size = (1 << kemu.access_width); 1163 ASSERT(size >= 1 && size <= 8); 1164 1165 if (cmd == VM_SET_KERNEMU_DEV) { 1166 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1167 kemu.gpa, kemu.value, size); 1168 } else { 1169 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1170 kemu.gpa, &kemu.value, size); 1171 } 1172 1173 if (error == 0) { 1174 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1175 error = EFAULT; 1176 break; 1177 } 1178 } 1179 break; 1180 } 1181 1182 case VM_GET_CAPABILITY: { 1183 struct vm_capability vmcap; 1184 1185 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1186 error = EFAULT; 1187 break; 1188 } 1189 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1190 &vmcap.capval); 1191 if (error == 0 && 1192 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1193 error = EFAULT; 1194 break; 1195 } 1196 break; 1197 } 1198 case VM_SET_CAPABILITY: { 1199 struct vm_capability vmcap; 1200 1201 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1202 error = EFAULT; 1203 break; 1204 } 1205 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1206 vmcap.capval); 1207 break; 1208 } 1209 case VM_SET_X2APIC_STATE: { 1210 struct vm_x2apic x2apic; 1211 1212 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1213 error = EFAULT; 1214 break; 1215 } 1216 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1217 break; 1218 } 1219 case VM_GET_X2APIC_STATE: { 1220 struct vm_x2apic x2apic; 1221 1222 if 
(ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1223 error = EFAULT; 1224 break; 1225 } 1226 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1227 &x2apic.state); 1228 if (error == 0 && 1229 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1230 error = EFAULT; 1231 break; 1232 } 1233 break; 1234 } 1235 case VM_GET_GPA_PMAP: { 1236 /* 1237 * Until there is a necessity to leak EPT/RVI PTE values to 1238 * userspace, this will remain unimplemented 1239 */ 1240 error = EINVAL; 1241 break; 1242 } 1243 case VM_GET_HPET_CAPABILITIES: { 1244 struct vm_hpet_cap hpetcap; 1245 1246 error = vhpet_getcap(&hpetcap); 1247 if (error == 0 && 1248 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1249 error = EFAULT; 1250 break; 1251 } 1252 break; 1253 } 1254 case VM_GLA2GPA: { 1255 struct vm_gla2gpa gg; 1256 1257 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1258 error = EFAULT; 1259 break; 1260 } 1261 gg.vcpuid = vcpu; 1262 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1263 gg.prot, &gg.gpa, &gg.fault); 1264 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1265 error = EFAULT; 1266 break; 1267 } 1268 break; 1269 } 1270 case VM_GLA2GPA_NOFAULT: { 1271 struct vm_gla2gpa gg; 1272 1273 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1274 error = EFAULT; 1275 break; 1276 } 1277 gg.vcpuid = vcpu; 1278 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1279 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1280 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1281 error = EFAULT; 1282 break; 1283 } 1284 break; 1285 } 1286 1287 case VM_ACTIVATE_CPU: 1288 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1289 break; 1290 1291 case VM_SUSPEND_CPU: 1292 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1293 error = EFAULT; 1294 } else { 1295 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1296 } 1297 break; 1298 1299 case VM_RESUME_CPU: 1300 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1301 error = EFAULT; 1302 } else { 1303 error = 
vm_resume_cpu(sc->vmm_vm, vcpu); 1304 } 1305 break; 1306 1307 case VM_GET_CPUS: { 1308 struct vm_cpuset vm_cpuset; 1309 cpuset_t tempset; 1310 void *srcp = &tempset; 1311 int size; 1312 1313 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1314 error = EFAULT; 1315 break; 1316 } 1317 1318 /* Be more generous about sizing since our cpuset_t is large. */ 1319 size = vm_cpuset.cpusetsize; 1320 if (size <= 0 || size > sizeof (cpuset_t)) { 1321 error = ERANGE; 1322 } 1323 /* 1324 * If they want a ulong_t or less, make sure they receive the 1325 * low bits with all the useful information. 1326 */ 1327 if (size <= sizeof (tempset.cpub[0])) { 1328 srcp = &tempset.cpub[0]; 1329 } 1330 1331 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1332 tempset = vm_active_cpus(sc->vmm_vm); 1333 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1334 tempset = vm_suspended_cpus(sc->vmm_vm); 1335 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1336 tempset = vm_debug_cpus(sc->vmm_vm); 1337 } else { 1338 error = EINVAL; 1339 } 1340 1341 ASSERT(size > 0 && size <= sizeof (tempset)); 1342 if (error == 0 && 1343 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1344 error = EFAULT; 1345 break; 1346 } 1347 break; 1348 } 1349 case VM_SET_INTINFO: { 1350 struct vm_intinfo vmii; 1351 1352 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1353 error = EFAULT; 1354 break; 1355 } 1356 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1357 break; 1358 } 1359 case VM_GET_INTINFO: { 1360 struct vm_intinfo vmii; 1361 1362 vmii.vcpuid = vcpu; 1363 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1364 &vmii.info2); 1365 if (error == 0 && 1366 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1367 error = EFAULT; 1368 break; 1369 } 1370 break; 1371 } 1372 case VM_RTC_WRITE: { 1373 struct vm_rtc_data rtcdata; 1374 1375 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1376 error = EFAULT; 1377 break; 1378 } 1379 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1380 rtcdata.value); 
1381 break; 1382 } 1383 case VM_RTC_READ: { 1384 struct vm_rtc_data rtcdata; 1385 1386 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1387 error = EFAULT; 1388 break; 1389 } 1390 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1391 &rtcdata.value); 1392 if (error == 0 && 1393 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1394 error = EFAULT; 1395 break; 1396 } 1397 break; 1398 } 1399 case VM_RTC_SETTIME: { 1400 struct vm_rtc_time rtctime; 1401 1402 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1403 error = EFAULT; 1404 break; 1405 } 1406 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1407 break; 1408 } 1409 case VM_RTC_GETTIME: { 1410 struct vm_rtc_time rtctime; 1411 1412 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1413 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1414 error = EFAULT; 1415 break; 1416 } 1417 break; 1418 } 1419 1420 case VM_PMTMR_LOCATE: { 1421 uint16_t port = arg; 1422 error = vpmtmr_set_location(sc->vmm_vm, port); 1423 break; 1424 } 1425 1426 case VM_RESTART_INSTRUCTION: 1427 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1428 break; 1429 1430 case VM_SET_TOPOLOGY: { 1431 struct vm_cpu_topology topo; 1432 1433 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1434 error = EFAULT; 1435 break; 1436 } 1437 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1438 topo.threads, topo.maxcpus); 1439 break; 1440 } 1441 case VM_GET_TOPOLOGY: { 1442 struct vm_cpu_topology topo; 1443 1444 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1445 &topo.threads, &topo.maxcpus); 1446 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1447 error = EFAULT; 1448 break; 1449 } 1450 break; 1451 } 1452 case VM_DEVMEM_GETOFFSET: { 1453 struct vm_devmem_offset vdo; 1454 vmm_devmem_entry_t *de; 1455 1456 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1457 error = EFAULT; 1458 break; 1459 } 1460 1461 de = vmmdev_devmem_find(sc, vdo.segid); 1462 if (de != NULL) { 1463 vdo.offset = 
de->vde_off; 1464 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1465 error = EFAULT; 1466 } 1467 } else { 1468 error = ENOENT; 1469 } 1470 break; 1471 } 1472 case VM_TRACK_DIRTY_PAGES: { 1473 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1474 struct vmm_dirty_tracker tracker; 1475 uint8_t *bitmap; 1476 size_t len; 1477 1478 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1479 error = EFAULT; 1480 break; 1481 } 1482 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1483 error = EINVAL; 1484 break; 1485 } 1486 if (tracker.vdt_len == 0) { 1487 break; 1488 } 1489 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1490 error = EINVAL; 1491 break; 1492 } 1493 if (tracker.vdt_len > max_track_region_len) { 1494 error = EINVAL; 1495 break; 1496 } 1497 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1498 bitmap = kmem_zalloc(len, KM_SLEEP); 1499 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1500 tracker.vdt_len, bitmap); 1501 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1502 error = EFAULT; 1503 } 1504 kmem_free(bitmap, len); 1505 1506 break; 1507 } 1508 case VM_WRLOCK_CYCLE: { 1509 /* 1510 * Present a test mechanism to acquire/release the write lock 1511 * on the VM without any other effects. 
1512 */ 1513 break; 1514 } 1515 1516 default: 1517 error = ENOTTY; 1518 break; 1519 } 1520 1521 /* Release exclusion resources */ 1522 switch (lock_type) { 1523 case LOCK_NONE: 1524 break; 1525 case LOCK_VCPU: 1526 vcpu_unlock_one(sc, vcpu); 1527 break; 1528 case LOCK_READ_HOLD: 1529 vmm_read_unlock(sc); 1530 break; 1531 case LOCK_WRITE_HOLD: 1532 vmm_write_unlock(sc); 1533 break; 1534 default: 1535 panic("unexpected lock type"); 1536 break; 1537 } 1538 1539 return (error); 1540 } 1541 1542 static vmm_softc_t * 1543 vmm_lookup(const char *name) 1544 { 1545 list_t *vml = &vmm_list; 1546 vmm_softc_t *sc; 1547 1548 ASSERT(MUTEX_HELD(&vmm_mtx)); 1549 1550 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1551 if (strcmp(sc->vmm_name, name) == 0) { 1552 break; 1553 } 1554 } 1555 1556 return (sc); 1557 } 1558 1559 /* 1560 * Acquire an HMA registration if not already held. 1561 */ 1562 static boolean_t 1563 vmm_hma_acquire(void) 1564 { 1565 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1566 1567 mutex_enter(&vmmdev_mtx); 1568 1569 if (vmmdev_hma_reg == NULL) { 1570 VERIFY3U(vmmdev_hma_ref, ==, 0); 1571 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1572 if (vmmdev_hma_reg == NULL) { 1573 cmn_err(CE_WARN, "%s HMA registration failed.", 1574 vmmdev_hvm_name); 1575 mutex_exit(&vmmdev_mtx); 1576 return (B_FALSE); 1577 } 1578 } 1579 1580 vmmdev_hma_ref++; 1581 1582 mutex_exit(&vmmdev_mtx); 1583 1584 return (B_TRUE); 1585 } 1586 1587 /* 1588 * Release the HMA registration if held and there are no remaining VMs. 
1589 */ 1590 static void 1591 vmm_hma_release(void) 1592 { 1593 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1594 1595 mutex_enter(&vmmdev_mtx); 1596 1597 VERIFY3U(vmmdev_hma_ref, !=, 0); 1598 1599 vmmdev_hma_ref--; 1600 1601 if (vmmdev_hma_ref == 0) { 1602 VERIFY(vmmdev_hma_reg != NULL); 1603 hma_unregister(vmmdev_hma_reg); 1604 vmmdev_hma_reg = NULL; 1605 } 1606 mutex_exit(&vmmdev_mtx); 1607 } 1608 1609 static int 1610 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1611 { 1612 vmm_softc_t *sc = NULL; 1613 minor_t minor; 1614 int error = ENOMEM; 1615 size_t len; 1616 const char *name = req->name; 1617 1618 len = strnlen(name, VM_MAX_NAMELEN); 1619 if (len == 0) { 1620 return (EINVAL); 1621 } 1622 if (len >= VM_MAX_NAMELEN) { 1623 return (ENAMETOOLONG); 1624 } 1625 if (strchr(name, '/') != NULL) { 1626 return (EINVAL); 1627 } 1628 1629 if (!vmm_hma_acquire()) 1630 return (ENXIO); 1631 1632 mutex_enter(&vmm_mtx); 1633 1634 /* Look for duplicate names */ 1635 if (vmm_lookup(name) != NULL) { 1636 mutex_exit(&vmm_mtx); 1637 vmm_hma_release(); 1638 return (EEXIST); 1639 } 1640 1641 /* Allow only one instance per non-global zone. 
*/ 1642 if (!INGLOBALZONE(curproc)) { 1643 for (sc = list_head(&vmm_list); sc != NULL; 1644 sc = list_next(&vmm_list, sc)) { 1645 if (sc->vmm_zone == curzone) { 1646 mutex_exit(&vmm_mtx); 1647 vmm_hma_release(); 1648 return (EINVAL); 1649 } 1650 } 1651 } 1652 1653 minor = id_alloc(vmm_minors); 1654 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1655 goto fail; 1656 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1657 ddi_soft_state_free(vmm_statep, minor); 1658 goto fail; 1659 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 1660 DDI_PSEUDO, 0) != DDI_SUCCESS) { 1661 goto fail; 1662 } 1663 1664 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 1665 goto fail; 1666 } 1667 1668 error = vm_create(req->name, req->flags, &sc->vmm_vm); 1669 if (error == 0) { 1670 /* Complete VM intialization and report success. */ 1671 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 1672 sc->vmm_minor = minor; 1673 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 1674 offsetof(vmm_devmem_entry_t, vde_node)); 1675 1676 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 1677 offsetof(vmm_hold_t, vmh_node)); 1678 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 1679 1680 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 1681 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 1682 offsetof(vmm_lease_t, vml_node)); 1683 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 1684 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 1685 1686 sc->vmm_zone = crgetzone(cr); 1687 zone_hold(sc->vmm_zone); 1688 vmm_zsd_add_vm(sc); 1689 vmm_kstat_init(sc); 1690 1691 list_insert_tail(&vmm_list, sc); 1692 mutex_exit(&vmm_mtx); 1693 return (0); 1694 } 1695 1696 vmm_kstat_fini(sc); 1697 ddi_remove_minor_node(vmmdev_dip, name); 1698 fail: 1699 id_free(vmm_minors, minor); 1700 if (sc != NULL) { 1701 ddi_soft_state_free(vmm_statep, minor); 1702 } 1703 mutex_exit(&vmm_mtx); 1704 vmm_hma_release(); 1705 1706 return (error); 1707 } 1708 
1709 /* 1710 * Bhyve 'Driver' Interface 1711 * 1712 * While many devices are emulated in the bhyve userspace process, there are 1713 * others with performance constraints which require that they run mostly or 1714 * entirely in-kernel. For those not integrated directly into bhyve, an API is 1715 * needed so they can query/manipulate the portions of VM state needed to 1716 * fulfill their purpose. 1717 * 1718 * This includes: 1719 * - Translating guest-physical addresses to host-virtual pointers 1720 * - Injecting MSIs 1721 * - Hooking IO port addresses 1722 * 1723 * The vmm_drv interface exists to provide that functionality to its consumers. 1724 * (At this time, 'viona' is the only user) 1725 */ 1726 int 1727 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 1728 { 1729 vnode_t *vp = fp->f_vnode; 1730 const dev_t dev = vp->v_rdev; 1731 vmm_softc_t *sc; 1732 vmm_hold_t *hold; 1733 int err = 0; 1734 1735 if (vp->v_type != VCHR) { 1736 return (ENXIO); 1737 } 1738 const major_t major = getmajor(dev); 1739 const minor_t minor = getminor(dev); 1740 1741 mutex_enter(&vmmdev_mtx); 1742 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 1743 mutex_exit(&vmmdev_mtx); 1744 return (ENOENT); 1745 } 1746 mutex_enter(&vmm_mtx); 1747 mutex_exit(&vmmdev_mtx); 1748 1749 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1750 err = ENOENT; 1751 goto out; 1752 } 1753 /* XXXJOY: check cred permissions against instance */ 1754 1755 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { 1756 err = EBUSY; 1757 goto out; 1758 } 1759 1760 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 1761 hold->vmh_sc = sc; 1762 hold->vmh_release_req = B_FALSE; 1763 1764 list_insert_tail(&sc->vmm_holds, hold); 1765 sc->vmm_flags |= VMM_HELD; 1766 *holdp = hold; 1767 1768 out: 1769 mutex_exit(&vmm_mtx); 1770 return (err); 1771 } 1772 1773 void 1774 vmm_drv_rele(vmm_hold_t *hold) 1775 { 1776 vmm_softc_t *sc; 1777 1778 ASSERT(hold != NULL); 1779 ASSERT(hold->vmh_sc 
!= NULL); 1780 VERIFY(hold->vmh_ioport_hook_cnt == 0); 1781 1782 mutex_enter(&vmm_mtx); 1783 sc = hold->vmh_sc; 1784 list_remove(&sc->vmm_holds, hold); 1785 if (list_is_empty(&sc->vmm_holds)) { 1786 sc->vmm_flags &= ~VMM_HELD; 1787 cv_broadcast(&sc->vmm_cv); 1788 } 1789 mutex_exit(&vmm_mtx); 1790 kmem_free(hold, sizeof (*hold)); 1791 } 1792 1793 boolean_t 1794 vmm_drv_release_reqd(vmm_hold_t *hold) 1795 { 1796 ASSERT(hold != NULL); 1797 1798 return (hold->vmh_release_req); 1799 } 1800 1801 vmm_lease_t * 1802 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 1803 { 1804 vmm_softc_t *sc = hold->vmh_sc; 1805 vmm_lease_t *lease; 1806 1807 ASSERT3P(expiref, !=, NULL); 1808 1809 if (hold->vmh_release_req) { 1810 return (NULL); 1811 } 1812 1813 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 1814 list_link_init(&lease->vml_node); 1815 lease->vml_expire_func = expiref; 1816 lease->vml_expire_arg = arg; 1817 lease->vml_expired = B_FALSE; 1818 lease->vml_break_deferred = B_FALSE; 1819 lease->vml_hold = hold; 1820 /* cache the VM pointer for one less pointer chase */ 1821 lease->vml_vm = sc->vmm_vm; 1822 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 1823 1824 mutex_enter(&sc->vmm_lease_lock); 1825 while (sc->vmm_lease_blocker != 0) { 1826 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 1827 } 1828 list_insert_tail(&sc->vmm_lease_list, lease); 1829 vmm_read_lock(sc); 1830 mutex_exit(&sc->vmm_lease_lock); 1831 1832 return (lease); 1833 } 1834 1835 static void 1836 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 1837 { 1838 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 1839 1840 list_remove(&sc->vmm_lease_list, lease); 1841 vmm_read_unlock(sc); 1842 vmc_destroy(lease->vml_vmclient); 1843 kmem_free(lease, sizeof (*lease)); 1844 } 1845 1846 static void 1847 vmm_lease_block(vmm_softc_t *sc) 1848 { 1849 mutex_enter(&sc->vmm_lease_lock); 1850 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 1851 sc->vmm_lease_blocker++; 1852 
if (sc->vmm_lease_blocker == 1) { 1853 list_t *list = &sc->vmm_lease_list; 1854 vmm_lease_t *lease = list_head(list); 1855 1856 while (lease != NULL) { 1857 void *arg = lease->vml_expire_arg; 1858 boolean_t (*expiref)(void *) = lease->vml_expire_func; 1859 boolean_t sync_break = B_FALSE; 1860 1861 /* 1862 * Since the lease expiration notification may 1863 * need to take locks which would deadlock with 1864 * vmm_lease_lock, drop it across the call. 1865 * 1866 * We are the only one allowed to manipulate 1867 * vmm_lease_list right now, so it is safe to 1868 * continue iterating through it after 1869 * reacquiring the lock. 1870 */ 1871 lease->vml_expired = B_TRUE; 1872 mutex_exit(&sc->vmm_lease_lock); 1873 sync_break = expiref(arg); 1874 mutex_enter(&sc->vmm_lease_lock); 1875 1876 if (sync_break) { 1877 vmm_lease_t *next; 1878 1879 /* 1880 * These leases which are synchronously broken 1881 * result in vmm_read_unlock() calls from a 1882 * different thread than the corresponding 1883 * vmm_read_lock(). This is acceptable, given 1884 * that the rwlock underpinning the whole 1885 * mechanism tolerates the behavior. This 1886 * flexibility is _only_ afforded to VM read 1887 * lock (RW_READER) holders. 1888 */ 1889 next = list_next(list, lease); 1890 vmm_lease_break_locked(sc, lease); 1891 lease = next; 1892 } else { 1893 lease = list_next(list, lease); 1894 } 1895 } 1896 1897 /* Process leases which were not broken synchronously. */ 1898 while (!list_is_empty(list)) { 1899 /* 1900 * Although the nested loops are quadratic, the number 1901 * of leases is small. 
1902 */ 1903 lease = list_head(list); 1904 while (lease != NULL) { 1905 vmm_lease_t *next = list_next(list, lease); 1906 if (lease->vml_break_deferred) { 1907 vmm_lease_break_locked(sc, lease); 1908 } 1909 lease = next; 1910 } 1911 if (list_is_empty(list)) { 1912 break; 1913 } 1914 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 1915 } 1916 /* Wake anyone else waiting for the lease list to be empty */ 1917 cv_broadcast(&sc->vmm_lease_cv); 1918 } else { 1919 list_t *list = &sc->vmm_lease_list; 1920 1921 /* 1922 * Some other thread beat us to the duty of lease cleanup. 1923 * Wait until that is complete. 1924 */ 1925 while (!list_is_empty(list)) { 1926 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 1927 } 1928 } 1929 mutex_exit(&sc->vmm_lease_lock); 1930 } 1931 1932 static void 1933 vmm_lease_unblock(vmm_softc_t *sc) 1934 { 1935 mutex_enter(&sc->vmm_lease_lock); 1936 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 1937 sc->vmm_lease_blocker--; 1938 if (sc->vmm_lease_blocker == 0) { 1939 cv_broadcast(&sc->vmm_lease_cv); 1940 } 1941 mutex_exit(&sc->vmm_lease_lock); 1942 } 1943 1944 void 1945 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 1946 { 1947 vmm_softc_t *sc = hold->vmh_sc; 1948 1949 VERIFY3P(hold, ==, lease->vml_hold); 1950 VERIFY(!lease->vml_break_deferred); 1951 1952 mutex_enter(&sc->vmm_lease_lock); 1953 if (sc->vmm_lease_blocker == 0) { 1954 vmm_lease_break_locked(sc, lease); 1955 } else { 1956 /* 1957 * Defer the lease-breaking to whichever thread is currently 1958 * cleaning up all leases as part of a vmm_lease_block() call. 
1959 */ 1960 lease->vml_break_deferred = B_TRUE; 1961 cv_broadcast(&sc->vmm_lease_cv); 1962 } 1963 mutex_exit(&sc->vmm_lease_lock); 1964 } 1965 1966 boolean_t 1967 vmm_drv_lease_expired(vmm_lease_t *lease) 1968 { 1969 return (lease->vml_expired); 1970 } 1971 1972 vmm_page_t * 1973 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 1974 { 1975 ASSERT(lease != NULL); 1976 ASSERT0(gpa & PAGEOFFSET); 1977 1978 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 1979 } 1980 1981 void 1982 vmm_drv_page_release(vmm_page_t *vmmp) 1983 { 1984 (void) vmp_release((vm_page_t *)vmmp); 1985 } 1986 1987 void 1988 vmm_drv_page_release_chain(vmm_page_t *vmmp) 1989 { 1990 (void) vmp_release_chain((vm_page_t *)vmmp); 1991 } 1992 1993 const void * 1994 vmm_drv_page_readable(const vmm_page_t *vmmp) 1995 { 1996 return (vmp_get_readable((const vm_page_t *)vmmp)); 1997 } 1998 1999 void * 2000 vmm_drv_page_writable(const vmm_page_t *vmmp) 2001 { 2002 return (vmp_get_writable((const vm_page_t *)vmmp)); 2003 } 2004 2005 void 2006 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2007 { 2008 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2009 } 2010 2011 vmm_page_t * 2012 vmm_drv_page_next(const vmm_page_t *vmmp) 2013 { 2014 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2015 } 2016 2017 int 2018 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2019 { 2020 ASSERT(lease != NULL); 2021 2022 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2023 } 2024 2025 int 2026 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2027 void *arg, void **cookie) 2028 { 2029 vmm_softc_t *sc; 2030 int err; 2031 2032 ASSERT(hold != NULL); 2033 ASSERT(cookie != NULL); 2034 2035 sc = hold->vmh_sc; 2036 mutex_enter(&vmm_mtx); 2037 /* Confirm that hook installation is not blocked */ 2038 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2039 mutex_exit(&vmm_mtx); 2040 return (EBUSY); 2041 } 2042 /* 2043 * Optimistically record an installed 
hook which will prevent a block 2044 * from being asserted while the mutex is dropped. 2045 */ 2046 hold->vmh_ioport_hook_cnt++; 2047 mutex_exit(&vmm_mtx); 2048 2049 vmm_write_lock(sc); 2050 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2051 arg, cookie); 2052 vmm_write_unlock(sc); 2053 2054 if (err != 0) { 2055 mutex_enter(&vmm_mtx); 2056 /* Walk back optimism about the hook installation */ 2057 hold->vmh_ioport_hook_cnt--; 2058 mutex_exit(&vmm_mtx); 2059 } 2060 return (err); 2061 } 2062 2063 void 2064 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2065 { 2066 vmm_softc_t *sc; 2067 2068 ASSERT(hold != NULL); 2069 ASSERT(cookie != NULL); 2070 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2071 2072 sc = hold->vmh_sc; 2073 vmm_write_lock(sc); 2074 vm_ioport_unhook(sc->vmm_vm, cookie); 2075 vmm_write_unlock(sc); 2076 2077 mutex_enter(&vmm_mtx); 2078 hold->vmh_ioport_hook_cnt--; 2079 mutex_exit(&vmm_mtx); 2080 } 2081 2082 static int 2083 vmm_drv_purge(vmm_softc_t *sc) 2084 { 2085 ASSERT(MUTEX_HELD(&vmm_mtx)); 2086 2087 if ((sc->vmm_flags & VMM_HELD) != 0) { 2088 vmm_hold_t *hold; 2089 2090 sc->vmm_flags |= VMM_CLEANUP; 2091 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2092 hold = list_next(&sc->vmm_holds, hold)) { 2093 hold->vmh_release_req = B_TRUE; 2094 } 2095 while ((sc->vmm_flags & VMM_HELD) != 0) { 2096 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2097 return (EINTR); 2098 } 2099 } 2100 sc->vmm_flags &= ~VMM_CLEANUP; 2101 } 2102 2103 VERIFY(list_is_empty(&sc->vmm_holds)); 2104 sc->vmm_flags |= VMM_PURGED; 2105 return (0); 2106 } 2107 2108 static int 2109 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2110 { 2111 int err = 0; 2112 2113 mutex_enter(&vmm_mtx); 2114 if (!enable_block) { 2115 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2116 2117 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2118 goto done; 2119 } 2120 2121 /* If any holds have hooks installed, the block is a failure */ 2122 if (!list_is_empty(&sc->vmm_holds)) { 2123 
vmm_hold_t *hold; 2124 2125 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2126 hold = list_next(&sc->vmm_holds, hold)) { 2127 if (hold->vmh_ioport_hook_cnt != 0) { 2128 err = EBUSY; 2129 goto done; 2130 } 2131 } 2132 } 2133 sc->vmm_flags |= VMM_BLOCK_HOOK; 2134 2135 done: 2136 mutex_exit(&vmm_mtx); 2137 return (err); 2138 } 2139 2140 static int 2141 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd, 2142 boolean_t *hma_release) 2143 { 2144 dev_info_t *pdip = ddi_get_parent(vmmdev_dip); 2145 minor_t minor; 2146 2147 ASSERT(MUTEX_HELD(&vmm_mtx)); 2148 2149 *hma_release = B_FALSE; 2150 2151 if (vmm_drv_purge(sc) != 0) { 2152 return (EINTR); 2153 } 2154 2155 if (clean_zsd) { 2156 vmm_zsd_rem_vm(sc); 2157 } 2158 2159 /* Clean up devmem entries */ 2160 vmmdev_devmem_purge(sc); 2161 2162 list_remove(&vmm_list, sc); 2163 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2164 minor = sc->vmm_minor; 2165 zone_rele(sc->vmm_zone); 2166 if (sc->vmm_is_open) { 2167 list_insert_tail(&vmm_destroy_list, sc); 2168 sc->vmm_flags |= VMM_DESTROY; 2169 } else { 2170 vmm_kstat_fini(sc); 2171 vm_destroy(sc->vmm_vm); 2172 ddi_soft_state_free(vmm_statep, minor); 2173 id_free(vmm_minors, minor); 2174 *hma_release = B_TRUE; 2175 } 2176 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); 2177 2178 return (0); 2179 } 2180 2181 int 2182 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) 2183 { 2184 boolean_t hma_release = B_FALSE; 2185 int err; 2186 2187 mutex_enter(&vmm_mtx); 2188 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release); 2189 mutex_exit(&vmm_mtx); 2190 2191 if (hma_release) 2192 vmm_hma_release(); 2193 2194 return (err); 2195 } 2196 2197 /* ARGSUSED */ 2198 static int 2199 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2200 { 2201 boolean_t hma_release = B_FALSE; 2202 vmm_softc_t *sc; 2203 int err; 2204 2205 if (crgetuid(cr) != 0) 2206 return (EPERM); 2207 2208 mutex_enter(&vmm_mtx); 2209 2210 if ((sc = vmm_lookup(req->name)) == NULL) { 
2211 mutex_exit(&vmm_mtx); 2212 return (ENOENT); 2213 } 2214 /* 2215 * We don't check this in vmm_lookup() since that function is also used 2216 * for validation during create and currently vmm names must be unique. 2217 */ 2218 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2219 mutex_exit(&vmm_mtx); 2220 return (EPERM); 2221 } 2222 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release); 2223 2224 mutex_exit(&vmm_mtx); 2225 2226 if (hma_release) 2227 vmm_hma_release(); 2228 2229 return (err); 2230 } 2231 2232 #define VCPU_NAME_BUFLEN 32 2233 2234 static int 2235 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2236 { 2237 zoneid_t zid = crgetzoneid(cr); 2238 int instance = minor; 2239 kstat_t *ksp; 2240 2241 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2242 2243 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2244 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2245 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2246 2247 if (ksp == NULL) { 2248 return (-1); 2249 } 2250 sc->vmm_kstat_vm = ksp; 2251 2252 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2253 char namebuf[VCPU_NAME_BUFLEN]; 2254 2255 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2256 2257 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2258 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2259 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2260 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2261 0, zid); 2262 if (ksp == NULL) { 2263 goto fail; 2264 } 2265 2266 sc->vmm_kstat_vcpu[i] = ksp; 2267 } 2268 2269 /* 2270 * If this instance is associated with a non-global zone, make its 2271 * kstats visible from the GZ. 
2272 */ 2273 if (zid != GLOBAL_ZONEID) { 2274 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2275 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2276 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2277 } 2278 } 2279 2280 return (0); 2281 2282 fail: 2283 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2284 if (sc->vmm_kstat_vcpu[i] != NULL) { 2285 kstat_delete(sc->vmm_kstat_vcpu[i]); 2286 sc->vmm_kstat_vcpu[i] = NULL; 2287 } else { 2288 break; 2289 } 2290 } 2291 kstat_delete(sc->vmm_kstat_vm); 2292 sc->vmm_kstat_vm = NULL; 2293 return (-1); 2294 } 2295 2296 static void 2297 vmm_kstat_init(vmm_softc_t *sc) 2298 { 2299 kstat_t *ksp; 2300 2301 ASSERT3P(sc->vmm_vm, !=, NULL); 2302 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2303 2304 ksp = sc->vmm_kstat_vm; 2305 vmm_kstats_t *vk = ksp->ks_data; 2306 ksp->ks_private = sc->vmm_vm; 2307 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2308 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2309 2310 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2311 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2312 2313 ksp = sc->vmm_kstat_vcpu[i]; 2314 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2315 2316 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2317 vvk->vvk_vcpu.value.ui32 = i; 2318 kstat_named_init(&vvk->vvk_time_init, "time_init", 2319 KSTAT_DATA_UINT64); 2320 kstat_named_init(&vvk->vvk_time_run, "time_run", 2321 KSTAT_DATA_UINT64); 2322 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2323 KSTAT_DATA_UINT64); 2324 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2325 KSTAT_DATA_UINT64); 2326 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2327 KSTAT_DATA_UINT64); 2328 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2329 KSTAT_DATA_UINT64); 2330 ksp->ks_private = sc->vmm_vm; 2331 ksp->ks_update = vmm_kstat_update_vcpu; 2332 } 2333 2334 kstat_install(sc->vmm_kstat_vm); 2335 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2336 kstat_install(sc->vmm_kstat_vcpu[i]); 2337 } 2338 } 2339 2340 static void 2341 
vmm_kstat_fini(vmm_softc_t *sc) 2342 { 2343 ASSERT(sc->vmm_kstat_vm != NULL); 2344 2345 kstat_delete(sc->vmm_kstat_vm); 2346 sc->vmm_kstat_vm = NULL; 2347 2348 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2349 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2350 2351 kstat_delete(sc->vmm_kstat_vcpu[i]); 2352 sc->vmm_kstat_vcpu[i] = NULL; 2353 } 2354 } 2355 2356 static int 2357 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2358 { 2359 minor_t minor; 2360 vmm_softc_t *sc; 2361 2362 /* 2363 * Forbid running bhyve in a 32-bit process until it has been tested and 2364 * verified to be safe. 2365 */ 2366 if (curproc->p_model != DATAMODEL_LP64) { 2367 return (EFBIG); 2368 } 2369 2370 minor = getminor(*devp); 2371 if (minor == VMM_CTL_MINOR) { 2372 /* 2373 * Master control device must be opened exclusively. 2374 */ 2375 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2376 return (EINVAL); 2377 } 2378 2379 return (0); 2380 } 2381 2382 mutex_enter(&vmm_mtx); 2383 sc = ddi_get_soft_state(vmm_statep, minor); 2384 if (sc == NULL) { 2385 mutex_exit(&vmm_mtx); 2386 return (ENXIO); 2387 } 2388 2389 sc->vmm_is_open = B_TRUE; 2390 mutex_exit(&vmm_mtx); 2391 2392 return (0); 2393 } 2394 2395 static int 2396 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2397 { 2398 minor_t minor; 2399 vmm_softc_t *sc; 2400 boolean_t hma_release = B_FALSE; 2401 2402 minor = getminor(dev); 2403 if (minor == VMM_CTL_MINOR) 2404 return (0); 2405 2406 mutex_enter(&vmm_mtx); 2407 sc = ddi_get_soft_state(vmm_statep, minor); 2408 if (sc == NULL) { 2409 mutex_exit(&vmm_mtx); 2410 return (ENXIO); 2411 } 2412 2413 VERIFY(sc->vmm_is_open); 2414 sc->vmm_is_open = B_FALSE; 2415 2416 /* 2417 * If this VM was destroyed while the vmm device was open, then 2418 * clean it up now that it is closed. 
2419 */ 2420 if (sc->vmm_flags & VMM_DESTROY) { 2421 list_remove(&vmm_destroy_list, sc); 2422 vmm_kstat_fini(sc); 2423 vm_destroy(sc->vmm_vm); 2424 ddi_soft_state_free(vmm_statep, minor); 2425 id_free(vmm_minors, minor); 2426 hma_release = B_TRUE; 2427 } 2428 mutex_exit(&vmm_mtx); 2429 2430 if (hma_release) 2431 vmm_hma_release(); 2432 2433 return (0); 2434 } 2435 2436 static int 2437 vmm_is_supported(intptr_t arg) 2438 { 2439 int r; 2440 const char *msg; 2441 2442 if (vmm_is_intel()) { 2443 r = vmx_x86_supported(&msg); 2444 } else if (vmm_is_svm()) { 2445 /* 2446 * HMA already ensured that the features necessary for SVM 2447 * operation were present and online during vmm_attach(). 2448 */ 2449 r = 0; 2450 } else { 2451 r = ENXIO; 2452 msg = "Unsupported CPU vendor"; 2453 } 2454 2455 if (r != 0 && arg != (intptr_t)NULL) { 2456 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2457 return (EFAULT); 2458 } 2459 return (r); 2460 } 2461 2462 static int 2463 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2464 { 2465 void *argp = (void *)arg; 2466 2467 switch (cmd) { 2468 case VMM_CREATE_VM: { 2469 struct vm_create_req req; 2470 2471 if ((md & FWRITE) == 0) { 2472 return (EPERM); 2473 } 2474 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2475 return (EFAULT); 2476 } 2477 return (vmmdev_do_vm_create(&req, cr)); 2478 } 2479 case VMM_DESTROY_VM: { 2480 struct vm_destroy_req req; 2481 2482 if ((md & FWRITE) == 0) { 2483 return (EPERM); 2484 } 2485 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2486 return (EFAULT); 2487 } 2488 return (vmmdev_do_vm_destroy(&req, cr)); 2489 } 2490 case VMM_VM_SUPPORTED: 2491 return (vmm_is_supported(arg)); 2492 case VMM_INTERFACE_VERSION: 2493 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 2494 return (0); 2495 case VMM_RESV_QUERY: 2496 case VMM_RESV_ADD: 2497 case VMM_RESV_REMOVE: 2498 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2499 default: 2500 break; 2501 } 2502 /* No other actions are legal on ctl 
device */ 2503 return (ENOTTY); 2504 } 2505 2506 static int 2507 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 2508 int *rvalp) 2509 { 2510 vmm_softc_t *sc; 2511 minor_t minor; 2512 2513 /* 2514 * Forbid running bhyve in a 32-bit process until it has been tested and 2515 * verified to be safe. 2516 */ 2517 if (curproc->p_model != DATAMODEL_LP64) { 2518 return (EFBIG); 2519 } 2520 2521 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 2522 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 2523 return (ENOTSUP); 2524 } 2525 2526 minor = getminor(dev); 2527 2528 if (minor == VMM_CTL_MINOR) { 2529 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 2530 } 2531 2532 sc = ddi_get_soft_state(vmm_statep, minor); 2533 ASSERT(sc); 2534 2535 if (sc->vmm_flags & VMM_DESTROY) 2536 return (ENXIO); 2537 2538 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 2539 } 2540 2541 static int 2542 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 2543 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 2544 { 2545 vmm_softc_t *sc; 2546 const minor_t minor = getminor(dev); 2547 int err; 2548 2549 if (minor == VMM_CTL_MINOR) { 2550 return (ENODEV); 2551 } 2552 if (off < 0 || (off + len) <= 0) { 2553 return (EINVAL); 2554 } 2555 if ((prot & PROT_USER) == 0) { 2556 return (EACCES); 2557 } 2558 2559 sc = ddi_get_soft_state(vmm_statep, minor); 2560 ASSERT(sc); 2561 2562 if (sc->vmm_flags & VMM_DESTROY) 2563 return (ENXIO); 2564 2565 /* Grab read lock on the VM to prevent any changes to the memory map */ 2566 vmm_read_lock(sc); 2567 2568 if (off >= VM_DEVMEM_START) { 2569 int segid; 2570 off_t segoff; 2571 2572 /* Mapping a devmem "device" */ 2573 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 2574 err = ENODEV; 2575 } else { 2576 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 2577 addrp, prot, maxprot, flags); 2578 } 2579 } else { 2580 /* Mapping a part of the guest 
physical space */ 2581 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 2582 maxprot, flags); 2583 } 2584 2585 vmm_read_unlock(sc); 2586 return (err); 2587 } 2588 2589 static sdev_plugin_validate_t 2590 vmm_sdev_validate(sdev_ctx_t ctx) 2591 { 2592 const char *name = sdev_ctx_name(ctx); 2593 vmm_softc_t *sc; 2594 sdev_plugin_validate_t ret; 2595 minor_t minor; 2596 2597 if (sdev_ctx_vtype(ctx) != VCHR) 2598 return (SDEV_VTOR_INVALID); 2599 2600 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 2601 2602 mutex_enter(&vmm_mtx); 2603 if ((sc = vmm_lookup(name)) == NULL) 2604 ret = SDEV_VTOR_INVALID; 2605 else if (sc->vmm_minor != minor) 2606 ret = SDEV_VTOR_STALE; 2607 else 2608 ret = SDEV_VTOR_VALID; 2609 mutex_exit(&vmm_mtx); 2610 2611 return (ret); 2612 } 2613 2614 static int 2615 vmm_sdev_filldir(sdev_ctx_t ctx) 2616 { 2617 vmm_softc_t *sc; 2618 int ret; 2619 2620 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 2621 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 2622 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 2623 return (EINVAL); 2624 } 2625 2626 mutex_enter(&vmm_mtx); 2627 ASSERT(vmmdev_dip != NULL); 2628 for (sc = list_head(&vmm_list); sc != NULL; 2629 sc = list_next(&vmm_list, sc)) { 2630 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 2631 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 2632 S_IFCHR | 0600, 2633 makedevice(ddi_driver_major(vmmdev_dip), 2634 sc->vmm_minor)); 2635 } else { 2636 continue; 2637 } 2638 if (ret != 0 && ret != EEXIST) 2639 goto out; 2640 } 2641 2642 ret = 0; 2643 2644 out: 2645 mutex_exit(&vmm_mtx); 2646 return (ret); 2647 } 2648 2649 /* ARGSUSED */ 2650 static void 2651 vmm_sdev_inactive(sdev_ctx_t ctx) 2652 { 2653 } 2654 2655 static sdev_plugin_ops_t vmm_sdev_ops = { 2656 .spo_version = SDEV_PLUGIN_VERSION, 2657 .spo_flags = SDEV_PLUGIN_SUBDIR, 2658 .spo_validate = vmm_sdev_validate, 2659 .spo_filldir = vmm_sdev_filldir, 2660 .spo_inactive = vmm_sdev_inactive 2661 }; 2662 2663 /* ARGSUSED */ 2664 static int 
2665 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 2666 { 2667 int error; 2668 2669 switch (cmd) { 2670 case DDI_INFO_DEVT2DEVINFO: 2671 *result = (void *)vmmdev_dip; 2672 error = DDI_SUCCESS; 2673 break; 2674 case DDI_INFO_DEVT2INSTANCE: 2675 *result = (void *)0; 2676 error = DDI_SUCCESS; 2677 break; 2678 default: 2679 error = DDI_FAILURE; 2680 break; 2681 } 2682 return (error); 2683 } 2684 2685 static int 2686 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2687 { 2688 sdev_plugin_hdl_t sph; 2689 hma_reg_t *reg = NULL; 2690 boolean_t vmm_loaded = B_FALSE; 2691 2692 if (cmd != DDI_ATTACH) { 2693 return (DDI_FAILURE); 2694 } 2695 2696 mutex_enter(&vmmdev_mtx); 2697 /* Ensure we are not already attached. */ 2698 if (vmmdev_dip != NULL) { 2699 mutex_exit(&vmmdev_mtx); 2700 return (DDI_FAILURE); 2701 } 2702 2703 vmm_sol_glue_init(); 2704 2705 /* 2706 * Perform temporary HMA registration to determine if the system 2707 * is capable. 2708 */ 2709 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 2710 goto fail; 2711 } else if (vmm_mod_load() != 0) { 2712 goto fail; 2713 } 2714 vmm_loaded = B_TRUE; 2715 hma_unregister(reg); 2716 reg = NULL; 2717 2718 /* Create control node. Other nodes will be created on demand. 
*/ 2719 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 2720 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 2721 goto fail; 2722 } 2723 2724 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 2725 if (sph == (sdev_plugin_hdl_t)NULL) { 2726 ddi_remove_minor_node(dip, NULL); 2727 goto fail; 2728 } 2729 2730 ddi_report_dev(dip); 2731 vmmdev_sdev_hdl = sph; 2732 vmmdev_dip = dip; 2733 mutex_exit(&vmmdev_mtx); 2734 return (DDI_SUCCESS); 2735 2736 fail: 2737 if (vmm_loaded) { 2738 VERIFY0(vmm_mod_unload()); 2739 } 2740 if (reg != NULL) { 2741 hma_unregister(reg); 2742 } 2743 vmm_sol_glue_cleanup(); 2744 mutex_exit(&vmmdev_mtx); 2745 return (DDI_FAILURE); 2746 } 2747 2748 static int 2749 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2750 { 2751 if (cmd != DDI_DETACH) { 2752 return (DDI_FAILURE); 2753 } 2754 2755 /* 2756 * Ensure that all resources have been cleaned up. 2757 * 2758 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 2759 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 2760 * devinfo locked as iommu_cleanup() tries to recursively lock each 2761 * devinfo, including our own, while holding vmmdev_mtx. 2762 */ 2763 if (mutex_tryenter(&vmmdev_mtx) == 0) 2764 return (DDI_FAILURE); 2765 2766 mutex_enter(&vmm_mtx); 2767 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { 2768 mutex_exit(&vmm_mtx); 2769 mutex_exit(&vmmdev_mtx); 2770 return (DDI_FAILURE); 2771 } 2772 mutex_exit(&vmm_mtx); 2773 2774 if (!vmmr_is_empty()) { 2775 mutex_exit(&vmmdev_mtx); 2776 return (DDI_FAILURE); 2777 } 2778 2779 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 2780 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 2781 mutex_exit(&vmmdev_mtx); 2782 return (DDI_FAILURE); 2783 } 2784 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 2785 2786 /* Remove the control node. 
*/ 2787 ddi_remove_minor_node(dip, "ctl"); 2788 vmmdev_dip = NULL; 2789 2790 VERIFY0(vmm_mod_unload()); 2791 VERIFY3U(vmmdev_hma_reg, ==, NULL); 2792 vmm_sol_glue_cleanup(); 2793 2794 mutex_exit(&vmmdev_mtx); 2795 2796 return (DDI_SUCCESS); 2797 } 2798 2799 static struct cb_ops vmm_cb_ops = { 2800 vmm_open, 2801 vmm_close, 2802 nodev, /* strategy */ 2803 nodev, /* print */ 2804 nodev, /* dump */ 2805 nodev, /* read */ 2806 nodev, /* write */ 2807 vmm_ioctl, 2808 nodev, /* devmap */ 2809 nodev, /* mmap */ 2810 vmm_segmap, 2811 nochpoll, /* poll */ 2812 ddi_prop_op, 2813 NULL, 2814 D_NEW | D_MP | D_DEVMAP 2815 }; 2816 2817 static struct dev_ops vmm_ops = { 2818 DEVO_REV, 2819 0, 2820 vmm_info, 2821 nulldev, /* identify */ 2822 nulldev, /* probe */ 2823 vmm_attach, 2824 vmm_detach, 2825 nodev, /* reset */ 2826 &vmm_cb_ops, 2827 (struct bus_ops *)NULL 2828 }; 2829 2830 static struct modldrv modldrv = { 2831 &mod_driverops, 2832 "bhyve vmm", 2833 &vmm_ops 2834 }; 2835 2836 static struct modlinkage modlinkage = { 2837 MODREV_1, 2838 &modldrv, 2839 NULL 2840 }; 2841 2842 int 2843 _init(void) 2844 { 2845 int error; 2846 2847 sysinit(); 2848 2849 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 2850 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 2851 list_create(&vmm_list, sizeof (vmm_softc_t), 2852 offsetof(vmm_softc_t, vmm_node)); 2853 list_create(&vmm_destroy_list, sizeof (vmm_softc_t), 2854 offsetof(vmm_softc_t, vmm_node)); 2855 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 2856 2857 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 2858 if (error) { 2859 return (error); 2860 } 2861 2862 vmm_zsd_init(); 2863 vmmr_init(); 2864 2865 error = mod_install(&modlinkage); 2866 if (error) { 2867 ddi_soft_state_fini(&vmm_statep); 2868 vmm_zsd_fini(); 2869 vmmr_fini(); 2870 } 2871 2872 return (error); 2873 } 2874 2875 int 2876 _fini(void) 2877 { 2878 int error; 2879 2880 error = mod_remove(&modlinkage); 2881 if (error) { 2882 return 
(error); 2883 } 2884 2885 vmm_zsd_fini(); 2886 vmmr_fini(); 2887 2888 ddi_soft_state_fini(&vmm_statep); 2889 2890 return (0); 2891 } 2892 2893 int 2894 _info(struct modinfo *modinfop) 2895 { 2896 return (mod_info(&modlinkage, modinfop)); 2897 } 2898