/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

static kmutex_t         vmmdev_mtx;
static dev_info_t       *vmmdev_dip;
static hma_reg_t        *vmmdev_hma_reg;
static uint_t           vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t         vmm_mtx;
static list_t           vmm_list;
static id_space_t       *vmm_minors;
static void             *vmm_statep;

/*
 * Until device emulation in bhyve had been adequately scrutinized and tested,
 * there was (justified) concern that unusual or corrupt device state payloads
 * could crash the host when loaded via the vmm-data interface.
 *
 * Now that those concerns have been mitigated, this protection is loosened to
 * default-allow, but the switch is left in place, in case there is a need to
 * once again clamp down on vmm-data writes.
 */
int vmm_allow_state_writes = 1;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define VMM_SDEV_ROOT   "/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
        list_node_t     vmh_node;
        vmm_softc_t     *vmh_sc;
        boolean_t       vmh_release_req;
        uint_t          vmh_ioport_hook_cnt;
};

struct vmm_lease {
        list_node_t     vml_node;
        struct vm       *vml_vm;
        vm_client_t     *vml_vmclient;
        boolean_t       vml_expired;
        boolean_t       vml_break_deferred;
        boolean_t       (*vml_expire_func)(void *);
        void            *vml_expire_arg;
        struct vmm_hold *vml_hold;
};

/* Options for vmm_destroy_locked */
typedef enum vmm_destroy_opts {
        VDO_DEFAULT             = 0,
        /*
         * Indicate that zone-specific data associated with this VM should not
         * be cleaned up as part of the destroy. Skipping ZSD clean-up is
         * necessary when the VM is being destroyed as part of zone
         * destruction, since said ZSD is already being cleaned up then.
         */
        VDO_NO_CLEAN_ZSD        = (1 << 0),
        /*
         * Attempt to wait for VM destruction to complete. This is opt-in,
         * since there are many normal conditions which could lead to
         * destruction being stalled pending other clean-up.
         */
        VDO_ATTEMPT_WAIT        = (1 << 1),
} vmm_destroy_opts_t;

static void vmm_hma_release(void);
static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner. Direct access to the underlying vm memory
 * segments is exposed by placing them in a range of offsets beyond the normal
 * guest memory space. Userspace can query the appropriate offset to mmap()
 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
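 *
 * As a rough illustration only (not taken from any real consumer; 'vmfd',
 * 'segid', and 'seg_len' are assumed to be in hand), a userspace process
 * could map such a segment along these lines:
 *
 *        struct vm_devmem_offset vdo = { .segid = segid };
 *
 *        if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *                void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *                    MAP_SHARED, vmfd, vdo.offset);
 *        }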
159 */ 160 161 static vmm_devmem_entry_t * 162 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 163 { 164 vmm_devmem_entry_t *ent = NULL; 165 list_t *dl = &sc->vmm_devmem_list; 166 167 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 168 if (ent->vde_segid == segid) { 169 return (ent); 170 } 171 } 172 return (NULL); 173 } 174 175 static int 176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 177 { 178 int error; 179 bool sysmem; 180 181 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 182 NULL); 183 if (error || mseg->len == 0) 184 return (error); 185 186 if (!sysmem) { 187 vmm_devmem_entry_t *de; 188 189 de = vmmdev_devmem_find(sc, mseg->segid); 190 if (de != NULL) { 191 (void) strlcpy(mseg->name, de->vde_name, 192 sizeof (mseg->name)); 193 } 194 } else { 195 bzero(mseg->name, sizeof (mseg->name)); 196 } 197 198 return (error); 199 } 200 201 static int 202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 203 { 204 off_t map_offset; 205 vmm_devmem_entry_t *entry; 206 207 if (list_is_empty(&sc->vmm_devmem_list)) { 208 map_offset = VM_DEVMEM_START; 209 } else { 210 entry = list_tail(&sc->vmm_devmem_list); 211 map_offset = entry->vde_off + entry->vde_len; 212 if (map_offset < entry->vde_off) { 213 /* Do not tolerate overflow */ 214 return (ERANGE); 215 } 216 /* 217 * XXXJOY: We could choose to search the list for duplicate 218 * names and toss an error. Since we're using the offset 219 * method for now, it does not make much of a difference. 220 */ 221 } 222 223 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 224 entry->vde_segid = mseg->segid; 225 entry->vde_len = mseg->len; 226 entry->vde_off = map_offset; 227 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 228 list_insert_tail(&sc->vmm_devmem_list, entry); 229 230 return (0); 231 } 232 233 static boolean_t 234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 235 off_t *map_offp) 236 { 237 list_t *dl = &sc->vmm_devmem_list; 238 vmm_devmem_entry_t *de = NULL; 239 const off_t map_end = off + len; 240 241 VERIFY(off >= VM_DEVMEM_START); 242 243 if (map_end < off) { 244 /* No match on overflow */ 245 return (B_FALSE); 246 } 247 248 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 249 const off_t item_end = de->vde_off + de->vde_len; 250 251 if (de->vde_off <= off && item_end >= map_end) { 252 *segidp = de->vde_segid; 253 *map_offp = off - de->vde_off; 254 return (B_TRUE); 255 } 256 } 257 return (B_FALSE); 258 } 259 260 /* 261 * When an instance is being destroyed, the devmem list of named memory objects 262 * can be torn down, as no new mappings are allowed. 263 */ 264 static void 265 vmmdev_devmem_purge(vmm_softc_t *sc) 266 { 267 vmm_devmem_entry_t *entry; 268 269 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 270 kmem_free(entry, sizeof (*entry)); 271 } 272 } 273 274 static int 275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 276 { 277 int error; 278 bool sysmem = true; 279 280 if (VM_MEMSEG_NAME(mseg)) { 281 sysmem = false; 282 } 283 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 284 285 if (error == 0) { 286 /* 287 * Rather than create a whole fresh device from which userspace 288 * can mmap this segment, instead make it available at an 289 * offset above where the main guest memory resides. 
                 */
                error = vmmdev_devmem_create(sc, mseg, mseg->name);
                if (error != 0) {
                        vm_free_memseg(sc->vmm_vm, mseg->segid);
                }
        }
        return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running. As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs. Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve. Common operations, such as
 * running the vCPUs, steer clear of lock contention. The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process. In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is
 * required. This was solved by forcing those operations to lock the
 * VM_MAXCPU-1 vCPU. It does mean that this class of operations will be
 * serialized on locking that specific vCPU and that instances sized at
 * VM_MAXCPU will potentially see undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve. Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the write
 * lock, which also implies locking all of the vCPUs and waiting for all read
 * lock holders to release. While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time. Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */

static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
        ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

        /*
         * Since this state transition uses from_idle=true, it should not
         * fail, but rather block until it can be successful.
347 */ 348 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 349 } 350 351 static void 352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 353 { 354 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 355 356 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 357 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 358 } 359 360 static void 361 vmm_read_lock(vmm_softc_t *sc) 362 { 363 rw_enter(&sc->vmm_rwlock, RW_READER); 364 } 365 366 static void 367 vmm_read_unlock(vmm_softc_t *sc) 368 { 369 rw_exit(&sc->vmm_rwlock); 370 } 371 372 static void 373 vmm_write_lock(vmm_softc_t *sc) 374 { 375 int maxcpus; 376 377 /* First lock all the vCPUs */ 378 maxcpus = vm_get_maxcpus(sc->vmm_vm); 379 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 380 vcpu_lock_one(sc, vcpu); 381 } 382 383 /* 384 * Block vmm_drv leases from being acquired or held while the VM write 385 * lock is held. 386 */ 387 vmm_lease_block(sc); 388 389 rw_enter(&sc->vmm_rwlock, RW_WRITER); 390 /* 391 * For now, the 'maxcpus' value for an instance is fixed at the 392 * compile-time constant of VM_MAXCPU at creation. If this changes in 393 * the future, allowing for dynamic vCPU resource sizing, acquisition 394 * of the write lock will need to be wary of such changes. 395 */ 396 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 397 } 398 399 static void 400 vmm_write_unlock(vmm_softc_t *sc) 401 { 402 int maxcpus; 403 404 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 405 vmm_lease_unblock(sc); 406 407 /* 408 * The VM write lock _must_ be released from the same thread it was 409 * acquired in, unlike the read lock. 410 */ 411 VERIFY(rw_write_held(&sc->vmm_rwlock)); 412 rw_exit(&sc->vmm_rwlock); 413 414 /* Unlock all the vCPUs */ 415 maxcpus = vm_get_maxcpus(sc->vmm_vm); 416 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 417 vcpu_unlock_one(sc, vcpu); 418 } 419 } 420 421 static int 422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 423 cred_t *credp, int *rvalp) 424 { 425 int error = 0, vcpu = -1; 426 void *datap = (void *)arg; 427 enum vm_lock_type { 428 LOCK_NONE = 0, 429 LOCK_VCPU, 430 LOCK_READ_HOLD, 431 LOCK_WRITE_HOLD 432 } lock_type = LOCK_NONE; 433 434 /* Acquire any exclusion resources needed for the operation. */ 435 switch (cmd) { 436 case VM_RUN: 437 case VM_GET_REGISTER: 438 case VM_SET_REGISTER: 439 case VM_GET_SEGMENT_DESCRIPTOR: 440 case VM_SET_SEGMENT_DESCRIPTOR: 441 case VM_GET_REGISTER_SET: 442 case VM_SET_REGISTER_SET: 443 case VM_INJECT_EXCEPTION: 444 case VM_GET_CAPABILITY: 445 case VM_SET_CAPABILITY: 446 case VM_PPTDEV_MSI: 447 case VM_PPTDEV_MSIX: 448 case VM_SET_X2APIC_STATE: 449 case VM_GLA2GPA: 450 case VM_GLA2GPA_NOFAULT: 451 case VM_ACTIVATE_CPU: 452 case VM_SET_INTINFO: 453 case VM_GET_INTINFO: 454 case VM_RESTART_INSTRUCTION: 455 case VM_SET_KERNEMU_DEV: 456 case VM_GET_KERNEMU_DEV: 457 case VM_RESET_CPU: 458 case VM_GET_RUN_STATE: 459 case VM_SET_RUN_STATE: 460 case VM_GET_FPU: 461 case VM_SET_FPU: 462 case VM_GET_CPUID: 463 case VM_SET_CPUID: 464 case VM_LEGACY_CPUID: 465 /* 466 * Copy in the ID of the vCPU chosen for this operation. 467 * Since a nefarious caller could update their struct between 468 * this locking and when the rest of the ioctl data is copied 469 * in, it is _critical_ that this local 'vcpu' variable be used 470 * rather than the in-struct one when performing the ioctl. 
471 */ 472 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 473 return (EFAULT); 474 } 475 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) { 476 return (EINVAL); 477 } 478 vcpu_lock_one(sc, vcpu); 479 lock_type = LOCK_VCPU; 480 break; 481 482 case VM_REINIT: 483 case VM_BIND_PPTDEV: 484 case VM_UNBIND_PPTDEV: 485 case VM_MAP_PPTDEV_MMIO: 486 case VM_UNMAP_PPTDEV_MMIO: 487 case VM_ALLOC_MEMSEG: 488 case VM_MMAP_MEMSEG: 489 case VM_MUNMAP_MEMSEG: 490 case VM_WRLOCK_CYCLE: 491 case VM_PMTMR_LOCATE: 492 case VM_PAUSE: 493 case VM_RESUME: 494 vmm_write_lock(sc); 495 lock_type = LOCK_WRITE_HOLD; 496 break; 497 498 case VM_GET_MEMSEG: 499 case VM_MMAP_GETNEXT: 500 case VM_LAPIC_IRQ: 501 case VM_INJECT_NMI: 502 case VM_IOAPIC_ASSERT_IRQ: 503 case VM_IOAPIC_DEASSERT_IRQ: 504 case VM_IOAPIC_PULSE_IRQ: 505 case VM_LAPIC_MSI: 506 case VM_LAPIC_LOCAL_IRQ: 507 case VM_GET_X2APIC_STATE: 508 case VM_RTC_READ: 509 case VM_RTC_WRITE: 510 case VM_RTC_SETTIME: 511 case VM_RTC_GETTIME: 512 case VM_PPTDEV_DISABLE_MSIX: 513 case VM_DEVMEM_GETOFFSET: 514 case VM_TRACK_DIRTY_PAGES: 515 case VM_NPT_OPERATION: 516 vmm_read_lock(sc); 517 lock_type = LOCK_READ_HOLD; 518 break; 519 520 case VM_DATA_READ: 521 case VM_DATA_WRITE: 522 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 523 return (EFAULT); 524 } 525 if (vcpu == -1) { 526 /* Access data for VM-wide devices */ 527 vmm_write_lock(sc); 528 lock_type = LOCK_WRITE_HOLD; 529 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 530 /* Access data associated with a specific vCPU */ 531 vcpu_lock_one(sc, vcpu); 532 lock_type = LOCK_VCPU; 533 } else { 534 return (EINVAL); 535 } 536 break; 537 538 case VM_GET_GPA_PMAP: 539 case VM_IOAPIC_PINCOUNT: 540 case VM_SUSPEND: 541 case VM_DESC_FPU_AREA: 542 case VM_SET_AUTODESTRUCT: 543 case VM_DESTROY_SELF: 544 case VM_DESTROY_PENDING: 545 case VM_VCPU_BARRIER: 546 default: 547 break; 548 } 549 550 /* Execute the primary logic for the ioctl. */ 551 switch (cmd) { 552 case VM_RUN: { 553 struct vm_entry entry; 554 555 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 556 error = EFAULT; 557 break; 558 } 559 560 if (!(curthread->t_schedflag & TS_VCPU)) 561 smt_mark_as_vcpu(); 562 563 error = vm_run(sc->vmm_vm, vcpu, &entry); 564 565 /* 566 * Unexpected states in vm_run() are expressed through positive 567 * errno-oriented return values. VM states which expect further 568 * processing in userspace (necessary context via exitinfo) are 569 * expressed through negative return values. For the time being 570 * a return value of 0 is not expected from vm_run(). 571 */ 572 ASSERT(error != 0); 573 if (error < 0) { 574 const struct vm_exit *vme; 575 void *outp = entry.exit_data; 576 577 error = 0; 578 vme = vm_exitinfo(sc->vmm_vm, vcpu); 579 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 580 error = EFAULT; 581 } 582 } 583 break; 584 } 585 case VM_SUSPEND: { 586 struct vm_suspend vmsuspend; 587 588 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 589 error = EFAULT; 590 break; 591 } 592 error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source); 593 break; 594 } 595 case VM_REINIT: { 596 struct vm_reinit reinit; 597 598 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 599 error = EFAULT; 600 break; 601 } 602 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 603 /* 604 * The VM instance should be free of driver-attached 605 * hooks during the reinitialization process. 
606 */ 607 break; 608 } 609 error = vm_reinit(sc->vmm_vm, reinit.flags); 610 (void) vmm_drv_block_hook(sc, B_FALSE); 611 break; 612 } 613 case VM_STAT_DESC: { 614 struct vm_stat_desc statdesc; 615 616 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 617 error = EFAULT; 618 break; 619 } 620 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 621 sizeof (statdesc.desc)); 622 if (error == 0 && 623 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 624 error = EFAULT; 625 break; 626 } 627 break; 628 } 629 case VM_STATS_IOC: { 630 struct vm_stats vmstats; 631 632 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 633 error = EFAULT; 634 break; 635 } 636 hrt2tv(gethrtime(), &vmstats.tv); 637 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 638 nitems(vmstats.statbuf), 639 &vmstats.num_entries, vmstats.statbuf); 640 if (error == 0 && 641 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 642 error = EFAULT; 643 break; 644 } 645 break; 646 } 647 648 case VM_PPTDEV_MSI: { 649 struct vm_pptdev_msi pptmsi; 650 651 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 652 error = EFAULT; 653 break; 654 } 655 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 656 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 657 break; 658 } 659 case VM_PPTDEV_MSIX: { 660 struct vm_pptdev_msix pptmsix; 661 662 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 663 error = EFAULT; 664 break; 665 } 666 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 667 pptmsix.idx, pptmsix.addr, pptmsix.msg, 668 pptmsix.vector_control); 669 break; 670 } 671 case VM_PPTDEV_DISABLE_MSIX: { 672 struct vm_pptdev pptdev; 673 674 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 675 error = EFAULT; 676 break; 677 } 678 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 679 break; 680 } 681 case VM_MAP_PPTDEV_MMIO: { 682 struct vm_pptdev_mmio pptmmio; 683 684 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 685 error = EFAULT; 686 break; 687 } 688 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 689 pptmmio.len, pptmmio.hpa); 690 break; 691 } 692 case VM_UNMAP_PPTDEV_MMIO: { 693 struct vm_pptdev_mmio pptmmio; 694 695 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 696 error = EFAULT; 697 break; 698 } 699 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 700 pptmmio.len); 701 break; 702 } 703 case VM_BIND_PPTDEV: { 704 struct vm_pptdev pptdev; 705 706 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 707 error = EFAULT; 708 break; 709 } 710 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 711 break; 712 } 713 case VM_UNBIND_PPTDEV: { 714 struct vm_pptdev pptdev; 715 716 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 717 error = EFAULT; 718 break; 719 } 720 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 721 break; 722 } 723 case VM_GET_PPTDEV_LIMITS: { 724 struct vm_pptdev_limits pptlimits; 725 726 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 727 error = EFAULT; 728 break; 729 } 730 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 731 &pptlimits.msi_limit, &pptlimits.msix_limit); 732 if (error == 0 && 733 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 734 error = EFAULT; 735 break; 736 } 737 break; 738 } 739 case VM_INJECT_EXCEPTION: { 740 struct vm_exception vmexc; 741 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 742 error = EFAULT; 743 break; 744 } 745 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 746 vmexc.error_code_valid != 0, vmexc.error_code, 747 
vmexc.restart_instruction != 0); 748 break; 749 } 750 case VM_INJECT_NMI: { 751 struct vm_nmi vmnmi; 752 753 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 754 error = EFAULT; 755 break; 756 } 757 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 758 break; 759 } 760 case VM_LAPIC_IRQ: { 761 struct vm_lapic_irq vmirq; 762 763 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 764 error = EFAULT; 765 break; 766 } 767 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 768 break; 769 } 770 case VM_LAPIC_LOCAL_IRQ: { 771 struct vm_lapic_irq vmirq; 772 773 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 774 error = EFAULT; 775 break; 776 } 777 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 778 vmirq.vector); 779 break; 780 } 781 case VM_LAPIC_MSI: { 782 struct vm_lapic_msi vmmsi; 783 784 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 785 error = EFAULT; 786 break; 787 } 788 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 789 break; 790 } 791 792 case VM_IOAPIC_ASSERT_IRQ: { 793 struct vm_ioapic_irq ioapic_irq; 794 795 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 796 error = EFAULT; 797 break; 798 } 799 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 800 break; 801 } 802 case VM_IOAPIC_DEASSERT_IRQ: { 803 struct vm_ioapic_irq ioapic_irq; 804 805 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 806 error = EFAULT; 807 break; 808 } 809 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 810 break; 811 } 812 case VM_IOAPIC_PULSE_IRQ: { 813 struct vm_ioapic_irq ioapic_irq; 814 815 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 816 error = EFAULT; 817 break; 818 } 819 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 820 break; 821 } 822 case VM_IOAPIC_PINCOUNT: { 823 int pincount; 824 825 pincount = vioapic_pincount(sc->vmm_vm); 826 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 827 error = EFAULT; 828 break; 829 } 830 break; 831 } 832 case VM_DESC_FPU_AREA: { 833 struct vm_fpu_desc desc; 834 void *buf = NULL; 835 836 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 837 error = EFAULT; 838 break; 839 } 840 if (desc.vfd_num_entries > 64) { 841 error = EINVAL; 842 break; 843 } 844 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 845 desc.vfd_num_entries; 846 if (buf_sz != 0) { 847 buf = kmem_zalloc(buf_sz, KM_SLEEP); 848 } 849 850 /* 851 * For now, we are depending on vm_fpu_desc_entry and 852 * hma_xsave_state_desc_t having the same format. 853 */ 854 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 855 sizeof (hma_xsave_state_desc_t)); 856 857 size_t req_size; 858 const uint_t max_entries = hma_fpu_describe_xsave_state( 859 (hma_xsave_state_desc_t *)buf, 860 desc.vfd_num_entries, 861 &req_size); 862 863 desc.vfd_req_size = req_size; 864 desc.vfd_num_entries = max_entries; 865 if (buf_sz != 0) { 866 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 867 error = EFAULT; 868 } 869 kmem_free(buf, buf_sz); 870 } 871 872 if (error == 0) { 873 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 874 error = EFAULT; 875 } 876 } 877 break; 878 } 879 case VM_SET_AUTODESTRUCT: { 880 /* 881 * Since this has to do with controlling the lifetime of the 882 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 883 * than the vcpu-centric or rwlock exclusion mechanisms. 
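                 *
                 * Usage sketch (hypothetical caller; 'vmfd' is an assumed
                 * open instance fd): the ioctl argument is consumed directly
                 * rather than copied in, so enabling the behavior is simply
                 *
                 *        (void) ioctl(vmfd, VM_SET_AUTODESTRUCT, 1);
                 *
                 * and passing 0 clears the flag again.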
884 */ 885 mutex_enter(&vmm_mtx); 886 if (arg != 0) { 887 sc->vmm_flags |= VMM_AUTODESTROY; 888 } else { 889 sc->vmm_flags &= ~VMM_AUTODESTROY; 890 } 891 mutex_exit(&vmm_mtx); 892 break; 893 } 894 case VM_DESTROY_SELF: { 895 bool hma_release = false; 896 897 /* 898 * Just like VMM_DESTROY_VM, but on the instance file descriptor 899 * itself, rather than having to perform a racy name lookup as 900 * part of the destroy process. 901 * 902 * Since vmm_destroy_locked() performs vCPU lock acquisition in 903 * order to kick the vCPUs out of guest context as part of any 904 * destruction, we do not need to worry about it ourself using 905 * the `lock_type` logic here. 906 */ 907 mutex_enter(&vmm_mtx); 908 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 909 mutex_exit(&vmm_mtx); 910 if (hma_release) { 911 vmm_hma_release(); 912 } 913 break; 914 } 915 case VM_DESTROY_PENDING: { 916 /* 917 * If we have made it this far, then destruction of the instance 918 * has not been initiated. 919 */ 920 *rvalp = 0; 921 break; 922 } 923 924 case VM_ISA_ASSERT_IRQ: { 925 struct vm_isa_irq isa_irq; 926 927 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 928 error = EFAULT; 929 break; 930 } 931 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 932 if (error == 0 && isa_irq.ioapic_irq != -1) { 933 error = vioapic_assert_irq(sc->vmm_vm, 934 isa_irq.ioapic_irq); 935 } 936 break; 937 } 938 case VM_ISA_DEASSERT_IRQ: { 939 struct vm_isa_irq isa_irq; 940 941 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 942 error = EFAULT; 943 break; 944 } 945 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 946 if (error == 0 && isa_irq.ioapic_irq != -1) { 947 error = vioapic_deassert_irq(sc->vmm_vm, 948 isa_irq.ioapic_irq); 949 } 950 break; 951 } 952 case VM_ISA_PULSE_IRQ: { 953 struct vm_isa_irq isa_irq; 954 955 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 956 error = EFAULT; 957 break; 958 } 959 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 960 if (error == 0 && isa_irq.ioapic_irq != -1) { 961 error = vioapic_pulse_irq(sc->vmm_vm, 962 isa_irq.ioapic_irq); 963 } 964 break; 965 } 966 case VM_ISA_SET_IRQ_TRIGGER: { 967 struct vm_isa_irq_trigger isa_irq_trigger; 968 969 if (ddi_copyin(datap, &isa_irq_trigger, 970 sizeof (isa_irq_trigger), md)) { 971 error = EFAULT; 972 break; 973 } 974 error = vatpic_set_irq_trigger(sc->vmm_vm, 975 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 976 break; 977 } 978 979 case VM_MMAP_GETNEXT: { 980 struct vm_memmap mm; 981 982 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 983 error = EFAULT; 984 break; 985 } 986 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 987 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 988 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 989 error = EFAULT; 990 break; 991 } 992 break; 993 } 994 case VM_MMAP_MEMSEG: { 995 struct vm_memmap mm; 996 997 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 998 error = EFAULT; 999 break; 1000 } 1001 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 1002 mm.len, mm.prot, mm.flags); 1003 break; 1004 } 1005 case VM_MUNMAP_MEMSEG: { 1006 struct vm_munmap mu; 1007 1008 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 1009 error = EFAULT; 1010 break; 1011 } 1012 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1013 break; 1014 } 1015 case VM_ALLOC_MEMSEG: { 1016 struct vm_memseg vmseg; 1017 1018 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1019 error = EFAULT; 1020 break; 1021 } 1022 error = vmmdev_alloc_memseg(sc, &vmseg); 1023 
break; 1024 } 1025 case VM_GET_MEMSEG: { 1026 struct vm_memseg vmseg; 1027 1028 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1029 error = EFAULT; 1030 break; 1031 } 1032 error = vmmdev_get_memseg(sc, &vmseg); 1033 if (error == 0 && 1034 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 1035 error = EFAULT; 1036 break; 1037 } 1038 break; 1039 } 1040 case VM_GET_REGISTER: { 1041 struct vm_register vmreg; 1042 1043 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1044 error = EFAULT; 1045 break; 1046 } 1047 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 1048 &vmreg.regval); 1049 if (error == 0 && 1050 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 1051 error = EFAULT; 1052 break; 1053 } 1054 break; 1055 } 1056 case VM_SET_REGISTER: { 1057 struct vm_register vmreg; 1058 1059 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1060 error = EFAULT; 1061 break; 1062 } 1063 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 1064 vmreg.regval); 1065 break; 1066 } 1067 case VM_SET_SEGMENT_DESCRIPTOR: { 1068 struct vm_seg_desc vmsegd; 1069 1070 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1071 error = EFAULT; 1072 break; 1073 } 1074 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1075 &vmsegd.desc); 1076 break; 1077 } 1078 case VM_GET_SEGMENT_DESCRIPTOR: { 1079 struct vm_seg_desc vmsegd; 1080 1081 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1082 error = EFAULT; 1083 break; 1084 } 1085 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1086 &vmsegd.desc); 1087 if (error == 0 && 1088 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 1089 error = EFAULT; 1090 break; 1091 } 1092 break; 1093 } 1094 case VM_GET_REGISTER_SET: { 1095 struct vm_register_set vrs; 1096 int regnums[VM_REG_LAST]; 1097 uint64_t regvals[VM_REG_LAST]; 1098 1099 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1100 error = EFAULT; 1101 break; 1102 } 1103 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1104 error = EINVAL; 1105 break; 1106 } 1107 if (ddi_copyin(vrs.regnums, regnums, 1108 sizeof (int) * vrs.count, md)) { 1109 error = EFAULT; 1110 break; 1111 } 1112 1113 error = 0; 1114 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1115 if (regnums[i] < 0) { 1116 error = EINVAL; 1117 break; 1118 } 1119 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1120 ®vals[i]); 1121 } 1122 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1123 sizeof (uint64_t) * vrs.count, md)) { 1124 error = EFAULT; 1125 } 1126 break; 1127 } 1128 case VM_SET_REGISTER_SET: { 1129 struct vm_register_set vrs; 1130 int regnums[VM_REG_LAST]; 1131 uint64_t regvals[VM_REG_LAST]; 1132 1133 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1134 error = EFAULT; 1135 break; 1136 } 1137 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1138 error = EINVAL; 1139 break; 1140 } 1141 if (ddi_copyin(vrs.regnums, regnums, 1142 sizeof (int) * vrs.count, md)) { 1143 error = EFAULT; 1144 break; 1145 } 1146 if (ddi_copyin(vrs.regvals, regvals, 1147 sizeof (uint64_t) * vrs.count, md)) { 1148 error = EFAULT; 1149 break; 1150 } 1151 1152 error = 0; 1153 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1154 /* 1155 * Setting registers in a set is not atomic, since a 1156 * failure in the middle of the set will cause a 1157 * bail-out and inconsistent register state. Callers 1158 * should be wary of this. 
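                         *
                         * One caller-side mitigation (a sketch of a pattern,
                         * not something this driver enforces) is to follow a
                         * failed VM_SET_REGISTER_SET with a
                         * VM_GET_REGISTER_SET over the same regnums, to
                         * observe which values actually landed before
                         * retrying.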
1159 */ 1160 if (regnums[i] < 0) { 1161 error = EINVAL; 1162 break; 1163 } 1164 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1165 regvals[i]); 1166 } 1167 break; 1168 } 1169 case VM_RESET_CPU: { 1170 struct vm_vcpu_reset vvr; 1171 1172 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1173 error = EFAULT; 1174 break; 1175 } 1176 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1177 error = EINVAL; 1178 } 1179 1180 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1181 break; 1182 } 1183 case VM_GET_RUN_STATE: { 1184 struct vm_run_state vrs; 1185 1186 bzero(&vrs, sizeof (vrs)); 1187 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1188 &vrs.sipi_vector); 1189 if (error == 0) { 1190 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1191 error = EFAULT; 1192 break; 1193 } 1194 } 1195 break; 1196 } 1197 case VM_SET_RUN_STATE: { 1198 struct vm_run_state vrs; 1199 1200 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1201 error = EFAULT; 1202 break; 1203 } 1204 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1205 vrs.sipi_vector); 1206 break; 1207 } 1208 case VM_GET_FPU: { 1209 struct vm_fpu_state req; 1210 const size_t max_len = (PAGESIZE * 2); 1211 void *kbuf; 1212 1213 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1214 error = EFAULT; 1215 break; 1216 } 1217 if (req.len > max_len || req.len == 0) { 1218 error = EINVAL; 1219 break; 1220 } 1221 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1222 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1223 if (error == 0) { 1224 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1225 error = EFAULT; 1226 } 1227 } 1228 kmem_free(kbuf, req.len); 1229 break; 1230 } 1231 case VM_SET_FPU: { 1232 struct vm_fpu_state req; 1233 const size_t max_len = (PAGESIZE * 2); 1234 void *kbuf; 1235 1236 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1237 error = EFAULT; 1238 break; 1239 } 1240 if (req.len > max_len || req.len == 0) { 1241 error = EINVAL; 1242 break; 1243 } 1244 kbuf = kmem_alloc(req.len, KM_SLEEP); 1245 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1246 error = EFAULT; 1247 } else { 1248 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1249 } 1250 kmem_free(kbuf, req.len); 1251 break; 1252 } 1253 case VM_GET_CPUID: { 1254 struct vm_vcpu_cpuid_config cfg; 1255 struct vcpu_cpuid_entry *entries = NULL; 1256 1257 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1258 error = EFAULT; 1259 break; 1260 } 1261 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1262 error = EINVAL; 1263 break; 1264 } 1265 1266 const size_t entries_size = 1267 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1268 if (entries_size != 0) { 1269 entries = kmem_zalloc(entries_size, KM_SLEEP); 1270 } 1271 1272 vcpu_cpuid_config_t vm_cfg = { 1273 .vcc_nent = cfg.vvcc_nent, 1274 .vcc_entries = entries, 1275 }; 1276 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1277 1278 /* 1279 * Only attempt to copy out the resultant entries if we were 1280 * able to query them from the instance. The flags and number 1281 * of entries are emitted regardless. 
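                 *
                 * This should permit a two-call sizing pattern from userspace
                 * (a sketch only, assuming vm_get_cpuid() reports the
                 * required entry count when handed too small a buffer): issue
                 * VM_GET_CPUID once with vvcc_nent = 0 and no entry buffer,
                 * read back the updated vvcc_nent, allocate that many
                 * vcpu_cpuid_entry structures into vvcc_entries, and repeat
                 * the ioctl.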
1282 */ 1283 cfg.vvcc_flags = vm_cfg.vcc_flags; 1284 cfg.vvcc_nent = vm_cfg.vcc_nent; 1285 if (entries != NULL) { 1286 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1287 entries_size, md) != 0) { 1288 error = EFAULT; 1289 } 1290 1291 kmem_free(entries, entries_size); 1292 } 1293 1294 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1295 error = EFAULT; 1296 } 1297 break; 1298 } 1299 case VM_SET_CPUID: { 1300 struct vm_vcpu_cpuid_config cfg; 1301 struct vcpu_cpuid_entry *entries = NULL; 1302 size_t entries_size = 0; 1303 1304 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1305 error = EFAULT; 1306 break; 1307 } 1308 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1309 error = EFBIG; 1310 break; 1311 } 1312 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1313 /* 1314 * If we are being instructed to use "legacy" handling, 1315 * then no entries should be provided, since the static 1316 * in-kernel masking will be used. 1317 */ 1318 if (cfg.vvcc_nent != 0) { 1319 error = EINVAL; 1320 break; 1321 } 1322 } else if (cfg.vvcc_nent != 0) { 1323 entries_size = 1324 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1325 entries = kmem_alloc(entries_size, KM_SLEEP); 1326 1327 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1328 md) != 0) { 1329 error = EFAULT; 1330 kmem_free(entries, entries_size); 1331 break; 1332 } 1333 } 1334 1335 vcpu_cpuid_config_t vm_cfg = { 1336 .vcc_flags = cfg.vvcc_flags, 1337 .vcc_nent = cfg.vvcc_nent, 1338 .vcc_entries = entries, 1339 }; 1340 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1341 1342 if (entries != NULL) { 1343 kmem_free(entries, entries_size); 1344 } 1345 break; 1346 } 1347 case VM_LEGACY_CPUID: { 1348 struct vm_legacy_cpuid vlc; 1349 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1350 error = EFAULT; 1351 break; 1352 } 1353 vlc.vlc_vcpuid = vcpu; 1354 1355 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1356 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1357 1358 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1359 error = EFAULT; 1360 break; 1361 } 1362 break; 1363 } 1364 1365 case VM_SET_KERNEMU_DEV: 1366 case VM_GET_KERNEMU_DEV: { 1367 struct vm_readwrite_kernemu_device kemu; 1368 size_t size = 0; 1369 1370 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1371 error = EFAULT; 1372 break; 1373 } 1374 1375 if (kemu.access_width > 3) { 1376 error = EINVAL; 1377 break; 1378 } 1379 size = (1 << kemu.access_width); 1380 ASSERT(size >= 1 && size <= 8); 1381 1382 if (cmd == VM_SET_KERNEMU_DEV) { 1383 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1384 kemu.gpa, kemu.value, size); 1385 } else { 1386 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1387 kemu.gpa, &kemu.value, size); 1388 } 1389 1390 if (error == 0) { 1391 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1392 error = EFAULT; 1393 break; 1394 } 1395 } 1396 break; 1397 } 1398 1399 case VM_GET_CAPABILITY: { 1400 struct vm_capability vmcap; 1401 1402 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1403 error = EFAULT; 1404 break; 1405 } 1406 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1407 &vmcap.capval); 1408 if (error == 0 && 1409 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1410 error = EFAULT; 1411 break; 1412 } 1413 break; 1414 } 1415 case VM_SET_CAPABILITY: { 1416 struct vm_capability vmcap; 1417 1418 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1419 error = EFAULT; 1420 break; 1421 } 1422 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1423 vmcap.capval); 1424 break; 1425 } 1426 case VM_SET_X2APIC_STATE: { 
1427 struct vm_x2apic x2apic; 1428 1429 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1430 error = EFAULT; 1431 break; 1432 } 1433 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1434 break; 1435 } 1436 case VM_GET_X2APIC_STATE: { 1437 struct vm_x2apic x2apic; 1438 1439 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1440 error = EFAULT; 1441 break; 1442 } 1443 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1444 &x2apic.state); 1445 if (error == 0 && 1446 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1447 error = EFAULT; 1448 break; 1449 } 1450 break; 1451 } 1452 case VM_GET_GPA_PMAP: { 1453 /* 1454 * Until there is a necessity to leak EPT/RVI PTE values to 1455 * userspace, this will remain unimplemented 1456 */ 1457 error = EINVAL; 1458 break; 1459 } 1460 case VM_GET_HPET_CAPABILITIES: { 1461 struct vm_hpet_cap hpetcap; 1462 1463 error = vhpet_getcap(&hpetcap); 1464 if (error == 0 && 1465 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1466 error = EFAULT; 1467 break; 1468 } 1469 break; 1470 } 1471 case VM_GLA2GPA: { 1472 struct vm_gla2gpa gg; 1473 1474 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1475 error = EFAULT; 1476 break; 1477 } 1478 gg.vcpuid = vcpu; 1479 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1480 gg.prot, &gg.gpa, &gg.fault); 1481 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1482 error = EFAULT; 1483 break; 1484 } 1485 break; 1486 } 1487 case VM_GLA2GPA_NOFAULT: { 1488 struct vm_gla2gpa gg; 1489 1490 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1491 error = EFAULT; 1492 break; 1493 } 1494 gg.vcpuid = vcpu; 1495 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1496 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1497 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1498 error = EFAULT; 1499 break; 1500 } 1501 break; 1502 } 1503 1504 case VM_ACTIVATE_CPU: 1505 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1506 break; 1507 1508 case VM_SUSPEND_CPU: 1509 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1510 error = EFAULT; 1511 } else { 1512 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1513 } 1514 break; 1515 1516 case VM_RESUME_CPU: 1517 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1518 error = EFAULT; 1519 } else { 1520 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1521 } 1522 break; 1523 1524 case VM_VCPU_BARRIER: 1525 vcpu = arg; 1526 error = vm_vcpu_barrier(sc->vmm_vm, vcpu); 1527 break; 1528 1529 case VM_GET_CPUS: { 1530 struct vm_cpuset vm_cpuset; 1531 cpuset_t tempset; 1532 void *srcp = &tempset; 1533 int size; 1534 1535 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1536 error = EFAULT; 1537 break; 1538 } 1539 1540 /* Be more generous about sizing since our cpuset_t is large. */ 1541 size = vm_cpuset.cpusetsize; 1542 if (size <= 0 || size > sizeof (cpuset_t)) { 1543 error = ERANGE; 1544 } 1545 /* 1546 * If they want a ulong_t or less, make sure they receive the 1547 * low bits with all the useful information. 
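                 *
                 * e.g. a caller interested only in the first 64 vCPUs can
                 * pass a bare uint64_t (illustrative sketch; 'vmfd' and the
                 * exact declared type of the 'cpus' pointer are assumptions
                 * here):
                 *
                 *        uint64_t mask = 0;
                 *        struct vm_cpuset vc = {
                 *                .which = VM_ACTIVE_CPUS,
                 *                .cpusetsize = sizeof (mask),
                 *                .cpus = &mask,
                 *        };
                 *        (void) ioctl(vmfd, VM_GET_CPUS, &vc);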
1548 */ 1549 if (size <= sizeof (tempset.cpub[0])) { 1550 srcp = &tempset.cpub[0]; 1551 } 1552 1553 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1554 tempset = vm_active_cpus(sc->vmm_vm); 1555 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1556 tempset = vm_debug_cpus(sc->vmm_vm); 1557 } else { 1558 error = EINVAL; 1559 } 1560 1561 ASSERT(size > 0 && size <= sizeof (tempset)); 1562 if (error == 0 && 1563 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1564 error = EFAULT; 1565 break; 1566 } 1567 break; 1568 } 1569 case VM_SET_INTINFO: { 1570 struct vm_intinfo vmii; 1571 1572 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1573 error = EFAULT; 1574 break; 1575 } 1576 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1577 break; 1578 } 1579 case VM_GET_INTINFO: { 1580 struct vm_intinfo vmii; 1581 1582 vmii.vcpuid = vcpu; 1583 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1584 &vmii.info2); 1585 if (error == 0 && 1586 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1587 error = EFAULT; 1588 break; 1589 } 1590 break; 1591 } 1592 case VM_RTC_WRITE: { 1593 struct vm_rtc_data rtcdata; 1594 1595 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1596 error = EFAULT; 1597 break; 1598 } 1599 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1600 rtcdata.value); 1601 break; 1602 } 1603 case VM_RTC_READ: { 1604 struct vm_rtc_data rtcdata; 1605 1606 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1607 error = EFAULT; 1608 break; 1609 } 1610 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1611 &rtcdata.value); 1612 if (error == 0 && 1613 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1614 error = EFAULT; 1615 break; 1616 } 1617 break; 1618 } 1619 case VM_RTC_SETTIME: { 1620 timespec_t ts; 1621 1622 if (ddi_copyin(datap, &ts, sizeof (ts), md)) { 1623 error = EFAULT; 1624 break; 1625 } 1626 error = vrtc_set_time(sc->vmm_vm, &ts); 1627 break; 1628 } 1629 case VM_RTC_GETTIME: { 1630 timespec_t ts; 1631 1632 vrtc_get_time(sc->vmm_vm, &ts); 1633 if (ddi_copyout(&ts, datap, sizeof (ts), md)) { 1634 error = EFAULT; 1635 break; 1636 } 1637 break; 1638 } 1639 1640 case VM_PMTMR_LOCATE: { 1641 uint16_t port = arg; 1642 error = vpmtmr_set_location(sc->vmm_vm, port); 1643 break; 1644 } 1645 1646 case VM_RESTART_INSTRUCTION: 1647 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1648 break; 1649 1650 case VM_SET_TOPOLOGY: { 1651 struct vm_cpu_topology topo; 1652 1653 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1654 error = EFAULT; 1655 break; 1656 } 1657 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1658 topo.threads, topo.maxcpus); 1659 break; 1660 } 1661 case VM_GET_TOPOLOGY: { 1662 struct vm_cpu_topology topo; 1663 1664 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1665 &topo.threads, &topo.maxcpus); 1666 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1667 error = EFAULT; 1668 break; 1669 } 1670 break; 1671 } 1672 case VM_DEVMEM_GETOFFSET: { 1673 struct vm_devmem_offset vdo; 1674 vmm_devmem_entry_t *de; 1675 1676 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1677 error = EFAULT; 1678 break; 1679 } 1680 1681 de = vmmdev_devmem_find(sc, vdo.segid); 1682 if (de != NULL) { 1683 vdo.offset = de->vde_off; 1684 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1685 error = EFAULT; 1686 } 1687 } else { 1688 error = ENOENT; 1689 } 1690 break; 1691 } 1692 case VM_TRACK_DIRTY_PAGES: { 1693 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1694 struct vmm_dirty_tracker tracker; 1695 uint8_t *bitmap; 1696 size_t 
len; 1697 1698 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1699 error = EFAULT; 1700 break; 1701 } 1702 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1703 error = EINVAL; 1704 break; 1705 } 1706 if (tracker.vdt_len == 0) { 1707 break; 1708 } 1709 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1710 error = EINVAL; 1711 break; 1712 } 1713 if (tracker.vdt_len > max_track_region_len) { 1714 error = EINVAL; 1715 break; 1716 } 1717 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1718 bitmap = kmem_zalloc(len, KM_SLEEP); 1719 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1720 tracker.vdt_len, bitmap); 1721 if (error == 0 && 1722 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1723 error = EFAULT; 1724 } 1725 kmem_free(bitmap, len); 1726 1727 break; 1728 } 1729 case VM_NPT_OPERATION: { 1730 struct vm_npt_operation vno; 1731 uint8_t *bitmap = NULL; 1732 uint64_t bitmap_size = 0; 1733 1734 if (ddi_copyin(datap, &vno, sizeof (vno), md) != 0) { 1735 error = EFAULT; 1736 break; 1737 } 1738 if ((vno.vno_gpa & PAGEOFFSET) != 0 || 1739 (vno.vno_len & PAGEOFFSET) != 0) { 1740 error = EINVAL; 1741 break; 1742 } 1743 if ((UINT64_MAX - vno.vno_len) < vno.vno_gpa) { 1744 error = EOVERFLOW; 1745 break; 1746 } 1747 1748 /* 1749 * Allocate a bitmap for the operation if it is specified as 1750 * part of the input or output. 1751 */ 1752 if ((vno.vno_operation & 1753 (VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT)) != 0) { 1754 /* 1755 * Operations expecting data to be copied in or out 1756 * should not have zero length. 1757 */ 1758 if (vno.vno_len == 0) { 1759 error = EINVAL; 1760 break; 1761 } 1762 1763 /* 1764 * Maximum bitmap size of 8 pages results in 1 GiB of 1765 * coverage. 1766 */ 1767 const uint64_t max_bitmap_size = 8 * PAGESIZE; 1768 1769 bitmap_size = roundup(vno.vno_len / PAGESIZE, 8) / 8; 1770 if (bitmap_size > max_bitmap_size) { 1771 error = E2BIG; 1772 break; 1773 } 1774 bitmap = kmem_zalloc(bitmap_size, KM_SLEEP); 1775 } 1776 1777 if ((vno.vno_operation & VNO_FLAG_BITMAP_IN) != 0) { 1778 ASSERT(bitmap != NULL); 1779 if (ddi_copyin(vno.vno_bitmap, bitmap, bitmap_size, 1780 md) != 0) { 1781 error = EFAULT; 1782 } 1783 } 1784 1785 if (error == 0) { 1786 error = vm_npt_do_operation(sc->vmm_vm, vno.vno_gpa, 1787 vno.vno_len, vno.vno_operation, bitmap, rvalp); 1788 } 1789 1790 if ((vno.vno_operation & VNO_FLAG_BITMAP_OUT) != 0 && 1791 error == 0) { 1792 ASSERT(bitmap != NULL); 1793 if (ddi_copyout(bitmap, vno.vno_bitmap, bitmap_size, 1794 md) != 0) { 1795 error = EFAULT; 1796 } 1797 } 1798 1799 if (bitmap != NULL) { 1800 kmem_free(bitmap, bitmap_size); 1801 } 1802 1803 break; 1804 } 1805 case VM_WRLOCK_CYCLE: { 1806 /* 1807 * Present a test mechanism to acquire/release the write lock 1808 * on the VM without any other effects. 
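                 *
                 * Illustrative only: a test might simply issue
                 *
                 *        (void) ioctl(vmfd, VM_WRLOCK_CYCLE, 0);
                 *
                 * where 'vmfd' is an assumed open instance fd; the argument
                 * is unused and the lock cycling happens via the lock_type
                 * handling in this function.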
                 */
                break;
        }
        case VM_DATA_READ: {
                struct vm_data_xfer vdx;

                if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
                        error = EFAULT;
                        break;
                }
                if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
                        error = EINVAL;
                        break;
                }
                if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
                        error = EFBIG;
                        break;
                }

                const size_t len = vdx.vdx_len;
                void *buf = NULL;
                if (len != 0) {
                        const void *udata = vdx.vdx_data;

                        buf = kmem_alloc(len, KM_SLEEP);
                        if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
                                bzero(buf, len);
                        } else if (ddi_copyin(udata, buf, len, md) != 0) {
                                kmem_free(buf, len);
                                error = EFAULT;
                                break;
                        }
                }

                vdx.vdx_result_len = 0;
                vmm_data_req_t req = {
                        .vdr_class = vdx.vdx_class,
                        .vdr_version = vdx.vdx_version,
                        .vdr_flags = vdx.vdx_flags,
                        .vdr_len = len,
                        .vdr_data = buf,
                        .vdr_result_len = &vdx.vdx_result_len,
                };
                error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);

                if (error == 0 && buf != NULL) {
                        if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
                                error = EFAULT;
                        }
                }

                /*
                 * Copy out the transfer request so that the value of
                 * vdx_result_len can be made available, regardless of any
                 * error(s) which may have occurred.
                 */
                if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
                        error = (error != 0) ? error : EFAULT;
                }

                if (buf != NULL) {
                        kmem_free(buf, len);
                }
                break;
        }
        case VM_DATA_WRITE: {
                struct vm_data_xfer vdx;

                if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
                        error = EFAULT;
                        break;
                }
                if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
                        error = EINVAL;
                        break;
                }
                if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
                        error = EFBIG;
                        break;
                }

                const size_t len = vdx.vdx_len;
                void *buf = NULL;
                if (len != 0) {
                        buf = kmem_alloc(len, KM_SLEEP);
                        if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
                                kmem_free(buf, len);
                                error = EFAULT;
                                break;
                        }
                }

                vdx.vdx_result_len = 0;
                vmm_data_req_t req = {
                        .vdr_class = vdx.vdx_class,
                        .vdr_version = vdx.vdx_version,
                        .vdr_flags = vdx.vdx_flags,
                        .vdr_len = len,
                        .vdr_data = buf,
                        .vdr_result_len = &vdx.vdx_result_len,
                };
                if (vmm_allow_state_writes != 0) {
                        error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
                            &req);
                } else {
                        /*
                         * Reject the write if someone has thrown the switch
                         * back into the "disallow" position.
                         */
                        error = EPERM;
                }

                if (error == 0 && buf != NULL &&
                    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
                        if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
                                error = EFAULT;
                        }
                }

                /*
                 * Copy out the transfer request so that the value of
                 * vdx_result_len can be made available, regardless of any
                 * error(s) which may have occurred.
                 */
                if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
                        error = (error != 0) ?
error : EFAULT; 1935 } 1936 1937 if (buf != NULL) { 1938 kmem_free(buf, len); 1939 } 1940 break; 1941 } 1942 1943 case VM_PAUSE: { 1944 error = vm_pause_instance(sc->vmm_vm); 1945 break; 1946 } 1947 case VM_RESUME: { 1948 error = vm_resume_instance(sc->vmm_vm); 1949 break; 1950 } 1951 1952 default: 1953 error = ENOTTY; 1954 break; 1955 } 1956 1957 /* Release exclusion resources */ 1958 switch (lock_type) { 1959 case LOCK_NONE: 1960 break; 1961 case LOCK_VCPU: 1962 vcpu_unlock_one(sc, vcpu); 1963 break; 1964 case LOCK_READ_HOLD: 1965 vmm_read_unlock(sc); 1966 break; 1967 case LOCK_WRITE_HOLD: 1968 vmm_write_unlock(sc); 1969 break; 1970 default: 1971 panic("unexpected lock type"); 1972 break; 1973 } 1974 1975 return (error); 1976 } 1977 1978 static vmm_softc_t * 1979 vmm_lookup(const char *name) 1980 { 1981 list_t *vml = &vmm_list; 1982 vmm_softc_t *sc; 1983 1984 ASSERT(MUTEX_HELD(&vmm_mtx)); 1985 1986 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1987 if (strcmp(sc->vmm_name, name) == 0) { 1988 break; 1989 } 1990 } 1991 1992 return (sc); 1993 } 1994 1995 /* 1996 * Acquire an HMA registration if not already held. 1997 */ 1998 static boolean_t 1999 vmm_hma_acquire(void) 2000 { 2001 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 2002 2003 mutex_enter(&vmmdev_mtx); 2004 2005 if (vmmdev_hma_reg == NULL) { 2006 VERIFY3U(vmmdev_hma_ref, ==, 0); 2007 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 2008 if (vmmdev_hma_reg == NULL) { 2009 cmn_err(CE_WARN, "%s HMA registration failed.", 2010 vmmdev_hvm_name); 2011 mutex_exit(&vmmdev_mtx); 2012 return (B_FALSE); 2013 } 2014 } 2015 2016 vmmdev_hma_ref++; 2017 2018 mutex_exit(&vmmdev_mtx); 2019 2020 return (B_TRUE); 2021 } 2022 2023 /* 2024 * Release the HMA registration if held and there are no remaining VMs. 2025 */ 2026 static void 2027 vmm_hma_release(void) 2028 { 2029 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 2030 2031 mutex_enter(&vmmdev_mtx); 2032 2033 VERIFY3U(vmmdev_hma_ref, !=, 0); 2034 2035 vmmdev_hma_ref--; 2036 2037 if (vmmdev_hma_ref == 0) { 2038 VERIFY(vmmdev_hma_reg != NULL); 2039 hma_unregister(vmmdev_hma_reg); 2040 vmmdev_hma_reg = NULL; 2041 } 2042 mutex_exit(&vmmdev_mtx); 2043 } 2044 2045 static int 2046 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 2047 { 2048 vmm_softc_t *sc = NULL; 2049 minor_t minor; 2050 int error = ENOMEM; 2051 size_t len; 2052 const char *name = req->name; 2053 2054 len = strnlen(name, VM_MAX_NAMELEN); 2055 if (len == 0) { 2056 return (EINVAL); 2057 } 2058 if (len >= VM_MAX_NAMELEN) { 2059 return (ENAMETOOLONG); 2060 } 2061 if (strchr(name, '/') != NULL) { 2062 return (EINVAL); 2063 } 2064 2065 if (!vmm_hma_acquire()) 2066 return (ENXIO); 2067 2068 mutex_enter(&vmm_mtx); 2069 2070 /* Look for duplicate names */ 2071 if (vmm_lookup(name) != NULL) { 2072 mutex_exit(&vmm_mtx); 2073 vmm_hma_release(); 2074 return (EEXIST); 2075 } 2076 2077 /* Allow only one instance per non-global zone. 
         */
        if (!INGLOBALZONE(curproc)) {
                for (sc = list_head(&vmm_list); sc != NULL;
                    sc = list_next(&vmm_list, sc)) {
                        if (sc->vmm_zone == curzone) {
                                mutex_exit(&vmm_mtx);
                                vmm_hma_release();
                                return (EINVAL);
                        }
                }
        }

        minor = id_alloc(vmm_minors);
        if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
                goto fail;
        } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
                ddi_soft_state_free(vmm_statep, minor);
                goto fail;
        } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
            DDI_PSEUDO, 0) != DDI_SUCCESS) {
                goto fail;
        }

        if (vmm_kstat_alloc(sc, minor, cr) != 0) {
                goto fail;
        }

        error = vm_create(req->flags, &sc->vmm_vm);
        if (error == 0) {
                /* Complete VM initialization and report success. */
                (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
                sc->vmm_minor = minor;
                list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
                    offsetof(vmm_devmem_entry_t, vde_node));

                list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
                    offsetof(vmm_hold_t, vmh_node));
                cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

                mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
                list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
                    offsetof(vmm_lease_t, vml_node));
                cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
                rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

                sc->vmm_zone = crgetzone(cr);
                zone_hold(sc->vmm_zone);
                vmm_zsd_add_vm(sc);
                vmm_kstat_init(sc);

                list_insert_tail(&vmm_list, sc);
                mutex_exit(&vmm_mtx);
                return (0);
        }

        vmm_kstat_fini(sc);
        ddi_remove_minor_node(vmmdev_dip, name);
fail:
        id_free(vmm_minors, minor);
        if (sc != NULL) {
                ddi_soft_state_free(vmm_statep, minor);
        }
        mutex_exit(&vmm_mtx);
        vmm_hma_release();

        return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel. For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
 *
 * This includes:
 * - Translating guest-physical addresses to host-virtual pointers
 * - Injecting MSIs
 * - Hooking IO port addresses
 *
 * The vmm_drv interface exists to provide that functionality to its
 * consumers.
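 *
 * A rough consumption sketch (hypothetical, with error handling elided; it is
 * not lifted from any real consumer) pairs a hold on the instance with a
 * lease covering the periods during which VM state is actually accessed:
 *
 *        vmm_hold_t *hold;
 *        vmm_lease_t *lease;
 *
 *        if (vmm_drv_hold(fp, cr, &hold) != 0)
 *                return (ENXIO);
 *        lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *        if (lease == NULL) {
 *                vmm_drv_rele(hold);
 *                return (EBUSY);
 *        }
 *        ...
 *        vmm_drv_rele(hold);
 *
 * Here 'fp'/'cr' refer to the consumer's open instance file, and
 * my_expire_cb()/my_arg are the consumer-supplied callback and argument
 * invoked when the lease must be broken (e.g. ahead of a write-lock
 * acquisition). Any lease still held is expected to be given back through the
 * corresponding lease-release entry point (not shown in this excerpt) before
 * the hold itself is released.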
2160 * (At this time, 'viona' is the only user) 2161 */ 2162 int 2163 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2164 { 2165 vnode_t *vp = fp->f_vnode; 2166 const dev_t dev = vp->v_rdev; 2167 vmm_softc_t *sc; 2168 vmm_hold_t *hold; 2169 int err = 0; 2170 2171 if (vp->v_type != VCHR) { 2172 return (ENXIO); 2173 } 2174 const major_t major = getmajor(dev); 2175 const minor_t minor = getminor(dev); 2176 2177 mutex_enter(&vmmdev_mtx); 2178 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2179 mutex_exit(&vmmdev_mtx); 2180 return (ENOENT); 2181 } 2182 mutex_enter(&vmm_mtx); 2183 mutex_exit(&vmmdev_mtx); 2184 2185 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2186 err = ENOENT; 2187 goto out; 2188 } 2189 /* XXXJOY: check cred permissions against instance */ 2190 2191 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2192 err = EBUSY; 2193 goto out; 2194 } 2195 2196 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2197 hold->vmh_sc = sc; 2198 hold->vmh_release_req = B_FALSE; 2199 2200 list_insert_tail(&sc->vmm_holds, hold); 2201 sc->vmm_flags |= VMM_HELD; 2202 *holdp = hold; 2203 2204 out: 2205 mutex_exit(&vmm_mtx); 2206 return (err); 2207 } 2208 2209 void 2210 vmm_drv_rele(vmm_hold_t *hold) 2211 { 2212 vmm_softc_t *sc; 2213 bool hma_release = false; 2214 2215 ASSERT(hold != NULL); 2216 ASSERT(hold->vmh_sc != NULL); 2217 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2218 2219 mutex_enter(&vmm_mtx); 2220 sc = hold->vmh_sc; 2221 list_remove(&sc->vmm_holds, hold); 2222 kmem_free(hold, sizeof (*hold)); 2223 2224 if (list_is_empty(&sc->vmm_holds)) { 2225 sc->vmm_flags &= ~VMM_HELD; 2226 2227 /* 2228 * Since outstanding holds would prevent instance destruction 2229 * from completing, attempt to finish it now if it was already 2230 * set in motion. 
2231 */ 2232 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2233 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2234 &hma_release)); 2235 } 2236 } 2237 mutex_exit(&vmm_mtx); 2238 2239 if (hma_release) { 2240 vmm_hma_release(); 2241 } 2242 } 2243 2244 boolean_t 2245 vmm_drv_release_reqd(vmm_hold_t *hold) 2246 { 2247 ASSERT(hold != NULL); 2248 2249 return (hold->vmh_release_req); 2250 } 2251 2252 vmm_lease_t * 2253 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2254 { 2255 vmm_softc_t *sc = hold->vmh_sc; 2256 vmm_lease_t *lease; 2257 2258 ASSERT3P(expiref, !=, NULL); 2259 2260 if (hold->vmh_release_req) { 2261 return (NULL); 2262 } 2263 2264 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2265 list_link_init(&lease->vml_node); 2266 lease->vml_expire_func = expiref; 2267 lease->vml_expire_arg = arg; 2268 lease->vml_expired = B_FALSE; 2269 lease->vml_break_deferred = B_FALSE; 2270 lease->vml_hold = hold; 2271 /* cache the VM pointer for one less pointer chase */ 2272 lease->vml_vm = sc->vmm_vm; 2273 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2274 2275 mutex_enter(&sc->vmm_lease_lock); 2276 while (sc->vmm_lease_blocker != 0) { 2277 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2278 } 2279 list_insert_tail(&sc->vmm_lease_list, lease); 2280 vmm_read_lock(sc); 2281 mutex_exit(&sc->vmm_lease_lock); 2282 2283 return (lease); 2284 } 2285 2286 static void 2287 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2288 { 2289 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2290 2291 list_remove(&sc->vmm_lease_list, lease); 2292 vmm_read_unlock(sc); 2293 vmc_destroy(lease->vml_vmclient); 2294 kmem_free(lease, sizeof (*lease)); 2295 } 2296 2297 static void 2298 vmm_lease_block(vmm_softc_t *sc) 2299 { 2300 mutex_enter(&sc->vmm_lease_lock); 2301 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2302 sc->vmm_lease_blocker++; 2303 if (sc->vmm_lease_blocker == 1) { 2304 list_t *list = &sc->vmm_lease_list; 2305 vmm_lease_t *lease = list_head(list); 2306 2307 while (lease != NULL) { 2308 void *arg = lease->vml_expire_arg; 2309 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2310 boolean_t sync_break = B_FALSE; 2311 2312 /* 2313 * Since the lease expiration notification may 2314 * need to take locks which would deadlock with 2315 * vmm_lease_lock, drop it across the call. 2316 * 2317 * We are the only one allowed to manipulate 2318 * vmm_lease_list right now, so it is safe to 2319 * continue iterating through it after 2320 * reacquiring the lock. 2321 */ 2322 lease->vml_expired = B_TRUE; 2323 mutex_exit(&sc->vmm_lease_lock); 2324 sync_break = expiref(arg); 2325 mutex_enter(&sc->vmm_lease_lock); 2326 2327 if (sync_break) { 2328 vmm_lease_t *next; 2329 2330 /* 2331 * These leases which are synchronously broken 2332 * result in vmm_read_unlock() calls from a 2333 * different thread than the corresponding 2334 * vmm_read_lock(). This is acceptable, given 2335 * that the rwlock underpinning the whole 2336 * mechanism tolerates the behavior. This 2337 * flexibility is _only_ afforded to VM read 2338 * lock (RW_READER) holders. 2339 */ 2340 next = list_next(list, lease); 2341 vmm_lease_break_locked(sc, lease); 2342 lease = next; 2343 } else { 2344 lease = list_next(list, lease); 2345 } 2346 } 2347 2348 /* Process leases which were not broken synchronously. */ 2349 while (!list_is_empty(list)) { 2350 /* 2351 * Although the nested loops are quadratic, the number 2352 * of leases is small. 
2353 */ 2354 lease = list_head(list); 2355 while (lease != NULL) { 2356 vmm_lease_t *next = list_next(list, lease); 2357 if (lease->vml_break_deferred) { 2358 vmm_lease_break_locked(sc, lease); 2359 } 2360 lease = next; 2361 } 2362 if (list_is_empty(list)) { 2363 break; 2364 } 2365 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2366 } 2367 /* Wake anyone else waiting for the lease list to be empty */ 2368 cv_broadcast(&sc->vmm_lease_cv); 2369 } else { 2370 list_t *list = &sc->vmm_lease_list; 2371 2372 /* 2373 * Some other thread beat us to the duty of lease cleanup. 2374 * Wait until that is complete. 2375 */ 2376 while (!list_is_empty(list)) { 2377 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2378 } 2379 } 2380 mutex_exit(&sc->vmm_lease_lock); 2381 } 2382 2383 static void 2384 vmm_lease_unblock(vmm_softc_t *sc) 2385 { 2386 mutex_enter(&sc->vmm_lease_lock); 2387 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2388 sc->vmm_lease_blocker--; 2389 if (sc->vmm_lease_blocker == 0) { 2390 cv_broadcast(&sc->vmm_lease_cv); 2391 } 2392 mutex_exit(&sc->vmm_lease_lock); 2393 } 2394 2395 void 2396 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2397 { 2398 vmm_softc_t *sc = hold->vmh_sc; 2399 2400 VERIFY3P(hold, ==, lease->vml_hold); 2401 VERIFY(!lease->vml_break_deferred); 2402 2403 mutex_enter(&sc->vmm_lease_lock); 2404 if (sc->vmm_lease_blocker == 0) { 2405 vmm_lease_break_locked(sc, lease); 2406 } else { 2407 /* 2408 * Defer the lease-breaking to whichever thread is currently 2409 * cleaning up all leases as part of a vmm_lease_block() call. 2410 */ 2411 lease->vml_break_deferred = B_TRUE; 2412 cv_broadcast(&sc->vmm_lease_cv); 2413 } 2414 mutex_exit(&sc->vmm_lease_lock); 2415 } 2416 2417 boolean_t 2418 vmm_drv_lease_expired(vmm_lease_t *lease) 2419 { 2420 return (lease->vml_expired); 2421 } 2422 2423 vmm_page_t * 2424 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2425 { 2426 ASSERT(lease != NULL); 2427 ASSERT0(gpa & PAGEOFFSET); 2428 2429 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2430 } 2431 2432 2433 /* Ensure that flags mirrored by vmm_drv interface properly match up */ 2434 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY); 2435 2436 vmm_page_t * 2437 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags) 2438 { 2439 ASSERT(lease != NULL); 2440 ASSERT0(gpa & PAGEOFFSET); 2441 2442 vmm_page_t *page = 2443 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags); 2444 return (page); 2445 } 2446 2447 void 2448 vmm_drv_page_release(vmm_page_t *vmmp) 2449 { 2450 (void) vmp_release((vm_page_t *)vmmp); 2451 } 2452 2453 void 2454 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2455 { 2456 (void) vmp_release_chain((vm_page_t *)vmmp); 2457 } 2458 2459 const void * 2460 vmm_drv_page_readable(const vmm_page_t *vmmp) 2461 { 2462 return (vmp_get_readable((const vm_page_t *)vmmp)); 2463 } 2464 2465 void * 2466 vmm_drv_page_writable(const vmm_page_t *vmmp) 2467 { 2468 return (vmp_get_writable((const vm_page_t *)vmmp)); 2469 } 2470 2471 void 2472 vmm_drv_page_mark_dirty(vmm_page_t *vmmp) 2473 { 2474 return (vmp_mark_dirty((vm_page_t *)vmmp)); 2475 } 2476 2477 void 2478 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2479 { 2480 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2481 } 2482 2483 vmm_page_t * 2484 vmm_drv_page_next(const vmm_page_t *vmmp) 2485 { 2486 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2487 } 2488 2489 int 2490 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2491 { 2492 ASSERT(lease 
!= NULL); 2493 2494 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2495 } 2496 2497 int 2498 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2499 void *arg, void **cookie) 2500 { 2501 vmm_softc_t *sc; 2502 int err; 2503 2504 ASSERT(hold != NULL); 2505 ASSERT(cookie != NULL); 2506 2507 sc = hold->vmh_sc; 2508 mutex_enter(&vmm_mtx); 2509 /* Confirm that hook installation is not blocked */ 2510 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2511 mutex_exit(&vmm_mtx); 2512 return (EBUSY); 2513 } 2514 /* 2515 * Optimistically record an installed hook which will prevent a block 2516 * from being asserted while the mutex is dropped. 2517 */ 2518 hold->vmh_ioport_hook_cnt++; 2519 mutex_exit(&vmm_mtx); 2520 2521 vmm_write_lock(sc); 2522 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2523 arg, cookie); 2524 vmm_write_unlock(sc); 2525 2526 if (err != 0) { 2527 mutex_enter(&vmm_mtx); 2528 /* Walk back optimism about the hook installation */ 2529 hold->vmh_ioport_hook_cnt--; 2530 mutex_exit(&vmm_mtx); 2531 } 2532 return (err); 2533 } 2534 2535 void 2536 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2537 { 2538 vmm_softc_t *sc; 2539 2540 ASSERT(hold != NULL); 2541 ASSERT(cookie != NULL); 2542 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2543 2544 sc = hold->vmh_sc; 2545 vmm_write_lock(sc); 2546 vm_ioport_unhook(sc->vmm_vm, cookie); 2547 vmm_write_unlock(sc); 2548 2549 mutex_enter(&vmm_mtx); 2550 hold->vmh_ioport_hook_cnt--; 2551 mutex_exit(&vmm_mtx); 2552 } 2553 2554 static void 2555 vmm_drv_purge(vmm_softc_t *sc) 2556 { 2557 ASSERT(MUTEX_HELD(&vmm_mtx)); 2558 2559 if ((sc->vmm_flags & VMM_HELD) != 0) { 2560 vmm_hold_t *hold; 2561 2562 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2563 hold = list_next(&sc->vmm_holds, hold)) { 2564 hold->vmh_release_req = B_TRUE; 2565 } 2566 2567 /* 2568 * Require that all leases on the instance be broken, now that 2569 * all associated holds have been marked as needing release. 2570 * 2571 * Dropping vmm_mtx is not strictly necessary, but if any of the 2572 * lessees are slow to respond, it would be nice to leave it 2573 * available for other parties. 2574 */ 2575 mutex_exit(&vmm_mtx); 2576 vmm_lease_block(sc); 2577 vmm_lease_unblock(sc); 2578 mutex_enter(&vmm_mtx); 2579 } 2580 } 2581 2582 static int 2583 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2584 { 2585 int err = 0; 2586 2587 mutex_enter(&vmm_mtx); 2588 if (!enable_block) { 2589 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2590 2591 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2592 goto done; 2593 } 2594 2595 /* If any holds have hooks installed, the block is a failure */ 2596 if (!list_is_empty(&sc->vmm_holds)) { 2597 vmm_hold_t *hold; 2598 2599 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2600 hold = list_next(&sc->vmm_holds, hold)) { 2601 if (hold->vmh_ioport_hook_cnt != 0) { 2602 err = EBUSY; 2603 goto done; 2604 } 2605 } 2606 } 2607 sc->vmm_flags |= VMM_BLOCK_HOOK; 2608 2609 done: 2610 mutex_exit(&vmm_mtx); 2611 return (err); 2612 } 2613 2614 2615 static void 2616 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts) 2617 { 2618 ASSERT(MUTEX_HELD(&vmm_mtx)); 2619 ASSERT0(sc->vmm_flags & VMM_DESTROY); 2620 2621 sc->vmm_flags |= VMM_DESTROY; 2622 2623 /* 2624 * Lock and unlock all of the vCPUs to ensure that they are kicked out 2625 * of guest context, being unable to return now that the instance is 2626 * marked for destruction. 
2627 */ 2628 const int maxcpus = vm_get_maxcpus(sc->vmm_vm); 2629 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 2630 vcpu_lock_one(sc, vcpu); 2631 vcpu_unlock_one(sc, vcpu); 2632 } 2633 2634 vmmdev_devmem_purge(sc); 2635 if ((opts & VDO_NO_CLEAN_ZSD) == 0) { 2636 /* 2637 * The ZSD should be cleaned up now, unless destruction of the 2638 * instance was initated by destruction of the containing zone, 2639 * in which case the ZSD has already been removed. 2640 */ 2641 vmm_zsd_rem_vm(sc); 2642 } 2643 zone_rele(sc->vmm_zone); 2644 2645 vmm_drv_purge(sc); 2646 } 2647 2648 static bool 2649 vmm_destroy_ready(vmm_softc_t *sc) 2650 { 2651 ASSERT(MUTEX_HELD(&vmm_mtx)); 2652 2653 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) { 2654 VERIFY(list_is_empty(&sc->vmm_holds)); 2655 return (true); 2656 } 2657 2658 return (false); 2659 } 2660 2661 static void 2662 vmm_destroy_finish(vmm_softc_t *sc) 2663 { 2664 ASSERT(MUTEX_HELD(&vmm_mtx)); 2665 ASSERT(vmm_destroy_ready(sc)); 2666 2667 list_remove(&vmm_list, sc); 2668 vmm_kstat_fini(sc); 2669 vm_destroy(sc->vmm_vm); 2670 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2671 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE); 2672 2673 const minor_t minor = sc->vmm_minor; 2674 ddi_soft_state_free(vmm_statep, minor); 2675 id_free(vmm_minors, minor); 2676 } 2677 2678 /* 2679 * Initiate or attempt to finish destruction of a VMM instance. 2680 * 2681 * This is called from several contexts: 2682 * - An explicit destroy ioctl is made 2683 * - A vmm_drv consumer releases its hold (being the last on the instance) 2684 * - The vmm device is closed, and auto-destruct is enabled 2685 */ 2686 static int 2687 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts, 2688 bool *hma_release) 2689 { 2690 ASSERT(MUTEX_HELD(&vmm_mtx)); 2691 2692 *hma_release = false; 2693 2694 /* 2695 * When instance destruction begins, it is so marked such that any 2696 * further requests to operate the instance will fail. 2697 */ 2698 if ((sc->vmm_flags & VMM_DESTROY) == 0) { 2699 vmm_destroy_begin(sc, opts); 2700 } 2701 2702 if (vmm_destroy_ready(sc)) { 2703 2704 /* 2705 * Notify anyone waiting for the destruction to finish. They 2706 * must be clear before we can safely tear down the softc. 2707 */ 2708 if (sc->vmm_destroy_waiters != 0) { 2709 cv_broadcast(&sc->vmm_cv); 2710 while (sc->vmm_destroy_waiters != 0) { 2711 cv_wait(&sc->vmm_cv, &vmm_mtx); 2712 } 2713 } 2714 2715 /* 2716 * Finish destruction of instance. After this point, the softc 2717 * is freed and cannot be accessed again. 2718 * 2719 * With destruction complete, the HMA hold can be released 2720 */ 2721 vmm_destroy_finish(sc); 2722 *hma_release = true; 2723 return (0); 2724 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) { 2725 int err = 0; 2726 2727 sc->vmm_destroy_waiters++; 2728 while (!vmm_destroy_ready(sc) && err == 0) { 2729 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2730 err = EINTR; 2731 } 2732 } 2733 sc->vmm_destroy_waiters--; 2734 2735 if (sc->vmm_destroy_waiters == 0) { 2736 /* 2737 * If we were the last waiter, it could be that VM 2738 * destruction is waiting on _us_ to proceed with the 2739 * final clean-up. 2740 */ 2741 cv_signal(&sc->vmm_cv); 2742 } 2743 return (err); 2744 } else { 2745 /* 2746 * Since the instance is not ready for destruction, and the 2747 * caller did not ask to wait, consider it a success for now. 
2748 */ 2749 return (0); 2750 } 2751 } 2752 2753 void 2754 vmm_zone_vm_destroy(vmm_softc_t *sc) 2755 { 2756 bool hma_release = false; 2757 int err; 2758 2759 mutex_enter(&vmm_mtx); 2760 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2761 mutex_exit(&vmm_mtx); 2762 2763 VERIFY0(err); 2764 2765 if (hma_release) { 2766 vmm_hma_release(); 2767 } 2768 } 2769 2770 static int 2771 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2772 { 2773 vmm_softc_t *sc; 2774 bool hma_release = false; 2775 int err; 2776 2777 if (crgetuid(cr) != 0) { 2778 return (EPERM); 2779 } 2780 2781 mutex_enter(&vmm_mtx); 2782 sc = vmm_lookup(req->name); 2783 if (sc == NULL) { 2784 mutex_exit(&vmm_mtx); 2785 return (ENOENT); 2786 } 2787 /* 2788 * We don't check this in vmm_lookup() since that function is also used 2789 * for validation during create and currently vmm names must be unique. 2790 */ 2791 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2792 mutex_exit(&vmm_mtx); 2793 return (EPERM); 2794 } 2795 2796 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2797 mutex_exit(&vmm_mtx); 2798 2799 if (hma_release) { 2800 vmm_hma_release(); 2801 } 2802 2803 return (err); 2804 } 2805 2806 #define VCPU_NAME_BUFLEN 32 2807 2808 static int 2809 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2810 { 2811 zoneid_t zid = crgetzoneid(cr); 2812 int instance = minor; 2813 kstat_t *ksp; 2814 2815 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2816 2817 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2818 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2819 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2820 2821 if (ksp == NULL) { 2822 return (-1); 2823 } 2824 sc->vmm_kstat_vm = ksp; 2825 2826 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2827 char namebuf[VCPU_NAME_BUFLEN]; 2828 2829 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2830 2831 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2832 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2833 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2834 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2835 0, zid); 2836 if (ksp == NULL) { 2837 goto fail; 2838 } 2839 2840 sc->vmm_kstat_vcpu[i] = ksp; 2841 } 2842 2843 /* 2844 * If this instance is associated with a non-global zone, make its 2845 * kstats visible from the GZ. 
2846 */ 2847 if (zid != GLOBAL_ZONEID) { 2848 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2849 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2850 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2851 } 2852 } 2853 2854 return (0); 2855 2856 fail: 2857 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2858 if (sc->vmm_kstat_vcpu[i] != NULL) { 2859 kstat_delete(sc->vmm_kstat_vcpu[i]); 2860 sc->vmm_kstat_vcpu[i] = NULL; 2861 } else { 2862 break; 2863 } 2864 } 2865 kstat_delete(sc->vmm_kstat_vm); 2866 sc->vmm_kstat_vm = NULL; 2867 return (-1); 2868 } 2869 2870 static void 2871 vmm_kstat_init(vmm_softc_t *sc) 2872 { 2873 kstat_t *ksp; 2874 2875 ASSERT3P(sc->vmm_vm, !=, NULL); 2876 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2877 2878 ksp = sc->vmm_kstat_vm; 2879 vmm_kstats_t *vk = ksp->ks_data; 2880 ksp->ks_private = sc->vmm_vm; 2881 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2882 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2883 2884 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2885 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2886 2887 ksp = sc->vmm_kstat_vcpu[i]; 2888 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2889 2890 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2891 vvk->vvk_vcpu.value.ui32 = i; 2892 kstat_named_init(&vvk->vvk_time_init, "time_init", 2893 KSTAT_DATA_UINT64); 2894 kstat_named_init(&vvk->vvk_time_run, "time_run", 2895 KSTAT_DATA_UINT64); 2896 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2897 KSTAT_DATA_UINT64); 2898 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2899 KSTAT_DATA_UINT64); 2900 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2901 KSTAT_DATA_UINT64); 2902 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2903 KSTAT_DATA_UINT64); 2904 ksp->ks_private = sc->vmm_vm; 2905 ksp->ks_update = vmm_kstat_update_vcpu; 2906 } 2907 2908 kstat_install(sc->vmm_kstat_vm); 2909 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2910 kstat_install(sc->vmm_kstat_vcpu[i]); 2911 } 2912 } 2913 2914 static void 2915 vmm_kstat_fini(vmm_softc_t *sc) 2916 { 2917 ASSERT(sc->vmm_kstat_vm != NULL); 2918 2919 kstat_delete(sc->vmm_kstat_vm); 2920 sc->vmm_kstat_vm = NULL; 2921 2922 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2923 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2924 2925 kstat_delete(sc->vmm_kstat_vcpu[i]); 2926 sc->vmm_kstat_vcpu[i] = NULL; 2927 } 2928 } 2929 2930 static int 2931 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2932 { 2933 minor_t minor; 2934 vmm_softc_t *sc; 2935 2936 /* 2937 * Forbid running bhyve in a 32-bit process until it has been tested and 2938 * verified to be safe. 2939 */ 2940 if (curproc->p_model != DATAMODEL_LP64) { 2941 return (EFBIG); 2942 } 2943 2944 minor = getminor(*devp); 2945 if (minor == VMM_CTL_MINOR) { 2946 /* 2947 * Master control device must be opened exclusively. 
2948 */ 2949 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2950 return (EINVAL); 2951 } 2952 2953 return (0); 2954 } 2955 2956 mutex_enter(&vmm_mtx); 2957 sc = ddi_get_soft_state(vmm_statep, minor); 2958 if (sc == NULL) { 2959 mutex_exit(&vmm_mtx); 2960 return (ENXIO); 2961 } 2962 2963 sc->vmm_flags |= VMM_IS_OPEN; 2964 mutex_exit(&vmm_mtx); 2965 2966 return (0); 2967 } 2968 2969 static int 2970 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2971 { 2972 const minor_t minor = getminor(dev); 2973 vmm_softc_t *sc; 2974 bool hma_release = false; 2975 2976 if (minor == VMM_CTL_MINOR) { 2977 return (0); 2978 } 2979 2980 mutex_enter(&vmm_mtx); 2981 sc = ddi_get_soft_state(vmm_statep, minor); 2982 if (sc == NULL) { 2983 mutex_exit(&vmm_mtx); 2984 return (ENXIO); 2985 } 2986 2987 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0); 2988 sc->vmm_flags &= ~VMM_IS_OPEN; 2989 2990 /* 2991 * If instance was marked for auto-destruction begin that now. Instance 2992 * destruction may have been initated already, so try to make progress 2993 * in that case, since closure of the device is one of its requirements. 2994 */ 2995 if ((sc->vmm_flags & VMM_DESTROY) != 0 || 2996 (sc->vmm_flags & VMM_AUTODESTROY) != 0) { 2997 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 2998 } 2999 mutex_exit(&vmm_mtx); 3000 3001 if (hma_release) { 3002 vmm_hma_release(); 3003 } 3004 3005 return (0); 3006 } 3007 3008 static int 3009 vmm_is_supported(intptr_t arg) 3010 { 3011 int r; 3012 const char *msg; 3013 3014 if (vmm_is_intel()) { 3015 r = vmx_x86_supported(&msg); 3016 } else if (vmm_is_svm()) { 3017 /* 3018 * HMA already ensured that the features necessary for SVM 3019 * operation were present and online during vmm_attach(). 3020 */ 3021 r = 0; 3022 } else { 3023 r = ENXIO; 3024 msg = "Unsupported CPU vendor"; 3025 } 3026 3027 if (r != 0 && arg != (intptr_t)NULL) { 3028 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 3029 return (EFAULT); 3030 } 3031 return (r); 3032 } 3033 3034 static int 3035 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 3036 { 3037 void *argp = (void *)arg; 3038 3039 switch (cmd) { 3040 case VMM_CREATE_VM: { 3041 struct vm_create_req req; 3042 3043 if ((md & FWRITE) == 0) { 3044 return (EPERM); 3045 } 3046 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 3047 return (EFAULT); 3048 } 3049 return (vmmdev_do_vm_create(&req, cr)); 3050 } 3051 case VMM_DESTROY_VM: { 3052 struct vm_destroy_req req; 3053 3054 if ((md & FWRITE) == 0) { 3055 return (EPERM); 3056 } 3057 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 3058 return (EFAULT); 3059 } 3060 return (vmmdev_do_vm_destroy(&req, cr)); 3061 } 3062 case VMM_VM_SUPPORTED: 3063 return (vmm_is_supported(arg)); 3064 case VMM_CHECK_IOMMU: 3065 if (!vmm_check_iommu()) { 3066 return (ENXIO); 3067 } 3068 return (0); 3069 case VMM_RESV_QUERY: 3070 case VMM_RESV_SET_TARGET: 3071 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 3072 default: 3073 break; 3074 } 3075 /* No other actions are legal on ctl device */ 3076 return (ENOTTY); 3077 } 3078 3079 static int 3080 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 3081 int *rvalp) 3082 { 3083 vmm_softc_t *sc; 3084 minor_t minor; 3085 3086 /* 3087 * Forbid running bhyve in a 32-bit process until it has been tested and 3088 * verified to be safe. 
3089 */ 3090 if (curproc->p_model != DATAMODEL_LP64) { 3091 return (EFBIG); 3092 } 3093 3094 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 3095 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 3096 return (ENOTSUP); 3097 } 3098 3099 /* 3100 * Regardless of minor (vmmctl or instance), we respond to queries of 3101 * the interface version. 3102 */ 3103 if (cmd == VMM_INTERFACE_VERSION) { 3104 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 3105 return (0); 3106 } 3107 3108 minor = getminor(dev); 3109 3110 if (minor == VMM_CTL_MINOR) { 3111 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 3112 } 3113 3114 sc = ddi_get_soft_state(vmm_statep, minor); 3115 ASSERT(sc != NULL); 3116 3117 /* 3118 * Turn away any ioctls against an instance when it is being destroyed. 3119 * (Except for the ioctl inquiring about that destroy-in-progress.) 3120 */ 3121 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3122 if (cmd == VM_DESTROY_PENDING) { 3123 *rvalp = 1; 3124 return (0); 3125 } 3126 return (ENXIO); 3127 } 3128 3129 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3130 } 3131 3132 static int 3133 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3134 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3135 { 3136 vmm_softc_t *sc; 3137 const minor_t minor = getminor(dev); 3138 int err; 3139 3140 if (minor == VMM_CTL_MINOR) { 3141 return (ENODEV); 3142 } 3143 if (off < 0 || (off + len) <= 0) { 3144 return (EINVAL); 3145 } 3146 if ((prot & PROT_USER) == 0) { 3147 return (EACCES); 3148 } 3149 3150 sc = ddi_get_soft_state(vmm_statep, minor); 3151 ASSERT(sc); 3152 3153 if (sc->vmm_flags & VMM_DESTROY) 3154 return (ENXIO); 3155 3156 /* Grab read lock on the VM to prevent any changes to the memory map */ 3157 vmm_read_lock(sc); 3158 3159 if (off >= VM_DEVMEM_START) { 3160 int segid; 3161 off_t segoff; 3162 3163 /* Mapping a devmem "device" */ 3164 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3165 err = ENODEV; 3166 } else { 3167 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3168 addrp, prot, maxprot, flags); 3169 } 3170 } else { 3171 /* Mapping a part of the guest physical space */ 3172 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3173 maxprot, flags); 3174 } 3175 3176 vmm_read_unlock(sc); 3177 return (err); 3178 } 3179 3180 static sdev_plugin_validate_t 3181 vmm_sdev_validate(sdev_ctx_t ctx) 3182 { 3183 const char *name = sdev_ctx_name(ctx); 3184 vmm_softc_t *sc; 3185 sdev_plugin_validate_t ret; 3186 minor_t minor; 3187 3188 if (sdev_ctx_vtype(ctx) != VCHR) 3189 return (SDEV_VTOR_INVALID); 3190 3191 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3192 3193 mutex_enter(&vmm_mtx); 3194 if ((sc = vmm_lookup(name)) == NULL) 3195 ret = SDEV_VTOR_INVALID; 3196 else if (sc->vmm_minor != minor) 3197 ret = SDEV_VTOR_STALE; 3198 else 3199 ret = SDEV_VTOR_VALID; 3200 mutex_exit(&vmm_mtx); 3201 3202 return (ret); 3203 } 3204 3205 static int 3206 vmm_sdev_filldir(sdev_ctx_t ctx) 3207 { 3208 vmm_softc_t *sc; 3209 int ret; 3210 3211 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3212 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3213 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3214 return (EINVAL); 3215 } 3216 3217 mutex_enter(&vmm_mtx); 3218 ASSERT(vmmdev_dip != NULL); 3219 for (sc = list_head(&vmm_list); sc != NULL; 3220 sc = list_next(&vmm_list, sc)) { 3221 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3222 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3223 S_IFCHR | 0600, 3224 
makedevice(ddi_driver_major(vmmdev_dip), 3225 sc->vmm_minor)); 3226 } else { 3227 continue; 3228 } 3229 if (ret != 0 && ret != EEXIST) 3230 goto out; 3231 } 3232 3233 ret = 0; 3234 3235 out: 3236 mutex_exit(&vmm_mtx); 3237 return (ret); 3238 } 3239 3240 /* ARGSUSED */ 3241 static void 3242 vmm_sdev_inactive(sdev_ctx_t ctx) 3243 { 3244 } 3245 3246 static sdev_plugin_ops_t vmm_sdev_ops = { 3247 .spo_version = SDEV_PLUGIN_VERSION, 3248 .spo_flags = SDEV_PLUGIN_SUBDIR, 3249 .spo_validate = vmm_sdev_validate, 3250 .spo_filldir = vmm_sdev_filldir, 3251 .spo_inactive = vmm_sdev_inactive 3252 }; 3253 3254 /* ARGSUSED */ 3255 static int 3256 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3257 { 3258 int error; 3259 3260 switch (cmd) { 3261 case DDI_INFO_DEVT2DEVINFO: 3262 *result = (void *)vmmdev_dip; 3263 error = DDI_SUCCESS; 3264 break; 3265 case DDI_INFO_DEVT2INSTANCE: 3266 *result = (void *)0; 3267 error = DDI_SUCCESS; 3268 break; 3269 default: 3270 error = DDI_FAILURE; 3271 break; 3272 } 3273 return (error); 3274 } 3275 3276 static int 3277 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3278 { 3279 sdev_plugin_hdl_t sph; 3280 hma_reg_t *reg = NULL; 3281 boolean_t vmm_loaded = B_FALSE; 3282 3283 if (cmd != DDI_ATTACH) { 3284 return (DDI_FAILURE); 3285 } 3286 3287 mutex_enter(&vmmdev_mtx); 3288 /* Ensure we are not already attached. */ 3289 if (vmmdev_dip != NULL) { 3290 mutex_exit(&vmmdev_mtx); 3291 return (DDI_FAILURE); 3292 } 3293 3294 vmm_sol_glue_init(); 3295 3296 /* 3297 * Perform temporary HMA registration to determine if the system 3298 * is capable. 3299 */ 3300 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3301 goto fail; 3302 } else if (vmm_mod_load() != 0) { 3303 goto fail; 3304 } 3305 vmm_loaded = B_TRUE; 3306 hma_unregister(reg); 3307 reg = NULL; 3308 3309 /* Create control node. Other nodes will be created on demand. */ 3310 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3311 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3312 goto fail; 3313 } 3314 3315 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3316 if (sph == (sdev_plugin_hdl_t)NULL) { 3317 ddi_remove_minor_node(dip, NULL); 3318 goto fail; 3319 } 3320 3321 ddi_report_dev(dip); 3322 vmmdev_sdev_hdl = sph; 3323 vmmdev_dip = dip; 3324 mutex_exit(&vmmdev_mtx); 3325 return (DDI_SUCCESS); 3326 3327 fail: 3328 if (vmm_loaded) { 3329 VERIFY0(vmm_mod_unload()); 3330 } 3331 if (reg != NULL) { 3332 hma_unregister(reg); 3333 } 3334 vmm_sol_glue_cleanup(); 3335 mutex_exit(&vmmdev_mtx); 3336 return (DDI_FAILURE); 3337 } 3338 3339 static int 3340 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3341 { 3342 if (cmd != DDI_DETACH) { 3343 return (DDI_FAILURE); 3344 } 3345 3346 /* 3347 * Ensure that all resources have been cleaned up. 3348 * 3349 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3350 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3351 * devinfo locked as iommu_cleanup() tries to recursively lock each 3352 * devinfo, including our own, while holding vmmdev_mtx. 
3353 */ 3354 if (mutex_tryenter(&vmmdev_mtx) == 0) 3355 return (DDI_FAILURE); 3356 3357 mutex_enter(&vmm_mtx); 3358 if (!list_is_empty(&vmm_list)) { 3359 mutex_exit(&vmm_mtx); 3360 mutex_exit(&vmmdev_mtx); 3361 return (DDI_FAILURE); 3362 } 3363 mutex_exit(&vmm_mtx); 3364 3365 if (!vmmr_is_empty()) { 3366 mutex_exit(&vmmdev_mtx); 3367 return (DDI_FAILURE); 3368 } 3369 3370 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3371 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3372 mutex_exit(&vmmdev_mtx); 3373 return (DDI_FAILURE); 3374 } 3375 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3376 3377 /* Remove the control node. */ 3378 ddi_remove_minor_node(dip, "ctl"); 3379 vmmdev_dip = NULL; 3380 3381 VERIFY0(vmm_mod_unload()); 3382 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3383 vmm_sol_glue_cleanup(); 3384 3385 mutex_exit(&vmmdev_mtx); 3386 3387 return (DDI_SUCCESS); 3388 } 3389 3390 static struct cb_ops vmm_cb_ops = { 3391 vmm_open, 3392 vmm_close, 3393 nodev, /* strategy */ 3394 nodev, /* print */ 3395 nodev, /* dump */ 3396 nodev, /* read */ 3397 nodev, /* write */ 3398 vmm_ioctl, 3399 nodev, /* devmap */ 3400 nodev, /* mmap */ 3401 vmm_segmap, 3402 nochpoll, /* poll */ 3403 ddi_prop_op, 3404 NULL, 3405 D_NEW | D_MP | D_DEVMAP 3406 }; 3407 3408 static struct dev_ops vmm_ops = { 3409 DEVO_REV, 3410 0, 3411 vmm_info, 3412 nulldev, /* identify */ 3413 nulldev, /* probe */ 3414 vmm_attach, 3415 vmm_detach, 3416 nodev, /* reset */ 3417 &vmm_cb_ops, 3418 (struct bus_ops *)NULL 3419 }; 3420 3421 static struct modldrv modldrv = { 3422 &mod_driverops, 3423 "bhyve vmm", 3424 &vmm_ops 3425 }; 3426 3427 static struct modlinkage modlinkage = { 3428 MODREV_1, 3429 &modldrv, 3430 NULL 3431 }; 3432 3433 int 3434 _init(void) 3435 { 3436 int error; 3437 3438 sysinit(); 3439 3440 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3441 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3442 list_create(&vmm_list, sizeof (vmm_softc_t), 3443 offsetof(vmm_softc_t, vmm_node)); 3444 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3445 3446 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3447 if (error) { 3448 return (error); 3449 } 3450 3451 error = vmmr_init(); 3452 if (error) { 3453 ddi_soft_state_fini(&vmm_statep); 3454 return (error); 3455 } 3456 3457 vmm_zsd_init(); 3458 3459 error = mod_install(&modlinkage); 3460 if (error) { 3461 ddi_soft_state_fini(&vmm_statep); 3462 vmm_zsd_fini(); 3463 vmmr_fini(); 3464 } 3465 3466 return (error); 3467 } 3468 3469 int 3470 _fini(void) 3471 { 3472 int error; 3473 3474 error = mod_remove(&modlinkage); 3475 if (error) { 3476 return (error); 3477 } 3478 3479 vmm_zsd_fini(); 3480 vmmr_fini(); 3481 3482 ddi_soft_state_fini(&vmm_statep); 3483 3484 return (0); 3485 } 3486 3487 int 3488 _info(struct modinfo *modinfop) 3489 { 3490 return (mod_info(&modlinkage, modinfop)); 3491 } 3492