1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2022 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire 68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to 69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. 70 */ 71 72 static kmutex_t vmmdev_mtx; 73 static dev_info_t *vmmdev_dip; 74 static hma_reg_t *vmmdev_hma_reg; 75 static uint_t vmmdev_hma_ref; 76 static sdev_plugin_hdl_t vmmdev_sdev_hdl; 77 78 static kmutex_t vmm_mtx; 79 static list_t vmm_list; 80 static list_t vmm_destroy_list; 81 static id_space_t *vmm_minors; 82 static void *vmm_statep; 83 84 /* temporary safety switch */ 85 int vmm_allow_state_writes; 86 87 static const char *vmmdev_hvm_name = "bhyve"; 88 89 /* For sdev plugin (/dev) */ 90 #define VMM_SDEV_ROOT "/dev/vmm" 91 92 /* From uts/intel/io/vmm/intel/vmx.c */ 93 extern int vmx_x86_supported(const char **); 94 95 /* Holds and hooks from drivers external to vmm */ 96 struct vmm_hold { 97 list_node_t vmh_node; 98 vmm_softc_t *vmh_sc; 99 boolean_t vmh_release_req; 100 uint_t vmh_ioport_hook_cnt; 101 }; 102 103 struct vmm_lease { 104 list_node_t vml_node; 105 struct vm *vml_vm; 106 vm_client_t *vml_vmclient; 107 boolean_t vml_expired; 108 boolean_t vml_break_deferred; 109 boolean_t (*vml_expire_func)(void *); 110 void *vml_expire_arg; 111 struct vmm_hold *vml_hold; 112 }; 113 114 /* Options for vmm_destroy_locked */ 115 typedef enum vmm_destroy_opts { 116 VDO_DEFAULT = 0, 117 /* 118 * Request that zone-specific-data associated with this VM not be 119 * cleaned up as part of the destroy. 
Skipping ZSD clean-up is 120 * necessary when VM is being destroyed as part of zone destruction, 121 * when said ZSD is already being cleaned up. 122 */ 123 VDO_NO_CLEAN_ZSD = (1 << 0), 124 /* 125 * Skip any attempt to wait for vmm_drv consumers when attempting to 126 * purge them from the instance. When performing an auto-destruct, it 127 * is not desirable to wait, since said consumer might exist in a 128 * "higher" file descriptor which has not yet been closed. 129 */ 130 VDO_NO_PURGE_WAIT = (1 << 1), 131 } vmm_destroy_opts_t; 132 133 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, boolean_t *); 134 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 135 static void vmm_lease_block(vmm_softc_t *); 136 static void vmm_lease_unblock(vmm_softc_t *); 137 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 138 static void vmm_kstat_init(vmm_softc_t *); 139 static void vmm_kstat_fini(vmm_softc_t *); 140 141 /* 142 * The 'devmem' hack: 143 * 144 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 145 * in the vm which appear with their own name related to the vm under /dev. 146 * Since this would be a hassle from an sdev perspective and would require a 147 * new cdev interface (or complicate the existing one), we choose to implement 148 * this in a different manner. Direct access to the underlying vm memory 149 * segments is exposed by placing them in a range of offsets beyond the normal 150 * guest memory space. Userspace can query the appropriate offset to mmap() 151 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 152 */ 153 154 static vmm_devmem_entry_t * 155 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 156 { 157 vmm_devmem_entry_t *ent = NULL; 158 list_t *dl = &sc->vmm_devmem_list; 159 160 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 161 if (ent->vde_segid == segid) { 162 return (ent); 163 } 164 } 165 return (NULL); 166 } 167 168 static int 169 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 170 { 171 int error; 172 bool sysmem; 173 174 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 175 NULL); 176 if (error || mseg->len == 0) 177 return (error); 178 179 if (!sysmem) { 180 vmm_devmem_entry_t *de; 181 182 de = vmmdev_devmem_find(sc, mseg->segid); 183 if (de != NULL) { 184 (void) strlcpy(mseg->name, de->vde_name, 185 sizeof (mseg->name)); 186 } 187 } else { 188 bzero(mseg->name, sizeof (mseg->name)); 189 } 190 191 return (error); 192 } 193 194 static int 195 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 196 { 197 off_t map_offset; 198 vmm_devmem_entry_t *entry; 199 200 if (list_is_empty(&sc->vmm_devmem_list)) { 201 map_offset = VM_DEVMEM_START; 202 } else { 203 entry = list_tail(&sc->vmm_devmem_list); 204 map_offset = entry->vde_off + entry->vde_len; 205 if (map_offset < entry->vde_off) { 206 /* Do not tolerate overflow */ 207 return (ERANGE); 208 } 209 /* 210 * XXXJOY: We could choose to search the list for duplicate 211 * names and toss an error. Since we're using the offset 212 * method for now, it does not make much of a difference. 
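		 *
		 * To illustrate the offset scheme with hypothetical numbers:
		 * if a VM has two devmem segments of 1 MiB and 4 KiB, created
		 * in that order, the first is exposed at VM_DEVMEM_START and
		 * the second at VM_DEVMEM_START + 1 MiB.  A userspace consumer
		 * would look up the offset for a segment and map it roughly
		 * like so (illustrative sketch only; 'vmfd', 'segid', and
		 * 'seg_len' are placeholders and error handling is elided):
		 *
		 *	struct vm_devmem_offset vdo = { .segid = segid };
		 *
		 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
		 *		void *p = mmap(NULL, seg_len,
		 *		    PROT_READ | PROT_WRITE, MAP_SHARED,
		 *		    vmfd, vdo.offset);
		 *	}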
213 */ 214 } 215 216 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 217 entry->vde_segid = mseg->segid; 218 entry->vde_len = mseg->len; 219 entry->vde_off = map_offset; 220 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 221 list_insert_tail(&sc->vmm_devmem_list, entry); 222 223 return (0); 224 } 225 226 static boolean_t 227 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 228 off_t *map_offp) 229 { 230 list_t *dl = &sc->vmm_devmem_list; 231 vmm_devmem_entry_t *de = NULL; 232 const off_t map_end = off + len; 233 234 VERIFY(off >= VM_DEVMEM_START); 235 236 if (map_end < off) { 237 /* No match on overflow */ 238 return (B_FALSE); 239 } 240 241 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 242 const off_t item_end = de->vde_off + de->vde_len; 243 244 if (de->vde_off <= off && item_end >= map_end) { 245 *segidp = de->vde_segid; 246 *map_offp = off - de->vde_off; 247 return (B_TRUE); 248 } 249 } 250 return (B_FALSE); 251 } 252 253 static void 254 vmmdev_devmem_purge(vmm_softc_t *sc) 255 { 256 vmm_devmem_entry_t *entry; 257 258 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 259 kmem_free(entry, sizeof (*entry)); 260 } 261 } 262 263 static int 264 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 265 { 266 int error; 267 bool sysmem = true; 268 269 if (VM_MEMSEG_NAME(mseg)) { 270 sysmem = false; 271 } 272 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 273 274 if (error == 0) { 275 /* 276 * Rather than create a whole fresh device from which userspace 277 * can mmap this segment, instead make it available at an 278 * offset above where the main guest memory resides. 279 */ 280 error = vmmdev_devmem_create(sc, mseg, mseg->name); 281 if (error != 0) { 282 vm_free_memseg(sc->vmm_vm, mseg->segid); 283 } 284 } 285 return (error); 286 } 287 288 /* 289 * Resource Locking and Exclusion 290 * 291 * Much of bhyve depends on key portions of VM state, such as the guest memory 292 * map, to remain unchanged while the guest is running. As ported from 293 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 294 * access to the instance vCPUs. Threads acting on a single vCPU, like those 295 * performing the work of actually running the guest in VMX/SVM, would lock 296 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 297 * state, all of the vCPUs would be first locked, ensuring that the 298 * operation(s) could complete without any other threads stumbling into 299 * intermediate states. 300 * 301 * This approach is largely effective for bhyve. Common operations, such as 302 * running the vCPUs, steer clear of lock contention. The model begins to 303 * break down for operations which do not occur in the context of a specific 304 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 305 * thread in the bhyve process. In order to properly protect those vCPU-less 306 * operations from encountering invalid states, additional locking is required. 307 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 308 * It does mean that class of operations will be serialized on locking the 309 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 310 * undue contention on the VM_MAXCPU-1 vCPU. 311 * 312 * In order to address the shortcomings of this model, the concept of a 313 * read/write lock has been added to bhyve. 
Operations which change 314 * fundamental aspects of a VM (such as the memory map) must acquire the write 315 * lock, which also implies locking all of the vCPUs and waiting for all read 316 * lock holders to release. While it increases the cost and waiting time for 317 * those few operations, it allows most hot-path operations on the VM (which 318 * depend on its configuration remaining stable) to occur with minimal locking. 319 * 320 * Consumers of the Driver API (see below) are a special case when it comes to 321 * this locking, since they may hold a read lock via the drv_lease mechanism 322 * for an extended period of time. Rather than forcing those consumers to 323 * continuously poll for a write lock attempt, the lease system forces them to 324 * provide a release callback to trigger their clean-up (and potential later 325 * reacquisition) of the read lock. 326 */ 327 328 static void 329 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 330 { 331 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 332 333 /* 334 * Since this state transition is utilizing from_idle=true, it should 335 * not fail, but rather block until it can be successful. 336 */ 337 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 338 } 339 340 static void 341 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 342 { 343 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 344 345 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 346 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 347 } 348 349 static void 350 vmm_read_lock(vmm_softc_t *sc) 351 { 352 rw_enter(&sc->vmm_rwlock, RW_READER); 353 } 354 355 static void 356 vmm_read_unlock(vmm_softc_t *sc) 357 { 358 rw_exit(&sc->vmm_rwlock); 359 } 360 361 static void 362 vmm_write_lock(vmm_softc_t *sc) 363 { 364 int maxcpus; 365 366 /* First lock all the vCPUs */ 367 maxcpus = vm_get_maxcpus(sc->vmm_vm); 368 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 369 vcpu_lock_one(sc, vcpu); 370 } 371 372 /* 373 * Block vmm_drv leases from being acquired or held while the VM write 374 * lock is held. 375 */ 376 vmm_lease_block(sc); 377 378 rw_enter(&sc->vmm_rwlock, RW_WRITER); 379 /* 380 * For now, the 'maxcpus' value for an instance is fixed at the 381 * compile-time constant of VM_MAXCPU at creation. If this changes in 382 * the future, allowing for dynamic vCPU resource sizing, acquisition 383 * of the write lock will need to be wary of such changes. 384 */ 385 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 386 } 387 388 static void 389 vmm_write_unlock(vmm_softc_t *sc) 390 { 391 int maxcpus; 392 393 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 394 vmm_lease_unblock(sc); 395 396 /* 397 * The VM write lock _must_ be released from the same thread it was 398 * acquired in, unlike the read lock. 399 */ 400 VERIFY(rw_write_held(&sc->vmm_rwlock)); 401 rw_exit(&sc->vmm_rwlock); 402 403 /* Unlock all the vCPUs */ 404 maxcpus = vm_get_maxcpus(sc->vmm_vm); 405 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 406 vcpu_unlock_one(sc, vcpu); 407 } 408 } 409 410 static int 411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 412 cred_t *credp, int *rvalp) 413 { 414 int error = 0, vcpu = -1; 415 void *datap = (void *)arg; 416 enum vm_lock_type { 417 LOCK_NONE = 0, 418 LOCK_VCPU, 419 LOCK_READ_HOLD, 420 LOCK_WRITE_HOLD 421 } lock_type = LOCK_NONE; 422 423 /* Acquire any exclusion resources needed for the operation. 
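	 * As enumerated in the cases below: vCPU-scoped operations such as
	 * VM_RUN lock only the targeted vCPU, operations which alter VM-wide
	 * state such as VM_ALLOC_MEMSEG and VM_MMAP_MEMSEG take the write
	 * lock, and operations such as VM_LAPIC_MSI or VM_RTC_READ take the
	 * read lock.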
*/ 424 switch (cmd) { 425 case VM_RUN: 426 case VM_GET_REGISTER: 427 case VM_SET_REGISTER: 428 case VM_GET_SEGMENT_DESCRIPTOR: 429 case VM_SET_SEGMENT_DESCRIPTOR: 430 case VM_GET_REGISTER_SET: 431 case VM_SET_REGISTER_SET: 432 case VM_INJECT_EXCEPTION: 433 case VM_GET_CAPABILITY: 434 case VM_SET_CAPABILITY: 435 case VM_PPTDEV_MSI: 436 case VM_PPTDEV_MSIX: 437 case VM_SET_X2APIC_STATE: 438 case VM_GLA2GPA: 439 case VM_GLA2GPA_NOFAULT: 440 case VM_ACTIVATE_CPU: 441 case VM_SET_INTINFO: 442 case VM_GET_INTINFO: 443 case VM_RESTART_INSTRUCTION: 444 case VM_SET_KERNEMU_DEV: 445 case VM_GET_KERNEMU_DEV: 446 case VM_RESET_CPU: 447 case VM_GET_RUN_STATE: 448 case VM_SET_RUN_STATE: 449 case VM_GET_FPU: 450 case VM_SET_FPU: 451 /* 452 * Copy in the ID of the vCPU chosen for this operation. 453 * Since a nefarious caller could update their struct between 454 * this locking and when the rest of the ioctl data is copied 455 * in, it is _critical_ that this local 'vcpu' variable be used 456 * rather than the in-struct one when performing the ioctl. 457 */ 458 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 459 return (EFAULT); 460 } 461 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 462 return (EINVAL); 463 } 464 vcpu_lock_one(sc, vcpu); 465 lock_type = LOCK_VCPU; 466 break; 467 468 case VM_REINIT: 469 case VM_BIND_PPTDEV: 470 case VM_UNBIND_PPTDEV: 471 case VM_MAP_PPTDEV_MMIO: 472 case VM_UNMAP_PPTDEV_MMIO: 473 case VM_ALLOC_MEMSEG: 474 case VM_MMAP_MEMSEG: 475 case VM_MUNMAP_MEMSEG: 476 case VM_WRLOCK_CYCLE: 477 case VM_PMTMR_LOCATE: 478 vmm_write_lock(sc); 479 lock_type = LOCK_WRITE_HOLD; 480 break; 481 482 case VM_GET_MEMSEG: 483 case VM_MMAP_GETNEXT: 484 case VM_LAPIC_IRQ: 485 case VM_INJECT_NMI: 486 case VM_IOAPIC_ASSERT_IRQ: 487 case VM_IOAPIC_DEASSERT_IRQ: 488 case VM_IOAPIC_PULSE_IRQ: 489 case VM_LAPIC_MSI: 490 case VM_LAPIC_LOCAL_IRQ: 491 case VM_GET_X2APIC_STATE: 492 case VM_RTC_READ: 493 case VM_RTC_WRITE: 494 case VM_RTC_SETTIME: 495 case VM_RTC_GETTIME: 496 case VM_PPTDEV_DISABLE_MSIX: 497 case VM_DEVMEM_GETOFFSET: 498 case VM_TRACK_DIRTY_PAGES: 499 vmm_read_lock(sc); 500 lock_type = LOCK_READ_HOLD; 501 break; 502 503 case VM_DATA_READ: 504 case VM_DATA_WRITE: 505 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 506 return (EFAULT); 507 } 508 if (vcpu == -1) { 509 /* Access data for VM-wide devices */ 510 vmm_write_lock(sc); 511 lock_type = LOCK_WRITE_HOLD; 512 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 513 /* Access data associated with a specific vCPU */ 514 vcpu_lock_one(sc, vcpu); 515 lock_type = LOCK_VCPU; 516 } else { 517 return (EINVAL); 518 } 519 break; 520 521 case VM_GET_GPA_PMAP: 522 case VM_IOAPIC_PINCOUNT: 523 case VM_SUSPEND: 524 case VM_DESC_FPU_AREA: 525 case VM_SET_AUTODESTRUCT: 526 default: 527 break; 528 } 529 530 /* Execute the primary logic for the ioctl. */ 531 switch (cmd) { 532 case VM_RUN: { 533 struct vm_entry entry; 534 535 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 536 error = EFAULT; 537 break; 538 } 539 540 if (!(curthread->t_schedflag & TS_VCPU)) 541 smt_mark_as_vcpu(); 542 543 error = vm_run(sc->vmm_vm, vcpu, &entry); 544 545 /* 546 * Unexpected states in vm_run() are expressed through positive 547 * errno-oriented return values. VM states which expect further 548 * processing in userspace (necessary context via exitinfo) are 549 * expressed through negative return values. For the time being 550 * a return value of 0 is not expected from vm_run(). 
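		 * (A negative value is handled just below by copying the
		 * vm_exit structure from vm_exitinfo() out to userspace and
		 * clearing the error; positive errno values are returned to
		 * the caller unchanged.)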
551 */ 552 ASSERT(error != 0); 553 if (error < 0) { 554 const struct vm_exit *vme; 555 void *outp = entry.exit_data; 556 557 error = 0; 558 vme = vm_exitinfo(sc->vmm_vm, vcpu); 559 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 560 error = EFAULT; 561 } 562 } 563 break; 564 } 565 case VM_SUSPEND: { 566 struct vm_suspend vmsuspend; 567 568 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 569 error = EFAULT; 570 break; 571 } 572 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 573 break; 574 } 575 case VM_REINIT: { 576 struct vm_reinit reinit; 577 578 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 579 error = EFAULT; 580 break; 581 } 582 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 583 /* 584 * The VM instance should be free of driver-attached 585 * hooks during the reinitialization process. 586 */ 587 break; 588 } 589 error = vm_reinit(sc->vmm_vm, reinit.flags); 590 (void) vmm_drv_block_hook(sc, B_FALSE); 591 break; 592 } 593 case VM_STAT_DESC: { 594 struct vm_stat_desc statdesc; 595 596 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 597 error = EFAULT; 598 break; 599 } 600 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 601 sizeof (statdesc.desc)); 602 if (error == 0 && 603 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 604 error = EFAULT; 605 break; 606 } 607 break; 608 } 609 case VM_STATS_IOC: { 610 struct vm_stats vmstats; 611 612 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 613 error = EFAULT; 614 break; 615 } 616 hrt2tv(gethrtime(), &vmstats.tv); 617 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 618 nitems(vmstats.statbuf), 619 &vmstats.num_entries, vmstats.statbuf); 620 if (error == 0 && 621 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 622 error = EFAULT; 623 break; 624 } 625 break; 626 } 627 628 case VM_PPTDEV_MSI: { 629 struct vm_pptdev_msi pptmsi; 630 631 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 632 error = EFAULT; 633 break; 634 } 635 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 636 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 637 break; 638 } 639 case VM_PPTDEV_MSIX: { 640 struct vm_pptdev_msix pptmsix; 641 642 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 643 error = EFAULT; 644 break; 645 } 646 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 647 pptmsix.idx, pptmsix.addr, pptmsix.msg, 648 pptmsix.vector_control); 649 break; 650 } 651 case VM_PPTDEV_DISABLE_MSIX: { 652 struct vm_pptdev pptdev; 653 654 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 655 error = EFAULT; 656 break; 657 } 658 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 659 break; 660 } 661 case VM_MAP_PPTDEV_MMIO: { 662 struct vm_pptdev_mmio pptmmio; 663 664 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 665 error = EFAULT; 666 break; 667 } 668 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 669 pptmmio.len, pptmmio.hpa); 670 break; 671 } 672 case VM_UNMAP_PPTDEV_MMIO: { 673 struct vm_pptdev_mmio pptmmio; 674 675 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 676 error = EFAULT; 677 break; 678 } 679 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 680 pptmmio.len); 681 break; 682 } 683 case VM_BIND_PPTDEV: { 684 struct vm_pptdev pptdev; 685 686 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 687 error = EFAULT; 688 break; 689 } 690 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 691 break; 692 } 693 case VM_UNBIND_PPTDEV: { 694 struct vm_pptdev pptdev; 695 696 if (ddi_copyin(datap, &pptdev, 
sizeof (pptdev), md)) { 697 error = EFAULT; 698 break; 699 } 700 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 701 break; 702 } 703 case VM_GET_PPTDEV_LIMITS: { 704 struct vm_pptdev_limits pptlimits; 705 706 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 707 error = EFAULT; 708 break; 709 } 710 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 711 &pptlimits.msi_limit, &pptlimits.msix_limit); 712 if (error == 0 && 713 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 714 error = EFAULT; 715 break; 716 } 717 break; 718 } 719 case VM_INJECT_EXCEPTION: { 720 struct vm_exception vmexc; 721 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 722 error = EFAULT; 723 break; 724 } 725 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 726 vmexc.error_code_valid != 0, vmexc.error_code, 727 vmexc.restart_instruction != 0); 728 break; 729 } 730 case VM_INJECT_NMI: { 731 struct vm_nmi vmnmi; 732 733 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 734 error = EFAULT; 735 break; 736 } 737 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 738 break; 739 } 740 case VM_LAPIC_IRQ: { 741 struct vm_lapic_irq vmirq; 742 743 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 748 break; 749 } 750 case VM_LAPIC_LOCAL_IRQ: { 751 struct vm_lapic_irq vmirq; 752 753 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 754 error = EFAULT; 755 break; 756 } 757 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 758 vmirq.vector); 759 break; 760 } 761 case VM_LAPIC_MSI: { 762 struct vm_lapic_msi vmmsi; 763 764 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 765 error = EFAULT; 766 break; 767 } 768 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 769 break; 770 } 771 772 case VM_IOAPIC_ASSERT_IRQ: { 773 struct vm_ioapic_irq ioapic_irq; 774 775 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 776 error = EFAULT; 777 break; 778 } 779 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 780 break; 781 } 782 case VM_IOAPIC_DEASSERT_IRQ: { 783 struct vm_ioapic_irq ioapic_irq; 784 785 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 786 error = EFAULT; 787 break; 788 } 789 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 790 break; 791 } 792 case VM_IOAPIC_PULSE_IRQ: { 793 struct vm_ioapic_irq ioapic_irq; 794 795 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 796 error = EFAULT; 797 break; 798 } 799 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 800 break; 801 } 802 case VM_IOAPIC_PINCOUNT: { 803 int pincount; 804 805 pincount = vioapic_pincount(sc->vmm_vm); 806 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 807 error = EFAULT; 808 break; 809 } 810 break; 811 } 812 case VM_DESC_FPU_AREA: { 813 struct vm_fpu_desc desc; 814 void *buf = NULL; 815 816 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 817 error = EFAULT; 818 break; 819 } 820 if (desc.vfd_num_entries > 64) { 821 error = EINVAL; 822 break; 823 } 824 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 825 desc.vfd_num_entries; 826 if (buf_sz != 0) { 827 buf = kmem_zalloc(buf_sz, KM_SLEEP); 828 } 829 830 /* 831 * For now, we are depending on vm_fpu_desc_entry and 832 * hma_xsave_state_desc_t having the same format. 
833 */ 834 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 835 sizeof (hma_xsave_state_desc_t)); 836 837 size_t req_size; 838 const uint_t max_entries = hma_fpu_describe_xsave_state( 839 (hma_xsave_state_desc_t *)buf, 840 desc.vfd_num_entries, 841 &req_size); 842 843 desc.vfd_req_size = req_size; 844 desc.vfd_num_entries = max_entries; 845 if (buf_sz != 0) { 846 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 847 error = EFAULT; 848 } 849 kmem_free(buf, buf_sz); 850 } 851 852 if (error == 0) { 853 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 854 error = EFAULT; 855 } 856 } 857 break; 858 } 859 case VM_SET_AUTODESTRUCT: { 860 /* 861 * Since this has to do with controlling the lifetime of the 862 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 863 * than the vcpu-centric or rwlock exclusion mechanisms. 864 */ 865 mutex_enter(&vmm_mtx); 866 sc->vmm_autodestruct = (arg != 0); 867 mutex_exit(&vmm_mtx); 868 break; 869 } 870 871 case VM_ISA_ASSERT_IRQ: { 872 struct vm_isa_irq isa_irq; 873 874 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 875 error = EFAULT; 876 break; 877 } 878 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 879 if (error == 0 && isa_irq.ioapic_irq != -1) { 880 error = vioapic_assert_irq(sc->vmm_vm, 881 isa_irq.ioapic_irq); 882 } 883 break; 884 } 885 case VM_ISA_DEASSERT_IRQ: { 886 struct vm_isa_irq isa_irq; 887 888 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 889 error = EFAULT; 890 break; 891 } 892 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 893 if (error == 0 && isa_irq.ioapic_irq != -1) { 894 error = vioapic_deassert_irq(sc->vmm_vm, 895 isa_irq.ioapic_irq); 896 } 897 break; 898 } 899 case VM_ISA_PULSE_IRQ: { 900 struct vm_isa_irq isa_irq; 901 902 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 903 error = EFAULT; 904 break; 905 } 906 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 907 if (error == 0 && isa_irq.ioapic_irq != -1) { 908 error = vioapic_pulse_irq(sc->vmm_vm, 909 isa_irq.ioapic_irq); 910 } 911 break; 912 } 913 case VM_ISA_SET_IRQ_TRIGGER: { 914 struct vm_isa_irq_trigger isa_irq_trigger; 915 916 if (ddi_copyin(datap, &isa_irq_trigger, 917 sizeof (isa_irq_trigger), md)) { 918 error = EFAULT; 919 break; 920 } 921 error = vatpic_set_irq_trigger(sc->vmm_vm, 922 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 923 break; 924 } 925 926 case VM_MMAP_GETNEXT: { 927 struct vm_memmap mm; 928 929 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 930 error = EFAULT; 931 break; 932 } 933 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 934 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 935 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 936 error = EFAULT; 937 break; 938 } 939 break; 940 } 941 case VM_MMAP_MEMSEG: { 942 struct vm_memmap mm; 943 944 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 945 error = EFAULT; 946 break; 947 } 948 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 949 mm.len, mm.prot, mm.flags); 950 break; 951 } 952 case VM_MUNMAP_MEMSEG: { 953 struct vm_munmap mu; 954 955 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 956 error = EFAULT; 957 break; 958 } 959 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 960 break; 961 } 962 case VM_ALLOC_MEMSEG: { 963 struct vm_memseg vmseg; 964 965 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 966 error = EFAULT; 967 break; 968 } 969 error = vmmdev_alloc_memseg(sc, &vmseg); 970 break; 971 } 972 case VM_GET_MEMSEG: { 973 struct vm_memseg vmseg; 974 975 if (ddi_copyin(datap, &vmseg, 
sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state. Callers
			 * should be wary of this.
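			 * A caller needing a consistent view after a partial
			 * failure may wish to re-read the registers, e.g. via
			 * VM_GET_REGISTER_SET.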
1106 */ 1107 if (regnums[i] < 0) { 1108 error = EINVAL; 1109 break; 1110 } 1111 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1112 regvals[i]); 1113 } 1114 break; 1115 } 1116 case VM_RESET_CPU: { 1117 struct vm_vcpu_reset vvr; 1118 1119 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1120 error = EFAULT; 1121 break; 1122 } 1123 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1124 error = EINVAL; 1125 } 1126 1127 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1128 break; 1129 } 1130 case VM_GET_RUN_STATE: { 1131 struct vm_run_state vrs; 1132 1133 bzero(&vrs, sizeof (vrs)); 1134 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1135 &vrs.sipi_vector); 1136 if (error == 0) { 1137 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1138 error = EFAULT; 1139 break; 1140 } 1141 } 1142 break; 1143 } 1144 case VM_SET_RUN_STATE: { 1145 struct vm_run_state vrs; 1146 1147 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1148 error = EFAULT; 1149 break; 1150 } 1151 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1152 vrs.sipi_vector); 1153 break; 1154 } 1155 case VM_GET_FPU: { 1156 struct vm_fpu_state req; 1157 const size_t max_len = (PAGESIZE * 2); 1158 void *kbuf; 1159 1160 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1161 error = EFAULT; 1162 break; 1163 } 1164 if (req.len > max_len || req.len == 0) { 1165 error = EINVAL; 1166 break; 1167 } 1168 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1169 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1170 if (error == 0) { 1171 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1172 error = EFAULT; 1173 } 1174 } 1175 kmem_free(kbuf, req.len); 1176 break; 1177 } 1178 case VM_SET_FPU: { 1179 struct vm_fpu_state req; 1180 const size_t max_len = (PAGESIZE * 2); 1181 void *kbuf; 1182 1183 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1184 error = EFAULT; 1185 break; 1186 } 1187 if (req.len > max_len || req.len == 0) { 1188 error = EINVAL; 1189 break; 1190 } 1191 kbuf = kmem_alloc(req.len, KM_SLEEP); 1192 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1193 error = EFAULT; 1194 } else { 1195 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1196 } 1197 kmem_free(kbuf, req.len); 1198 break; 1199 } 1200 1201 case VM_SET_KERNEMU_DEV: 1202 case VM_GET_KERNEMU_DEV: { 1203 struct vm_readwrite_kernemu_device kemu; 1204 size_t size = 0; 1205 1206 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1207 error = EFAULT; 1208 break; 1209 } 1210 1211 if (kemu.access_width > 3) { 1212 error = EINVAL; 1213 break; 1214 } 1215 size = (1 << kemu.access_width); 1216 ASSERT(size >= 1 && size <= 8); 1217 1218 if (cmd == VM_SET_KERNEMU_DEV) { 1219 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1220 kemu.gpa, kemu.value, size); 1221 } else { 1222 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1223 kemu.gpa, &kemu.value, size); 1224 } 1225 1226 if (error == 0) { 1227 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1228 error = EFAULT; 1229 break; 1230 } 1231 } 1232 break; 1233 } 1234 1235 case VM_GET_CAPABILITY: { 1236 struct vm_capability vmcap; 1237 1238 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1239 error = EFAULT; 1240 break; 1241 } 1242 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1243 &vmcap.capval); 1244 if (error == 0 && 1245 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1246 error = EFAULT; 1247 break; 1248 } 1249 break; 1250 } 1251 case VM_SET_CAPABILITY: { 1252 struct vm_capability vmcap; 1253 1254 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1255 error = EFAULT; 1256 break; 1257 } 1258 
error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1259 vmcap.capval); 1260 break; 1261 } 1262 case VM_SET_X2APIC_STATE: { 1263 struct vm_x2apic x2apic; 1264 1265 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1266 error = EFAULT; 1267 break; 1268 } 1269 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1270 break; 1271 } 1272 case VM_GET_X2APIC_STATE: { 1273 struct vm_x2apic x2apic; 1274 1275 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1276 error = EFAULT; 1277 break; 1278 } 1279 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1280 &x2apic.state); 1281 if (error == 0 && 1282 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1283 error = EFAULT; 1284 break; 1285 } 1286 break; 1287 } 1288 case VM_GET_GPA_PMAP: { 1289 /* 1290 * Until there is a necessity to leak EPT/RVI PTE values to 1291 * userspace, this will remain unimplemented 1292 */ 1293 error = EINVAL; 1294 break; 1295 } 1296 case VM_GET_HPET_CAPABILITIES: { 1297 struct vm_hpet_cap hpetcap; 1298 1299 error = vhpet_getcap(&hpetcap); 1300 if (error == 0 && 1301 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1302 error = EFAULT; 1303 break; 1304 } 1305 break; 1306 } 1307 case VM_GLA2GPA: { 1308 struct vm_gla2gpa gg; 1309 1310 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1311 error = EFAULT; 1312 break; 1313 } 1314 gg.vcpuid = vcpu; 1315 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1316 gg.prot, &gg.gpa, &gg.fault); 1317 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1318 error = EFAULT; 1319 break; 1320 } 1321 break; 1322 } 1323 case VM_GLA2GPA_NOFAULT: { 1324 struct vm_gla2gpa gg; 1325 1326 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1327 error = EFAULT; 1328 break; 1329 } 1330 gg.vcpuid = vcpu; 1331 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1332 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1333 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1334 error = EFAULT; 1335 break; 1336 } 1337 break; 1338 } 1339 1340 case VM_ACTIVATE_CPU: 1341 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1342 break; 1343 1344 case VM_SUSPEND_CPU: 1345 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1346 error = EFAULT; 1347 } else { 1348 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1349 } 1350 break; 1351 1352 case VM_RESUME_CPU: 1353 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1354 error = EFAULT; 1355 } else { 1356 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1357 } 1358 break; 1359 1360 case VM_GET_CPUS: { 1361 struct vm_cpuset vm_cpuset; 1362 cpuset_t tempset; 1363 void *srcp = &tempset; 1364 int size; 1365 1366 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1367 error = EFAULT; 1368 break; 1369 } 1370 1371 /* Be more generous about sizing since our cpuset_t is large. */ 1372 size = vm_cpuset.cpusetsize; 1373 if (size <= 0 || size > sizeof (cpuset_t)) { 1374 error = ERANGE; 1375 } 1376 /* 1377 * If they want a ulong_t or less, make sure they receive the 1378 * low bits with all the useful information. 
1379 */ 1380 if (size <= sizeof (tempset.cpub[0])) { 1381 srcp = &tempset.cpub[0]; 1382 } 1383 1384 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1385 tempset = vm_active_cpus(sc->vmm_vm); 1386 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1387 tempset = vm_suspended_cpus(sc->vmm_vm); 1388 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1389 tempset = vm_debug_cpus(sc->vmm_vm); 1390 } else { 1391 error = EINVAL; 1392 } 1393 1394 ASSERT(size > 0 && size <= sizeof (tempset)); 1395 if (error == 0 && 1396 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1397 error = EFAULT; 1398 break; 1399 } 1400 break; 1401 } 1402 case VM_SET_INTINFO: { 1403 struct vm_intinfo vmii; 1404 1405 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1406 error = EFAULT; 1407 break; 1408 } 1409 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1410 break; 1411 } 1412 case VM_GET_INTINFO: { 1413 struct vm_intinfo vmii; 1414 1415 vmii.vcpuid = vcpu; 1416 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1417 &vmii.info2); 1418 if (error == 0 && 1419 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1420 error = EFAULT; 1421 break; 1422 } 1423 break; 1424 } 1425 case VM_RTC_WRITE: { 1426 struct vm_rtc_data rtcdata; 1427 1428 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1429 error = EFAULT; 1430 break; 1431 } 1432 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1433 rtcdata.value); 1434 break; 1435 } 1436 case VM_RTC_READ: { 1437 struct vm_rtc_data rtcdata; 1438 1439 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1440 error = EFAULT; 1441 break; 1442 } 1443 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1444 &rtcdata.value); 1445 if (error == 0 && 1446 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1447 error = EFAULT; 1448 break; 1449 } 1450 break; 1451 } 1452 case VM_RTC_SETTIME: { 1453 struct vm_rtc_time rtctime; 1454 1455 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1456 error = EFAULT; 1457 break; 1458 } 1459 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1460 break; 1461 } 1462 case VM_RTC_GETTIME: { 1463 struct vm_rtc_time rtctime; 1464 1465 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1466 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1467 error = EFAULT; 1468 break; 1469 } 1470 break; 1471 } 1472 1473 case VM_PMTMR_LOCATE: { 1474 uint16_t port = arg; 1475 error = vpmtmr_set_location(sc->vmm_vm, port); 1476 break; 1477 } 1478 1479 case VM_RESTART_INSTRUCTION: 1480 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1481 break; 1482 1483 case VM_SET_TOPOLOGY: { 1484 struct vm_cpu_topology topo; 1485 1486 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1487 error = EFAULT; 1488 break; 1489 } 1490 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1491 topo.threads, topo.maxcpus); 1492 break; 1493 } 1494 case VM_GET_TOPOLOGY: { 1495 struct vm_cpu_topology topo; 1496 1497 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1498 &topo.threads, &topo.maxcpus); 1499 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1500 error = EFAULT; 1501 break; 1502 } 1503 break; 1504 } 1505 case VM_DEVMEM_GETOFFSET: { 1506 struct vm_devmem_offset vdo; 1507 vmm_devmem_entry_t *de; 1508 1509 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1510 error = EFAULT; 1511 break; 1512 } 1513 1514 de = vmmdev_devmem_find(sc, vdo.segid); 1515 if (de != NULL) { 1516 vdo.offset = de->vde_off; 1517 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1518 error = EFAULT; 1519 } 1520 } else { 1521 error = ENOENT; 1522 } 1523 break; 1524 } 1525 case 
VM_TRACK_DIRTY_PAGES: { 1526 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1527 struct vmm_dirty_tracker tracker; 1528 uint8_t *bitmap; 1529 size_t len; 1530 1531 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1532 error = EFAULT; 1533 break; 1534 } 1535 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1536 error = EINVAL; 1537 break; 1538 } 1539 if (tracker.vdt_len == 0) { 1540 break; 1541 } 1542 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1543 error = EINVAL; 1544 break; 1545 } 1546 if (tracker.vdt_len > max_track_region_len) { 1547 error = EINVAL; 1548 break; 1549 } 1550 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1551 bitmap = kmem_zalloc(len, KM_SLEEP); 1552 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1553 tracker.vdt_len, bitmap); 1554 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1555 error = EFAULT; 1556 } 1557 kmem_free(bitmap, len); 1558 1559 break; 1560 } 1561 case VM_WRLOCK_CYCLE: { 1562 /* 1563 * Present a test mechanism to acquire/release the write lock 1564 * on the VM without any other effects. 1565 */ 1566 break; 1567 } 1568 case VM_DATA_READ: { 1569 struct vm_data_xfer vdx; 1570 1571 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1572 error = EFAULT; 1573 break; 1574 } 1575 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1576 error = EINVAL; 1577 break; 1578 } 1579 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1580 error = EFBIG; 1581 break; 1582 } 1583 1584 const size_t len = vdx.vdx_len; 1585 void *buf = NULL; 1586 if (len != 0) { 1587 buf = kmem_alloc(len, KM_SLEEP); 1588 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1589 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1590 kmem_free(buf, len); 1591 error = EFAULT; 1592 break; 1593 } else { 1594 bzero(buf, len); 1595 } 1596 } 1597 1598 vdx.vdx_result_len = 0; 1599 vmm_data_req_t req = { 1600 .vdr_class = vdx.vdx_class, 1601 .vdr_version = vdx.vdx_version, 1602 .vdr_flags = vdx.vdx_flags, 1603 .vdr_len = len, 1604 .vdr_data = buf, 1605 .vdr_result_len = &vdx.vdx_result_len, 1606 }; 1607 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1608 1609 if (error == 0 && buf != NULL) { 1610 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1611 error = EFAULT; 1612 } 1613 } 1614 1615 /* 1616 * Copy out the transfer request so that the value of 1617 * vdx_result_len can be made available, regardless of any 1618 * error(s) which may have occurred. 1619 */ 1620 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1621 error = (error != 0) ? 
error : EFAULT; 1622 } 1623 1624 if (buf != NULL) { 1625 kmem_free(buf, len); 1626 } 1627 break; 1628 } 1629 case VM_DATA_WRITE: { 1630 struct vm_data_xfer vdx; 1631 1632 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1633 error = EFAULT; 1634 break; 1635 } 1636 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1637 error = EINVAL; 1638 break; 1639 } 1640 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1641 error = EFBIG; 1642 break; 1643 } 1644 1645 const size_t len = vdx.vdx_len; 1646 void *buf = NULL; 1647 if (len != 0) { 1648 buf = kmem_alloc(len, KM_SLEEP); 1649 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1650 kmem_free(buf, len); 1651 error = EFAULT; 1652 break; 1653 } 1654 } 1655 1656 vdx.vdx_result_len = 0; 1657 vmm_data_req_t req = { 1658 .vdr_class = vdx.vdx_class, 1659 .vdr_version = vdx.vdx_version, 1660 .vdr_flags = vdx.vdx_flags, 1661 .vdr_len = len, 1662 .vdr_data = buf, 1663 .vdr_result_len = &vdx.vdx_result_len, 1664 }; 1665 if (vmm_allow_state_writes == 0) { 1666 /* XXX: Play it safe for now */ 1667 error = EPERM; 1668 } else { 1669 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1670 &req); 1671 } 1672 1673 if (error == 0 && buf != NULL && 1674 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1675 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1676 error = EFAULT; 1677 } 1678 } 1679 1680 /* 1681 * Copy out the transfer request so that the value of 1682 * vdx_result_len can be made available, regardless of any 1683 * error(s) which may have occurred. 1684 */ 1685 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1686 error = (error != 0) ? error : EFAULT; 1687 } 1688 1689 if (buf != NULL) { 1690 kmem_free(buf, len); 1691 } 1692 break; 1693 } 1694 1695 default: 1696 error = ENOTTY; 1697 break; 1698 } 1699 1700 /* Release exclusion resources */ 1701 switch (lock_type) { 1702 case LOCK_NONE: 1703 break; 1704 case LOCK_VCPU: 1705 vcpu_unlock_one(sc, vcpu); 1706 break; 1707 case LOCK_READ_HOLD: 1708 vmm_read_unlock(sc); 1709 break; 1710 case LOCK_WRITE_HOLD: 1711 vmm_write_unlock(sc); 1712 break; 1713 default: 1714 panic("unexpected lock type"); 1715 break; 1716 } 1717 1718 return (error); 1719 } 1720 1721 static vmm_softc_t * 1722 vmm_lookup(const char *name) 1723 { 1724 list_t *vml = &vmm_list; 1725 vmm_softc_t *sc; 1726 1727 ASSERT(MUTEX_HELD(&vmm_mtx)); 1728 1729 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1730 if (strcmp(sc->vmm_name, name) == 0) { 1731 break; 1732 } 1733 } 1734 1735 return (sc); 1736 } 1737 1738 /* 1739 * Acquire an HMA registration if not already held. 1740 */ 1741 static boolean_t 1742 vmm_hma_acquire(void) 1743 { 1744 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1745 1746 mutex_enter(&vmmdev_mtx); 1747 1748 if (vmmdev_hma_reg == NULL) { 1749 VERIFY3U(vmmdev_hma_ref, ==, 0); 1750 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1751 if (vmmdev_hma_reg == NULL) { 1752 cmn_err(CE_WARN, "%s HMA registration failed.", 1753 vmmdev_hvm_name); 1754 mutex_exit(&vmmdev_mtx); 1755 return (B_FALSE); 1756 } 1757 } 1758 1759 vmmdev_hma_ref++; 1760 1761 mutex_exit(&vmmdev_mtx); 1762 1763 return (B_TRUE); 1764 } 1765 1766 /* 1767 * Release the HMA registration if held and there are no remaining VMs. 
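 * The registration is reference-counted in vmmdev_hma_ref; hma_unregister()
 * is called only when the final reference is dropped.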
1768 */ 1769 static void 1770 vmm_hma_release(void) 1771 { 1772 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1773 1774 mutex_enter(&vmmdev_mtx); 1775 1776 VERIFY3U(vmmdev_hma_ref, !=, 0); 1777 1778 vmmdev_hma_ref--; 1779 1780 if (vmmdev_hma_ref == 0) { 1781 VERIFY(vmmdev_hma_reg != NULL); 1782 hma_unregister(vmmdev_hma_reg); 1783 vmmdev_hma_reg = NULL; 1784 } 1785 mutex_exit(&vmmdev_mtx); 1786 } 1787 1788 static int 1789 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1790 { 1791 vmm_softc_t *sc = NULL; 1792 minor_t minor; 1793 int error = ENOMEM; 1794 size_t len; 1795 const char *name = req->name; 1796 1797 len = strnlen(name, VM_MAX_NAMELEN); 1798 if (len == 0) { 1799 return (EINVAL); 1800 } 1801 if (len >= VM_MAX_NAMELEN) { 1802 return (ENAMETOOLONG); 1803 } 1804 if (strchr(name, '/') != NULL) { 1805 return (EINVAL); 1806 } 1807 1808 if (!vmm_hma_acquire()) 1809 return (ENXIO); 1810 1811 mutex_enter(&vmm_mtx); 1812 1813 /* Look for duplicate names */ 1814 if (vmm_lookup(name) != NULL) { 1815 mutex_exit(&vmm_mtx); 1816 vmm_hma_release(); 1817 return (EEXIST); 1818 } 1819 1820 /* Allow only one instance per non-global zone. */ 1821 if (!INGLOBALZONE(curproc)) { 1822 for (sc = list_head(&vmm_list); sc != NULL; 1823 sc = list_next(&vmm_list, sc)) { 1824 if (sc->vmm_zone == curzone) { 1825 mutex_exit(&vmm_mtx); 1826 vmm_hma_release(); 1827 return (EINVAL); 1828 } 1829 } 1830 } 1831 1832 minor = id_alloc(vmm_minors); 1833 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1834 goto fail; 1835 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1836 ddi_soft_state_free(vmm_statep, minor); 1837 goto fail; 1838 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 1839 DDI_PSEUDO, 0) != DDI_SUCCESS) { 1840 goto fail; 1841 } 1842 1843 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 1844 goto fail; 1845 } 1846 1847 error = vm_create(req->flags, &sc->vmm_vm); 1848 if (error == 0) { 1849 /* Complete VM intialization and report success. */ 1850 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 1851 sc->vmm_minor = minor; 1852 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 1853 offsetof(vmm_devmem_entry_t, vde_node)); 1854 1855 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 1856 offsetof(vmm_hold_t, vmh_node)); 1857 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 1858 1859 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 1860 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 1861 offsetof(vmm_lease_t, vml_node)); 1862 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 1863 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 1864 1865 sc->vmm_zone = crgetzone(cr); 1866 zone_hold(sc->vmm_zone); 1867 vmm_zsd_add_vm(sc); 1868 vmm_kstat_init(sc); 1869 1870 list_insert_tail(&vmm_list, sc); 1871 mutex_exit(&vmm_mtx); 1872 return (0); 1873 } 1874 1875 vmm_kstat_fini(sc); 1876 ddi_remove_minor_node(vmmdev_dip, name); 1877 fail: 1878 id_free(vmm_minors, minor); 1879 if (sc != NULL) { 1880 ddi_soft_state_free(vmm_statep, minor); 1881 } 1882 mutex_exit(&vmm_mtx); 1883 vmm_hma_release(); 1884 1885 return (error); 1886 } 1887 1888 /* 1889 * Bhyve 'Driver' Interface 1890 * 1891 * While many devices are emulated in the bhyve userspace process, there are 1892 * others with performance constraints which require that they run mostly or 1893 * entirely in-kernel. For those not integrated directly into bhyve, an API is 1894 * needed so they can query/manipulate the portions of VM state needed to 1895 * fulfill their purpose. 
1896 * 1897 * This includes: 1898 * - Translating guest-physical addresses to host-virtual pointers 1899 * - Injecting MSIs 1900 * - Hooking IO port addresses 1901 * 1902 * The vmm_drv interface exists to provide that functionality to its consumers. 1903 * (At this time, 'viona' is the only user) 1904 */ 1905 int 1906 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 1907 { 1908 vnode_t *vp = fp->f_vnode; 1909 const dev_t dev = vp->v_rdev; 1910 vmm_softc_t *sc; 1911 vmm_hold_t *hold; 1912 int err = 0; 1913 1914 if (vp->v_type != VCHR) { 1915 return (ENXIO); 1916 } 1917 const major_t major = getmajor(dev); 1918 const minor_t minor = getminor(dev); 1919 1920 mutex_enter(&vmmdev_mtx); 1921 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 1922 mutex_exit(&vmmdev_mtx); 1923 return (ENOENT); 1924 } 1925 mutex_enter(&vmm_mtx); 1926 mutex_exit(&vmmdev_mtx); 1927 1928 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1929 err = ENOENT; 1930 goto out; 1931 } 1932 /* XXXJOY: check cred permissions against instance */ 1933 1934 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { 1935 err = EBUSY; 1936 goto out; 1937 } 1938 1939 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 1940 hold->vmh_sc = sc; 1941 hold->vmh_release_req = B_FALSE; 1942 1943 list_insert_tail(&sc->vmm_holds, hold); 1944 sc->vmm_flags |= VMM_HELD; 1945 *holdp = hold; 1946 1947 out: 1948 mutex_exit(&vmm_mtx); 1949 return (err); 1950 } 1951 1952 void 1953 vmm_drv_rele(vmm_hold_t *hold) 1954 { 1955 vmm_softc_t *sc; 1956 boolean_t hma_release = B_FALSE; 1957 1958 ASSERT(hold != NULL); 1959 ASSERT(hold->vmh_sc != NULL); 1960 VERIFY(hold->vmh_ioport_hook_cnt == 0); 1961 1962 mutex_enter(&vmm_mtx); 1963 sc = hold->vmh_sc; 1964 list_remove(&sc->vmm_holds, hold); 1965 if (list_is_empty(&sc->vmm_holds)) { 1966 sc->vmm_flags &= ~VMM_HELD; 1967 cv_broadcast(&sc->vmm_cv); 1968 1969 /* 1970 * If pending hold(s) had prevented an auto-destruct of the 1971 * instance when it was closed, finish that clean-up now. 
1972 */ 1973 if (sc->vmm_autodestruct && !sc->vmm_is_open) { 1974 int err = vmm_destroy_locked(sc, 1975 VDO_NO_PURGE_WAIT, &hma_release); 1976 1977 VERIFY0(err); 1978 VERIFY(hma_release); 1979 } 1980 } 1981 mutex_exit(&vmm_mtx); 1982 kmem_free(hold, sizeof (*hold)); 1983 1984 if (hma_release) { 1985 vmm_hma_release(); 1986 } 1987 } 1988 1989 boolean_t 1990 vmm_drv_release_reqd(vmm_hold_t *hold) 1991 { 1992 ASSERT(hold != NULL); 1993 1994 return (hold->vmh_release_req); 1995 } 1996 1997 vmm_lease_t * 1998 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 1999 { 2000 vmm_softc_t *sc = hold->vmh_sc; 2001 vmm_lease_t *lease; 2002 2003 ASSERT3P(expiref, !=, NULL); 2004 2005 if (hold->vmh_release_req) { 2006 return (NULL); 2007 } 2008 2009 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2010 list_link_init(&lease->vml_node); 2011 lease->vml_expire_func = expiref; 2012 lease->vml_expire_arg = arg; 2013 lease->vml_expired = B_FALSE; 2014 lease->vml_break_deferred = B_FALSE; 2015 lease->vml_hold = hold; 2016 /* cache the VM pointer for one less pointer chase */ 2017 lease->vml_vm = sc->vmm_vm; 2018 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2019 2020 mutex_enter(&sc->vmm_lease_lock); 2021 while (sc->vmm_lease_blocker != 0) { 2022 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2023 } 2024 list_insert_tail(&sc->vmm_lease_list, lease); 2025 vmm_read_lock(sc); 2026 mutex_exit(&sc->vmm_lease_lock); 2027 2028 return (lease); 2029 } 2030 2031 static void 2032 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2033 { 2034 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2035 2036 list_remove(&sc->vmm_lease_list, lease); 2037 vmm_read_unlock(sc); 2038 vmc_destroy(lease->vml_vmclient); 2039 kmem_free(lease, sizeof (*lease)); 2040 } 2041 2042 static void 2043 vmm_lease_block(vmm_softc_t *sc) 2044 { 2045 mutex_enter(&sc->vmm_lease_lock); 2046 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2047 sc->vmm_lease_blocker++; 2048 if (sc->vmm_lease_blocker == 1) { 2049 list_t *list = &sc->vmm_lease_list; 2050 vmm_lease_t *lease = list_head(list); 2051 2052 while (lease != NULL) { 2053 void *arg = lease->vml_expire_arg; 2054 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2055 boolean_t sync_break = B_FALSE; 2056 2057 /* 2058 * Since the lease expiration notification may 2059 * need to take locks which would deadlock with 2060 * vmm_lease_lock, drop it across the call. 2061 * 2062 * We are the only one allowed to manipulate 2063 * vmm_lease_list right now, so it is safe to 2064 * continue iterating through it after 2065 * reacquiring the lock. 2066 */ 2067 lease->vml_expired = B_TRUE; 2068 mutex_exit(&sc->vmm_lease_lock); 2069 sync_break = expiref(arg); 2070 mutex_enter(&sc->vmm_lease_lock); 2071 2072 if (sync_break) { 2073 vmm_lease_t *next; 2074 2075 /* 2076 * These leases which are synchronously broken 2077 * result in vmm_read_unlock() calls from a 2078 * different thread than the corresponding 2079 * vmm_read_lock(). This is acceptable, given 2080 * that the rwlock underpinning the whole 2081 * mechanism tolerates the behavior. This 2082 * flexibility is _only_ afforded to VM read 2083 * lock (RW_READER) holders. 2084 */ 2085 next = list_next(list, lease); 2086 vmm_lease_break_locked(sc, lease); 2087 lease = next; 2088 } else { 2089 lease = list_next(list, lease); 2090 } 2091 } 2092 2093 /* Process leases which were not broken synchronously. 
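		 * Such leases are freed here only once their holders have
		 * called vmm_drv_lease_break(), which marks them
		 * vml_break_deferred and signals vmm_lease_cv; until then,
		 * this thread waits on that CV.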
*/ 2094 while (!list_is_empty(list)) { 2095 /* 2096 * Although the nested loops are quadratic, the number 2097 * of leases is small. 2098 */ 2099 lease = list_head(list); 2100 while (lease != NULL) { 2101 vmm_lease_t *next = list_next(list, lease); 2102 if (lease->vml_break_deferred) { 2103 vmm_lease_break_locked(sc, lease); 2104 } 2105 lease = next; 2106 } 2107 if (list_is_empty(list)) { 2108 break; 2109 } 2110 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2111 } 2112 /* Wake anyone else waiting for the lease list to be empty */ 2113 cv_broadcast(&sc->vmm_lease_cv); 2114 } else { 2115 list_t *list = &sc->vmm_lease_list; 2116 2117 /* 2118 * Some other thread beat us to the duty of lease cleanup. 2119 * Wait until that is complete. 2120 */ 2121 while (!list_is_empty(list)) { 2122 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2123 } 2124 } 2125 mutex_exit(&sc->vmm_lease_lock); 2126 } 2127 2128 static void 2129 vmm_lease_unblock(vmm_softc_t *sc) 2130 { 2131 mutex_enter(&sc->vmm_lease_lock); 2132 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2133 sc->vmm_lease_blocker--; 2134 if (sc->vmm_lease_blocker == 0) { 2135 cv_broadcast(&sc->vmm_lease_cv); 2136 } 2137 mutex_exit(&sc->vmm_lease_lock); 2138 } 2139 2140 void 2141 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2142 { 2143 vmm_softc_t *sc = hold->vmh_sc; 2144 2145 VERIFY3P(hold, ==, lease->vml_hold); 2146 VERIFY(!lease->vml_break_deferred); 2147 2148 mutex_enter(&sc->vmm_lease_lock); 2149 if (sc->vmm_lease_blocker == 0) { 2150 vmm_lease_break_locked(sc, lease); 2151 } else { 2152 /* 2153 * Defer the lease-breaking to whichever thread is currently 2154 * cleaning up all leases as part of a vmm_lease_block() call. 2155 */ 2156 lease->vml_break_deferred = B_TRUE; 2157 cv_broadcast(&sc->vmm_lease_cv); 2158 } 2159 mutex_exit(&sc->vmm_lease_lock); 2160 } 2161 2162 boolean_t 2163 vmm_drv_lease_expired(vmm_lease_t *lease) 2164 { 2165 return (lease->vml_expired); 2166 } 2167 2168 vmm_page_t * 2169 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2170 { 2171 ASSERT(lease != NULL); 2172 ASSERT0(gpa & PAGEOFFSET); 2173 2174 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2175 } 2176 2177 void 2178 vmm_drv_page_release(vmm_page_t *vmmp) 2179 { 2180 (void) vmp_release((vm_page_t *)vmmp); 2181 } 2182 2183 void 2184 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2185 { 2186 (void) vmp_release_chain((vm_page_t *)vmmp); 2187 } 2188 2189 const void * 2190 vmm_drv_page_readable(const vmm_page_t *vmmp) 2191 { 2192 return (vmp_get_readable((const vm_page_t *)vmmp)); 2193 } 2194 2195 void * 2196 vmm_drv_page_writable(const vmm_page_t *vmmp) 2197 { 2198 return (vmp_get_writable((const vm_page_t *)vmmp)); 2199 } 2200 2201 void 2202 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2203 { 2204 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2205 } 2206 2207 vmm_page_t * 2208 vmm_drv_page_next(const vmm_page_t *vmmp) 2209 { 2210 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2211 } 2212 2213 int 2214 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2215 { 2216 ASSERT(lease != NULL); 2217 2218 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2219 } 2220 2221 int 2222 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2223 void *arg, void **cookie) 2224 { 2225 vmm_softc_t *sc; 2226 int err; 2227 2228 ASSERT(hold != NULL); 2229 ASSERT(cookie != NULL); 2230 2231 sc = hold->vmh_sc; 2232 mutex_enter(&vmm_mtx); 2233 /* Confirm that hook installation is not blocked */ 2234 if 

int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
    void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
	    arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}
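
/*
 * Illustrative sketch (editorial, not part of the driver): installing and
 * removing an I/O port hook.  The handler and port number are hypothetical;
 * the handler must match vmm_drv_iop_cb_t.  The cookie filled in by
 * vmm_drv_ioport_hook() identifies the hook to vmm_drv_ioport_unhook(), and
 * the hook must be removed before the hold is released.
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, 0x510, xdrv_ioport_cb, xs,
 *	    &cookie) == 0) {
 *		// guest accesses to port 0x510 now reach xdrv_ioport_cb
 *		...
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *	}
 *
 * EBUSY indicates that hook installation is currently blocked on the
 * instance (see vmm_drv_block_hook() below).
 */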

static int
vmm_drv_purge(vmm_softc_t *sc, boolean_t no_wait)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		sc->vmm_flags |= VMM_CLEANUP;
		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}

		/*
		 * Require that all leases on the instance be broken, now that
		 * all associated holds have been marked as needing release.
		 *
		 * Dropping vmm_mtx is not strictly necessary, but if any of the
		 * lessees are slow to respond, it would be nice to leave it
		 * available for other parties.
		 */
		mutex_exit(&vmm_mtx);
		vmm_lease_block(sc);
		vmm_lease_unblock(sc);
		mutex_enter(&vmm_mtx);

		/*
		 * With all of the leases broken, we can proceed in an orderly
		 * fashion to waiting for any lingering holds to be dropped.
		 */
		while ((sc->vmm_flags & VMM_HELD) != 0) {
			/*
			 * Some holds remain, so wait (if acceptable) for them
			 * to be cleaned up.
			 */
			if (no_wait ||
			    cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				sc->vmm_flags &= ~VMM_CLEANUP;
				return (EINTR);
			}
		}
		sc->vmm_flags &= ~VMM_CLEANUP;
	}

	VERIFY(list_is_empty(&sc->vmm_holds));
	sc->vmm_flags |= VMM_PURGED;
	return (0);
}

static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}

static int
vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
    boolean_t *hma_release)
{
	dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
	minor_t minor;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = B_FALSE;

	if (vmm_drv_purge(sc, (opts & VDO_NO_PURGE_WAIT) != 0) != 0) {
		return (EINTR);
	}

	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
		vmm_zsd_rem_vm(sc);
	}

	/* Clean up devmem entries */
	vmmdev_devmem_purge(sc);

	list_remove(&vmm_list, sc);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	minor = sc->vmm_minor;
	zone_rele(sc->vmm_zone);
	if (sc->vmm_is_open) {
		list_insert_tail(&vmm_destroy_list, sc);
		sc->vmm_flags |= VMM_DESTROY;
	} else {
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		*hma_release = B_TRUE;
	}
	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);

	return (0);
}

int
vmm_zone_vm_destroy(vmm_softc_t *sc)
{
	boolean_t hma_release = B_FALSE;
	int err;

	mutex_enter(&vmm_mtx);
	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
	boolean_t hma_release = B_FALSE;
	vmm_softc_t *sc;
	int err;

	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(req->name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also
	 * used for validation during create, and vmm names must currently be
	 * unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
		    KSTAT_DATA_UINT64);
		ksp->ks_private = sc->vmm_vm;
		ksp->ks_update = vmm_kstat_update_vcpu;
	}

	kstat_install(sc->vmm_kstat_vm);
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		kstat_install(sc->vmm_kstat_vcpu[i]);
	}
}
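
/*
 * Editorial note (not part of the driver): the kstats initialized above are
 * visible to userspace via kstat(1M) once installed.  Assuming the module
 * name from VMM_MODULE_NAME resolves to "vmm", something along the lines of
 *
 *	kstat -m vmm -i <minor> -n vcpu0
 *
 * would report the per-vCPU counters named here (vcpu, time_init, time_run,
 * time_idle, time_emu_kern, time_emu_user, time_sched), with values supplied
 * by vmm_kstat_update_vcpu(), while the "vm" kstat carries the instance name
 * in vm_name.  The exact module and class strings depend on definitions in
 * vmm_impl.h, so treat this as a sketch rather than authoritative output.
 */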

static void
vmm_kstat_fini(vmm_softc_t *sc)
{
	ASSERT(sc->vmm_kstat_vm != NULL);

	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		kstat_delete(sc->vmm_kstat_vcpu[i]);
		sc->vmm_kstat_vcpu[i] = NULL;
	}
}

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested and
	 * verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_is_open = B_TRUE;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;
	boolean_t hma_release = B_FALSE;

	minor = getminor(dev);
	if (minor == VMM_CTL_MINOR)
		return (0);

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY(sc->vmm_is_open);
	sc->vmm_is_open = B_FALSE;

	/*
	 * If this VM was destroyed while the vmm device was open, then
	 * clean it up now that it is closed.
	 */
	if (sc->vmm_flags & VMM_DESTROY) {
		list_remove(&vmm_destroy_list, sc);
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		hma_release = B_TRUE;
	} else if (sc->vmm_autodestruct) {
		/*
		 * Attempt auto-destruct on instance if requested.
		 *
		 * Do not wait for existing holds to be purged from the
		 * instance, since there is no guarantee that will happen in a
		 * timely manner. Auto-destruction will resume when the last
		 * hold is released. (See: vmm_drv_rele)
		 */
		(void) vmm_destroy_locked(sc, VDO_NO_PURGE_WAIT, &hma_release);
	}
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}
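
/*
 * Illustrative sketch (editorial, not part of the driver): querying the
 * control device from userspace.  The "/dev/vmmctl" path is the conventional
 * control-node location and is an assumption here; the ioctl numbers and the
 * exclusive-open requirement come from vmm_open() and vmm_ctl_ioctl() in
 * this file.
 *
 *	char why[128];
 *	int fd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *
 *	if (fd >= 0) {
 *		int vers = ioctl(fd, VMM_INTERFACE_VERSION, 0);
 *
 *		if (ioctl(fd, VMM_VM_SUPPORTED, why) != 0) {
 *			// errno and `why' describe why bhyve cannot run here
 *		}
 *		(void) close(fd);
 *	}
 */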

static int
vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	void *argp = (void *)arg;

	switch (cmd) {
	case VMM_CREATE_VM: {
		struct vm_create_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_create(&req, cr));
	}
	case VMM_DESTROY_VM: {
		struct vm_destroy_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_destroy(&req, cr));
	}
	case VMM_VM_SUPPORTED:
		return (vmm_is_supported(arg));
	case VMM_INTERFACE_VERSION:
		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
		return (0);
	case VMM_CHECK_IOMMU:
		if (!vmm_check_iommu()) {
			return (ENXIO);
		}
		return (0);
	case VMM_RESV_QUERY:
	case VMM_RESV_ADD:
	case VMM_RESV_REMOVE:
		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
	default:
		break;
	}
	/* No other actions are legal on ctl device */
	return (ENOTTY);
}
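
/*
 * Illustrative sketch (editorial, not part of the driver): destroying an
 * instance by name through the control device, matching what
 * vmmdev_do_vm_destroy() consumes.  The descriptor is assumed to have been
 * opened with write access, as in the previous sketch, and the instance name
 * is hypothetical.
 *
 *	struct vm_destroy_req req = { 0 };
 *
 *	(void) strlcpy(req.name, "testvm", sizeof (req.name));
 *	if (ioctl(ctl_fd, VMM_DESTROY_VM, &req) != 0) {
 *		// EPERM: not root, or wrong zone for this instance
 *		// ENOENT: no instance with that name
 *	}
 */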

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested and
	 * verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	int err;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t segoff;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
			err = ENODEV;
		} else {
			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
			    addrp, prot, maxprot, flags);
		}
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
		    maxprot, flags);
	}

	vmm_read_unlock(sc);
	return (err);
}

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};
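
/*
 * Editorial note (not part of the driver): these plugin ops are what surface
 * VM instances as character nodes under VMM_SDEV_ROOT.  Listing that
 * directory drives vmm_sdev_filldir(), which creates nodes only for
 * instances owned by the caller's zone (the global zone sees them all),
 * while vmm_sdev_validate() marks a node stale once its name/minor pairing
 * no longer matches a live instance, e.g. after the VM is destroyed and
 * re-created with the same name.
 */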

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node. Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	if (!vmmr_is_empty()) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node. */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_reg, ==, NULL);
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();
	vmmr_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
		vmmr_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();
	vmmr_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}