1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2022 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire 68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to 69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. 70 */ 71 72 static kmutex_t vmmdev_mtx; 73 static dev_info_t *vmmdev_dip; 74 static hma_reg_t *vmmdev_hma_reg; 75 static uint_t vmmdev_hma_ref; 76 static sdev_plugin_hdl_t vmmdev_sdev_hdl; 77 78 static kmutex_t vmm_mtx; 79 static list_t vmm_list; 80 static list_t vmm_destroy_list; 81 static id_space_t *vmm_minors; 82 static void *vmm_statep; 83 84 /* temporary safety switch */ 85 int vmm_allow_state_writes; 86 87 static const char *vmmdev_hvm_name = "bhyve"; 88 89 /* For sdev plugin (/dev) */ 90 #define VMM_SDEV_ROOT "/dev/vmm" 91 92 /* From uts/intel/io/vmm/intel/vmx.c */ 93 extern int vmx_x86_supported(const char **); 94 95 /* Holds and hooks from drivers external to vmm */ 96 struct vmm_hold { 97 list_node_t vmh_node; 98 vmm_softc_t *vmh_sc; 99 boolean_t vmh_release_req; 100 uint_t vmh_ioport_hook_cnt; 101 }; 102 103 struct vmm_lease { 104 list_node_t vml_node; 105 struct vm *vml_vm; 106 vm_client_t *vml_vmclient; 107 boolean_t vml_expired; 108 boolean_t vml_break_deferred; 109 boolean_t (*vml_expire_func)(void *); 110 void *vml_expire_arg; 111 struct vmm_hold *vml_hold; 112 }; 113 114 /* Options for vmm_destroy_locked */ 115 typedef enum vmm_destroy_opts { 116 VDO_DEFAULT = 0, 117 /* 118 * Request that zone-specific-data associated with this VM not be 119 * cleaned up as part of the destroy. 
Skipping ZSD clean-up is 120 * necessary when VM is being destroyed as part of zone destruction, 121 * when said ZSD is already being cleaned up. 122 */ 123 VDO_NO_CLEAN_ZSD = (1 << 0), 124 /* 125 * Skip any attempt to wait for vmm_drv consumers when attempting to 126 * purge them from the instance. When performing an auto-destruct, it 127 * is not desirable to wait, since said consumer might exist in a 128 * "higher" file descriptor which has not yet been closed. 129 */ 130 VDO_NO_PURGE_WAIT = (1 << 1), 131 } vmm_destroy_opts_t; 132 133 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, boolean_t *); 134 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 135 static void vmm_lease_block(vmm_softc_t *); 136 static void vmm_lease_unblock(vmm_softc_t *); 137 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 138 static void vmm_kstat_init(vmm_softc_t *); 139 static void vmm_kstat_fini(vmm_softc_t *); 140 141 /* 142 * The 'devmem' hack: 143 * 144 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 145 * in the vm which appear with their own name related to the vm under /dev. 146 * Since this would be a hassle from an sdev perspective and would require a 147 * new cdev interface (or complicate the existing one), we choose to implement 148 * this in a different manner. Direct access to the underlying vm memory 149 * segments is exposed by placing them in a range of offsets beyond the normal 150 * guest memory space. Userspace can query the appropriate offset to mmap() 151 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 152 */ 153 154 static vmm_devmem_entry_t * 155 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 156 { 157 vmm_devmem_entry_t *ent = NULL; 158 list_t *dl = &sc->vmm_devmem_list; 159 160 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 161 if (ent->vde_segid == segid) { 162 return (ent); 163 } 164 } 165 return (NULL); 166 } 167 168 static int 169 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 170 { 171 int error; 172 bool sysmem; 173 174 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 175 NULL); 176 if (error || mseg->len == 0) 177 return (error); 178 179 if (!sysmem) { 180 vmm_devmem_entry_t *de; 181 182 de = vmmdev_devmem_find(sc, mseg->segid); 183 if (de != NULL) { 184 (void) strlcpy(mseg->name, de->vde_name, 185 sizeof (mseg->name)); 186 } 187 } else { 188 bzero(mseg->name, sizeof (mseg->name)); 189 } 190 191 return (error); 192 } 193 194 static int 195 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 196 { 197 off_t map_offset; 198 vmm_devmem_entry_t *entry; 199 200 if (list_is_empty(&sc->vmm_devmem_list)) { 201 map_offset = VM_DEVMEM_START; 202 } else { 203 entry = list_tail(&sc->vmm_devmem_list); 204 map_offset = entry->vde_off + entry->vde_len; 205 if (map_offset < entry->vde_off) { 206 /* Do not tolerate overflow */ 207 return (ERANGE); 208 } 209 /* 210 * XXXJOY: We could choose to search the list for duplicate 211 * names and toss an error. Since we're using the offset 212 * method for now, it does not make much of a difference. 
213 */ 214 } 215 216 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 217 entry->vde_segid = mseg->segid; 218 entry->vde_len = mseg->len; 219 entry->vde_off = map_offset; 220 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 221 list_insert_tail(&sc->vmm_devmem_list, entry); 222 223 return (0); 224 } 225 226 static boolean_t 227 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 228 off_t *map_offp) 229 { 230 list_t *dl = &sc->vmm_devmem_list; 231 vmm_devmem_entry_t *de = NULL; 232 const off_t map_end = off + len; 233 234 VERIFY(off >= VM_DEVMEM_START); 235 236 if (map_end < off) { 237 /* No match on overflow */ 238 return (B_FALSE); 239 } 240 241 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 242 const off_t item_end = de->vde_off + de->vde_len; 243 244 if (de->vde_off <= off && item_end >= map_end) { 245 *segidp = de->vde_segid; 246 *map_offp = off - de->vde_off; 247 return (B_TRUE); 248 } 249 } 250 return (B_FALSE); 251 } 252 253 static void 254 vmmdev_devmem_purge(vmm_softc_t *sc) 255 { 256 vmm_devmem_entry_t *entry; 257 258 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 259 kmem_free(entry, sizeof (*entry)); 260 } 261 } 262 263 static int 264 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 265 { 266 int error; 267 bool sysmem = true; 268 269 if (VM_MEMSEG_NAME(mseg)) { 270 sysmem = false; 271 } 272 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 273 274 if (error == 0) { 275 /* 276 * Rather than create a whole fresh device from which userspace 277 * can mmap this segment, instead make it available at an 278 * offset above where the main guest memory resides. 279 */ 280 error = vmmdev_devmem_create(sc, mseg, mseg->name); 281 if (error != 0) { 282 vm_free_memseg(sc->vmm_vm, mseg->segid); 283 } 284 } 285 return (error); 286 } 287 288 /* 289 * Resource Locking and Exclusion 290 * 291 * Much of bhyve depends on key portions of VM state, such as the guest memory 292 * map, to remain unchanged while the guest is running. As ported from 293 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 294 * access to the instance vCPUs. Threads acting on a single vCPU, like those 295 * performing the work of actually running the guest in VMX/SVM, would lock 296 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 297 * state, all of the vCPUs would be first locked, ensuring that the 298 * operation(s) could complete without any other threads stumbling into 299 * intermediate states. 300 * 301 * This approach is largely effective for bhyve. Common operations, such as 302 * running the vCPUs, steer clear of lock contention. The model begins to 303 * break down for operations which do not occur in the context of a specific 304 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 305 * thread in the bhyve process. In order to properly protect those vCPU-less 306 * operations from encountering invalid states, additional locking is required. 307 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 308 * It does mean that class of operations will be serialized on locking the 309 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 310 * undue contention on the VM_MAXCPU-1 vCPU. 311 * 312 * In order to address the shortcomings of this model, the concept of a 313 * read/write lock has been added to bhyve. 
Operations which change 314 * fundamental aspects of a VM (such as the memory map) must acquire the write 315 * lock, which also implies locking all of the vCPUs and waiting for all read 316 * lock holders to release. While it increases the cost and waiting time for 317 * those few operations, it allows most hot-path operations on the VM (which 318 * depend on its configuration remaining stable) to occur with minimal locking. 319 * 320 * Consumers of the Driver API (see below) are a special case when it comes to 321 * this locking, since they may hold a read lock via the drv_lease mechanism 322 * for an extended period of time. Rather than forcing those consumers to 323 * continuously poll for a write lock attempt, the lease system forces them to 324 * provide a release callback to trigger their clean-up (and potential later 325 * reacquisition) of the read lock. 326 */ 327 328 static void 329 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 330 { 331 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 332 333 /* 334 * Since this state transition is utilizing from_idle=true, it should 335 * not fail, but rather block until it can be successful. 336 */ 337 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 338 } 339 340 static void 341 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 342 { 343 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 344 345 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 346 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 347 } 348 349 static void 350 vmm_read_lock(vmm_softc_t *sc) 351 { 352 rw_enter(&sc->vmm_rwlock, RW_READER); 353 } 354 355 static void 356 vmm_read_unlock(vmm_softc_t *sc) 357 { 358 rw_exit(&sc->vmm_rwlock); 359 } 360 361 static void 362 vmm_write_lock(vmm_softc_t *sc) 363 { 364 int maxcpus; 365 366 /* First lock all the vCPUs */ 367 maxcpus = vm_get_maxcpus(sc->vmm_vm); 368 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 369 vcpu_lock_one(sc, vcpu); 370 } 371 372 /* 373 * Block vmm_drv leases from being acquired or held while the VM write 374 * lock is held. 375 */ 376 vmm_lease_block(sc); 377 378 rw_enter(&sc->vmm_rwlock, RW_WRITER); 379 /* 380 * For now, the 'maxcpus' value for an instance is fixed at the 381 * compile-time constant of VM_MAXCPU at creation. If this changes in 382 * the future, allowing for dynamic vCPU resource sizing, acquisition 383 * of the write lock will need to be wary of such changes. 384 */ 385 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 386 } 387 388 static void 389 vmm_write_unlock(vmm_softc_t *sc) 390 { 391 int maxcpus; 392 393 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 394 vmm_lease_unblock(sc); 395 396 /* 397 * The VM write lock _must_ be released from the same thread it was 398 * acquired in, unlike the read lock. 399 */ 400 VERIFY(rw_write_held(&sc->vmm_rwlock)); 401 rw_exit(&sc->vmm_rwlock); 402 403 /* Unlock all the vCPUs */ 404 maxcpus = vm_get_maxcpus(sc->vmm_vm); 405 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 406 vcpu_unlock_one(sc, vcpu); 407 } 408 } 409 410 static int 411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 412 cred_t *credp, int *rvalp) 413 { 414 int error = 0, vcpu = -1; 415 void *datap = (void *)arg; 416 enum vm_lock_type { 417 LOCK_NONE = 0, 418 LOCK_VCPU, 419 LOCK_READ_HOLD, 420 LOCK_WRITE_HOLD 421 } lock_type = LOCK_NONE; 422 423 /* Acquire any exclusion resources needed for the operation. 
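 * vCPU-targeted ioctls lock only the vCPU named in the request, VM-wide
 * configuration changes take the write lock, read-mostly queries take the
 * read lock, and the remaining commands need no exclusion at all.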
*/ 424 switch (cmd) { 425 case VM_RUN: 426 case VM_GET_REGISTER: 427 case VM_SET_REGISTER: 428 case VM_GET_SEGMENT_DESCRIPTOR: 429 case VM_SET_SEGMENT_DESCRIPTOR: 430 case VM_GET_REGISTER_SET: 431 case VM_SET_REGISTER_SET: 432 case VM_INJECT_EXCEPTION: 433 case VM_GET_CAPABILITY: 434 case VM_SET_CAPABILITY: 435 case VM_PPTDEV_MSI: 436 case VM_PPTDEV_MSIX: 437 case VM_SET_X2APIC_STATE: 438 case VM_GLA2GPA: 439 case VM_GLA2GPA_NOFAULT: 440 case VM_ACTIVATE_CPU: 441 case VM_SET_INTINFO: 442 case VM_GET_INTINFO: 443 case VM_RESTART_INSTRUCTION: 444 case VM_SET_KERNEMU_DEV: 445 case VM_GET_KERNEMU_DEV: 446 case VM_RESET_CPU: 447 case VM_GET_RUN_STATE: 448 case VM_SET_RUN_STATE: 449 case VM_GET_FPU: 450 case VM_SET_FPU: 451 case VM_GET_CPUID: 452 case VM_SET_CPUID: 453 case VM_LEGACY_CPUID: 454 /* 455 * Copy in the ID of the vCPU chosen for this operation. 456 * Since a nefarious caller could update their struct between 457 * this locking and when the rest of the ioctl data is copied 458 * in, it is _critical_ that this local 'vcpu' variable be used 459 * rather than the in-struct one when performing the ioctl. 460 */ 461 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 462 return (EFAULT); 463 } 464 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 465 return (EINVAL); 466 } 467 vcpu_lock_one(sc, vcpu); 468 lock_type = LOCK_VCPU; 469 break; 470 471 case VM_REINIT: 472 case VM_BIND_PPTDEV: 473 case VM_UNBIND_PPTDEV: 474 case VM_MAP_PPTDEV_MMIO: 475 case VM_UNMAP_PPTDEV_MMIO: 476 case VM_ALLOC_MEMSEG: 477 case VM_MMAP_MEMSEG: 478 case VM_MUNMAP_MEMSEG: 479 case VM_WRLOCK_CYCLE: 480 case VM_PMTMR_LOCATE: 481 vmm_write_lock(sc); 482 lock_type = LOCK_WRITE_HOLD; 483 break; 484 485 case VM_GET_MEMSEG: 486 case VM_MMAP_GETNEXT: 487 case VM_LAPIC_IRQ: 488 case VM_INJECT_NMI: 489 case VM_IOAPIC_ASSERT_IRQ: 490 case VM_IOAPIC_DEASSERT_IRQ: 491 case VM_IOAPIC_PULSE_IRQ: 492 case VM_LAPIC_MSI: 493 case VM_LAPIC_LOCAL_IRQ: 494 case VM_GET_X2APIC_STATE: 495 case VM_RTC_READ: 496 case VM_RTC_WRITE: 497 case VM_RTC_SETTIME: 498 case VM_RTC_GETTIME: 499 case VM_PPTDEV_DISABLE_MSIX: 500 case VM_DEVMEM_GETOFFSET: 501 case VM_TRACK_DIRTY_PAGES: 502 vmm_read_lock(sc); 503 lock_type = LOCK_READ_HOLD; 504 break; 505 506 case VM_DATA_READ: 507 case VM_DATA_WRITE: 508 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 509 return (EFAULT); 510 } 511 if (vcpu == -1) { 512 /* Access data for VM-wide devices */ 513 vmm_write_lock(sc); 514 lock_type = LOCK_WRITE_HOLD; 515 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 516 /* Access data associated with a specific vCPU */ 517 vcpu_lock_one(sc, vcpu); 518 lock_type = LOCK_VCPU; 519 } else { 520 return (EINVAL); 521 } 522 break; 523 524 case VM_GET_GPA_PMAP: 525 case VM_IOAPIC_PINCOUNT: 526 case VM_SUSPEND: 527 case VM_DESC_FPU_AREA: 528 case VM_SET_AUTODESTRUCT: 529 default: 530 break; 531 } 532 533 /* Execute the primary logic for the ioctl. */ 534 switch (cmd) { 535 case VM_RUN: { 536 struct vm_entry entry; 537 538 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 539 error = EFAULT; 540 break; 541 } 542 543 if (!(curthread->t_schedflag & TS_VCPU)) 544 smt_mark_as_vcpu(); 545 546 error = vm_run(sc->vmm_vm, vcpu, &entry); 547 548 /* 549 * Unexpected states in vm_run() are expressed through positive 550 * errno-oriented return values. VM states which expect further 551 * processing in userspace (necessary context via exitinfo) are 552 * expressed through negative return values. For the time being 553 * a return value of 0 is not expected from vm_run(). 
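 * Put differently: a negative value signals a guest exit whose vm_exit
 * details are copied out to userspace below, while a positive value is an
 * errno handed straight back to the caller.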
554 */ 555 ASSERT(error != 0); 556 if (error < 0) { 557 const struct vm_exit *vme; 558 void *outp = entry.exit_data; 559 560 error = 0; 561 vme = vm_exitinfo(sc->vmm_vm, vcpu); 562 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 563 error = EFAULT; 564 } 565 } 566 break; 567 } 568 case VM_SUSPEND: { 569 struct vm_suspend vmsuspend; 570 571 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 572 error = EFAULT; 573 break; 574 } 575 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 576 break; 577 } 578 case VM_REINIT: { 579 struct vm_reinit reinit; 580 581 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 582 error = EFAULT; 583 break; 584 } 585 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 586 /* 587 * The VM instance should be free of driver-attached 588 * hooks during the reinitialization process. 589 */ 590 break; 591 } 592 error = vm_reinit(sc->vmm_vm, reinit.flags); 593 (void) vmm_drv_block_hook(sc, B_FALSE); 594 break; 595 } 596 case VM_STAT_DESC: { 597 struct vm_stat_desc statdesc; 598 599 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 600 error = EFAULT; 601 break; 602 } 603 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 604 sizeof (statdesc.desc)); 605 if (error == 0 && 606 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 607 error = EFAULT; 608 break; 609 } 610 break; 611 } 612 case VM_STATS_IOC: { 613 struct vm_stats vmstats; 614 615 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 616 error = EFAULT; 617 break; 618 } 619 hrt2tv(gethrtime(), &vmstats.tv); 620 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 621 nitems(vmstats.statbuf), 622 &vmstats.num_entries, vmstats.statbuf); 623 if (error == 0 && 624 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 625 error = EFAULT; 626 break; 627 } 628 break; 629 } 630 631 case VM_PPTDEV_MSI: { 632 struct vm_pptdev_msi pptmsi; 633 634 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 635 error = EFAULT; 636 break; 637 } 638 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 639 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 640 break; 641 } 642 case VM_PPTDEV_MSIX: { 643 struct vm_pptdev_msix pptmsix; 644 645 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 646 error = EFAULT; 647 break; 648 } 649 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 650 pptmsix.idx, pptmsix.addr, pptmsix.msg, 651 pptmsix.vector_control); 652 break; 653 } 654 case VM_PPTDEV_DISABLE_MSIX: { 655 struct vm_pptdev pptdev; 656 657 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 658 error = EFAULT; 659 break; 660 } 661 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 662 break; 663 } 664 case VM_MAP_PPTDEV_MMIO: { 665 struct vm_pptdev_mmio pptmmio; 666 667 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 668 error = EFAULT; 669 break; 670 } 671 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 672 pptmmio.len, pptmmio.hpa); 673 break; 674 } 675 case VM_UNMAP_PPTDEV_MMIO: { 676 struct vm_pptdev_mmio pptmmio; 677 678 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 679 error = EFAULT; 680 break; 681 } 682 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 683 pptmmio.len); 684 break; 685 } 686 case VM_BIND_PPTDEV: { 687 struct vm_pptdev pptdev; 688 689 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 690 error = EFAULT; 691 break; 692 } 693 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 694 break; 695 } 696 case VM_UNBIND_PPTDEV: { 697 struct vm_pptdev pptdev; 698 699 if (ddi_copyin(datap, &pptdev, 
sizeof (pptdev), md)) { 700 error = EFAULT; 701 break; 702 } 703 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 704 break; 705 } 706 case VM_GET_PPTDEV_LIMITS: { 707 struct vm_pptdev_limits pptlimits; 708 709 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 710 error = EFAULT; 711 break; 712 } 713 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 714 &pptlimits.msi_limit, &pptlimits.msix_limit); 715 if (error == 0 && 716 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 717 error = EFAULT; 718 break; 719 } 720 break; 721 } 722 case VM_INJECT_EXCEPTION: { 723 struct vm_exception vmexc; 724 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 725 error = EFAULT; 726 break; 727 } 728 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 729 vmexc.error_code_valid != 0, vmexc.error_code, 730 vmexc.restart_instruction != 0); 731 break; 732 } 733 case VM_INJECT_NMI: { 734 struct vm_nmi vmnmi; 735 736 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 737 error = EFAULT; 738 break; 739 } 740 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 741 break; 742 } 743 case VM_LAPIC_IRQ: { 744 struct vm_lapic_irq vmirq; 745 746 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 747 error = EFAULT; 748 break; 749 } 750 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 751 break; 752 } 753 case VM_LAPIC_LOCAL_IRQ: { 754 struct vm_lapic_irq vmirq; 755 756 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 757 error = EFAULT; 758 break; 759 } 760 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 761 vmirq.vector); 762 break; 763 } 764 case VM_LAPIC_MSI: { 765 struct vm_lapic_msi vmmsi; 766 767 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 768 error = EFAULT; 769 break; 770 } 771 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 772 break; 773 } 774 775 case VM_IOAPIC_ASSERT_IRQ: { 776 struct vm_ioapic_irq ioapic_irq; 777 778 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 779 error = EFAULT; 780 break; 781 } 782 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 783 break; 784 } 785 case VM_IOAPIC_DEASSERT_IRQ: { 786 struct vm_ioapic_irq ioapic_irq; 787 788 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 789 error = EFAULT; 790 break; 791 } 792 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 793 break; 794 } 795 case VM_IOAPIC_PULSE_IRQ: { 796 struct vm_ioapic_irq ioapic_irq; 797 798 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 799 error = EFAULT; 800 break; 801 } 802 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 803 break; 804 } 805 case VM_IOAPIC_PINCOUNT: { 806 int pincount; 807 808 pincount = vioapic_pincount(sc->vmm_vm); 809 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 810 error = EFAULT; 811 break; 812 } 813 break; 814 } 815 case VM_DESC_FPU_AREA: { 816 struct vm_fpu_desc desc; 817 void *buf = NULL; 818 819 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 820 error = EFAULT; 821 break; 822 } 823 if (desc.vfd_num_entries > 64) { 824 error = EINVAL; 825 break; 826 } 827 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 828 desc.vfd_num_entries; 829 if (buf_sz != 0) { 830 buf = kmem_zalloc(buf_sz, KM_SLEEP); 831 } 832 833 /* 834 * For now, we are depending on vm_fpu_desc_entry and 835 * hma_xsave_state_desc_t having the same format. 
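 * The CTASSERT below checks that assumption (by size, at least) at
 * compile time.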
836 */ 837 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 838 sizeof (hma_xsave_state_desc_t)); 839 840 size_t req_size; 841 const uint_t max_entries = hma_fpu_describe_xsave_state( 842 (hma_xsave_state_desc_t *)buf, 843 desc.vfd_num_entries, 844 &req_size); 845 846 desc.vfd_req_size = req_size; 847 desc.vfd_num_entries = max_entries; 848 if (buf_sz != 0) { 849 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 850 error = EFAULT; 851 } 852 kmem_free(buf, buf_sz); 853 } 854 855 if (error == 0) { 856 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 857 error = EFAULT; 858 } 859 } 860 break; 861 } 862 case VM_SET_AUTODESTRUCT: { 863 /* 864 * Since this has to do with controlling the lifetime of the 865 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 866 * than the vcpu-centric or rwlock exclusion mechanisms. 867 */ 868 mutex_enter(&vmm_mtx); 869 sc->vmm_autodestruct = (arg != 0); 870 mutex_exit(&vmm_mtx); 871 break; 872 } 873 874 case VM_ISA_ASSERT_IRQ: { 875 struct vm_isa_irq isa_irq; 876 877 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 878 error = EFAULT; 879 break; 880 } 881 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 882 if (error == 0 && isa_irq.ioapic_irq != -1) { 883 error = vioapic_assert_irq(sc->vmm_vm, 884 isa_irq.ioapic_irq); 885 } 886 break; 887 } 888 case VM_ISA_DEASSERT_IRQ: { 889 struct vm_isa_irq isa_irq; 890 891 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 892 error = EFAULT; 893 break; 894 } 895 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 896 if (error == 0 && isa_irq.ioapic_irq != -1) { 897 error = vioapic_deassert_irq(sc->vmm_vm, 898 isa_irq.ioapic_irq); 899 } 900 break; 901 } 902 case VM_ISA_PULSE_IRQ: { 903 struct vm_isa_irq isa_irq; 904 905 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 906 error = EFAULT; 907 break; 908 } 909 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 910 if (error == 0 && isa_irq.ioapic_irq != -1) { 911 error = vioapic_pulse_irq(sc->vmm_vm, 912 isa_irq.ioapic_irq); 913 } 914 break; 915 } 916 case VM_ISA_SET_IRQ_TRIGGER: { 917 struct vm_isa_irq_trigger isa_irq_trigger; 918 919 if (ddi_copyin(datap, &isa_irq_trigger, 920 sizeof (isa_irq_trigger), md)) { 921 error = EFAULT; 922 break; 923 } 924 error = vatpic_set_irq_trigger(sc->vmm_vm, 925 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 926 break; 927 } 928 929 case VM_MMAP_GETNEXT: { 930 struct vm_memmap mm; 931 932 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 933 error = EFAULT; 934 break; 935 } 936 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 937 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 938 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 939 error = EFAULT; 940 break; 941 } 942 break; 943 } 944 case VM_MMAP_MEMSEG: { 945 struct vm_memmap mm; 946 947 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 948 error = EFAULT; 949 break; 950 } 951 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 952 mm.len, mm.prot, mm.flags); 953 break; 954 } 955 case VM_MUNMAP_MEMSEG: { 956 struct vm_munmap mu; 957 958 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 959 error = EFAULT; 960 break; 961 } 962 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 963 break; 964 } 965 case VM_ALLOC_MEMSEG: { 966 struct vm_memseg vmseg; 967 968 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 969 error = EFAULT; 970 break; 971 } 972 error = vmmdev_alloc_memseg(sc, &vmseg); 973 break; 974 } 975 case VM_GET_MEMSEG: { 976 struct vm_memseg vmseg; 977 978 if (ddi_copyin(datap, &vmseg, 
sizeof (vmseg), md)) {
979 error = EFAULT;
980 break;
981 }
982 error = vmmdev_get_memseg(sc, &vmseg);
983 if (error == 0 &&
984 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
985 error = EFAULT;
986 break;
987 }
988 break;
989 }
990 case VM_GET_REGISTER: {
991 struct vm_register vmreg;
992
993 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
994 error = EFAULT;
995 break;
996 }
997 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
998 &vmreg.regval);
999 if (error == 0 &&
1000 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1001 error = EFAULT;
1002 break;
1003 }
1004 break;
1005 }
1006 case VM_SET_REGISTER: {
1007 struct vm_register vmreg;
1008
1009 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1010 error = EFAULT;
1011 break;
1012 }
1013 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1014 vmreg.regval);
1015 break;
1016 }
1017 case VM_SET_SEGMENT_DESCRIPTOR: {
1018 struct vm_seg_desc vmsegd;
1019
1020 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1021 error = EFAULT;
1022 break;
1023 }
1024 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1025 &vmsegd.desc);
1026 break;
1027 }
1028 case VM_GET_SEGMENT_DESCRIPTOR: {
1029 struct vm_seg_desc vmsegd;
1030
1031 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1032 error = EFAULT;
1033 break;
1034 }
1035 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1036 &vmsegd.desc);
1037 if (error == 0 &&
1038 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1039 error = EFAULT;
1040 break;
1041 }
1042 break;
1043 }
1044 case VM_GET_REGISTER_SET: {
1045 struct vm_register_set vrs;
1046 int regnums[VM_REG_LAST];
1047 uint64_t regvals[VM_REG_LAST];
1048
1049 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1050 error = EFAULT;
1051 break;
1052 }
1053 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1054 error = EINVAL;
1055 break;
1056 }
1057 if (ddi_copyin(vrs.regnums, regnums,
1058 sizeof (int) * vrs.count, md)) {
1059 error = EFAULT;
1060 break;
1061 }
1062
1063 error = 0;
1064 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1065 if (regnums[i] < 0) {
1066 error = EINVAL;
1067 break;
1068 }
1069 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1070 &regvals[i]);
1071 }
1072 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1073 sizeof (uint64_t) * vrs.count, md)) {
1074 error = EFAULT;
1075 }
1076 break;
1077 }
1078 case VM_SET_REGISTER_SET: {
1079 struct vm_register_set vrs;
1080 int regnums[VM_REG_LAST];
1081 uint64_t regvals[VM_REG_LAST];
1082
1083 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1084 error = EFAULT;
1085 break;
1086 }
1087 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1088 error = EINVAL;
1089 break;
1090 }
1091 if (ddi_copyin(vrs.regnums, regnums,
1092 sizeof (int) * vrs.count, md)) {
1093 error = EFAULT;
1094 break;
1095 }
1096 if (ddi_copyin(vrs.regvals, regvals,
1097 sizeof (uint64_t) * vrs.count, md)) {
1098 error = EFAULT;
1099 break;
1100 }
1101
1102 error = 0;
1103 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1104 /*
1105 * Setting registers in a set is not atomic, since a
1106 * failure in the middle of the set will cause a
1107 * bail-out and inconsistent register state. Callers
1108 * should be wary of this.
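 * (For example, a failure on the Nth register leaves registers 0 .. N-1
 * already updated.)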
1109 */ 1110 if (regnums[i] < 0) { 1111 error = EINVAL; 1112 break; 1113 } 1114 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1115 regvals[i]); 1116 } 1117 break; 1118 } 1119 case VM_RESET_CPU: { 1120 struct vm_vcpu_reset vvr; 1121 1122 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1123 error = EFAULT; 1124 break; 1125 } 1126 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1127 error = EINVAL; 1128 } 1129 1130 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1131 break; 1132 } 1133 case VM_GET_RUN_STATE: { 1134 struct vm_run_state vrs; 1135 1136 bzero(&vrs, sizeof (vrs)); 1137 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1138 &vrs.sipi_vector); 1139 if (error == 0) { 1140 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1141 error = EFAULT; 1142 break; 1143 } 1144 } 1145 break; 1146 } 1147 case VM_SET_RUN_STATE: { 1148 struct vm_run_state vrs; 1149 1150 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1151 error = EFAULT; 1152 break; 1153 } 1154 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1155 vrs.sipi_vector); 1156 break; 1157 } 1158 case VM_GET_FPU: { 1159 struct vm_fpu_state req; 1160 const size_t max_len = (PAGESIZE * 2); 1161 void *kbuf; 1162 1163 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1164 error = EFAULT; 1165 break; 1166 } 1167 if (req.len > max_len || req.len == 0) { 1168 error = EINVAL; 1169 break; 1170 } 1171 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1172 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1173 if (error == 0) { 1174 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1175 error = EFAULT; 1176 } 1177 } 1178 kmem_free(kbuf, req.len); 1179 break; 1180 } 1181 case VM_SET_FPU: { 1182 struct vm_fpu_state req; 1183 const size_t max_len = (PAGESIZE * 2); 1184 void *kbuf; 1185 1186 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1187 error = EFAULT; 1188 break; 1189 } 1190 if (req.len > max_len || req.len == 0) { 1191 error = EINVAL; 1192 break; 1193 } 1194 kbuf = kmem_alloc(req.len, KM_SLEEP); 1195 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1196 error = EFAULT; 1197 } else { 1198 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1199 } 1200 kmem_free(kbuf, req.len); 1201 break; 1202 } 1203 case VM_GET_CPUID: { 1204 struct vm_vcpu_cpuid_config cfg; 1205 struct vcpu_cpuid_entry *entries = NULL; 1206 1207 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1208 error = EFAULT; 1209 break; 1210 } 1211 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1212 error = EINVAL; 1213 break; 1214 } 1215 1216 const size_t entries_size = 1217 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1218 if (entries_size != 0) { 1219 entries = kmem_zalloc(entries_size, KM_SLEEP); 1220 } 1221 1222 vcpu_cpuid_config_t vm_cfg = { 1223 .vcc_nent = cfg.vvcc_nent, 1224 .vcc_entries = entries, 1225 }; 1226 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1227 1228 /* 1229 * Only attempt to copy out the resultant entries if we were 1230 * able to query them from the instance. The flags and number 1231 * of entries are emitted regardless. 
1232 */ 1233 cfg.vvcc_flags = vm_cfg.vcc_flags; 1234 cfg.vvcc_nent = vm_cfg.vcc_nent; 1235 if (entries != NULL) { 1236 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1237 entries_size, md) != 0) { 1238 error = EFAULT; 1239 } 1240 1241 kmem_free(entries, entries_size); 1242 } 1243 1244 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1245 error = EFAULT; 1246 } 1247 break; 1248 } 1249 case VM_SET_CPUID: { 1250 struct vm_vcpu_cpuid_config cfg; 1251 struct vcpu_cpuid_entry *entries = NULL; 1252 size_t entries_size = 0; 1253 1254 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1255 error = EFAULT; 1256 break; 1257 } 1258 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1259 error = EFBIG; 1260 break; 1261 } 1262 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1263 /* 1264 * If we are being instructed to use "legacy" handling, 1265 * then no entries should be provided, since the static 1266 * in-kernel masking will be used. 1267 */ 1268 if (cfg.vvcc_nent != 0) { 1269 error = EINVAL; 1270 break; 1271 } 1272 } else if (cfg.vvcc_nent != 0) { 1273 entries_size = 1274 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1275 entries = kmem_alloc(entries_size, KM_SLEEP); 1276 1277 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1278 md) != 0) { 1279 error = EFAULT; 1280 kmem_free(entries, entries_size); 1281 break; 1282 } 1283 } 1284 1285 vcpu_cpuid_config_t vm_cfg = { 1286 .vcc_flags = cfg.vvcc_flags, 1287 .vcc_nent = cfg.vvcc_nent, 1288 .vcc_entries = entries, 1289 }; 1290 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1291 1292 if (entries != NULL) { 1293 kmem_free(entries, entries_size); 1294 } 1295 break; 1296 } 1297 case VM_LEGACY_CPUID: { 1298 struct vm_legacy_cpuid vlc; 1299 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1300 error = EFAULT; 1301 break; 1302 } 1303 vlc.vlc_vcpuid = vcpu; 1304 1305 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1306 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1307 1308 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1309 error = EFAULT; 1310 break; 1311 } 1312 break; 1313 } 1314 1315 case VM_SET_KERNEMU_DEV: 1316 case VM_GET_KERNEMU_DEV: { 1317 struct vm_readwrite_kernemu_device kemu; 1318 size_t size = 0; 1319 1320 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1321 error = EFAULT; 1322 break; 1323 } 1324 1325 if (kemu.access_width > 3) { 1326 error = EINVAL; 1327 break; 1328 } 1329 size = (1 << kemu.access_width); 1330 ASSERT(size >= 1 && size <= 8); 1331 1332 if (cmd == VM_SET_KERNEMU_DEV) { 1333 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1334 kemu.gpa, kemu.value, size); 1335 } else { 1336 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1337 kemu.gpa, &kemu.value, size); 1338 } 1339 1340 if (error == 0) { 1341 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1342 error = EFAULT; 1343 break; 1344 } 1345 } 1346 break; 1347 } 1348 1349 case VM_GET_CAPABILITY: { 1350 struct vm_capability vmcap; 1351 1352 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1353 error = EFAULT; 1354 break; 1355 } 1356 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1357 &vmcap.capval); 1358 if (error == 0 && 1359 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1360 error = EFAULT; 1361 break; 1362 } 1363 break; 1364 } 1365 case VM_SET_CAPABILITY: { 1366 struct vm_capability vmcap; 1367 1368 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1369 error = EFAULT; 1370 break; 1371 } 1372 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1373 vmcap.capval); 1374 break; 1375 } 1376 case VM_SET_X2APIC_STATE: { 
1377 struct vm_x2apic x2apic; 1378 1379 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1380 error = EFAULT; 1381 break; 1382 } 1383 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1384 break; 1385 } 1386 case VM_GET_X2APIC_STATE: { 1387 struct vm_x2apic x2apic; 1388 1389 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1390 error = EFAULT; 1391 break; 1392 } 1393 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1394 &x2apic.state); 1395 if (error == 0 && 1396 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1397 error = EFAULT; 1398 break; 1399 } 1400 break; 1401 } 1402 case VM_GET_GPA_PMAP: { 1403 /* 1404 * Until there is a necessity to leak EPT/RVI PTE values to 1405 * userspace, this will remain unimplemented 1406 */ 1407 error = EINVAL; 1408 break; 1409 } 1410 case VM_GET_HPET_CAPABILITIES: { 1411 struct vm_hpet_cap hpetcap; 1412 1413 error = vhpet_getcap(&hpetcap); 1414 if (error == 0 && 1415 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1416 error = EFAULT; 1417 break; 1418 } 1419 break; 1420 } 1421 case VM_GLA2GPA: { 1422 struct vm_gla2gpa gg; 1423 1424 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1425 error = EFAULT; 1426 break; 1427 } 1428 gg.vcpuid = vcpu; 1429 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1430 gg.prot, &gg.gpa, &gg.fault); 1431 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1432 error = EFAULT; 1433 break; 1434 } 1435 break; 1436 } 1437 case VM_GLA2GPA_NOFAULT: { 1438 struct vm_gla2gpa gg; 1439 1440 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1441 error = EFAULT; 1442 break; 1443 } 1444 gg.vcpuid = vcpu; 1445 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1446 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1447 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1448 error = EFAULT; 1449 break; 1450 } 1451 break; 1452 } 1453 1454 case VM_ACTIVATE_CPU: 1455 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1456 break; 1457 1458 case VM_SUSPEND_CPU: 1459 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1460 error = EFAULT; 1461 } else { 1462 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1463 } 1464 break; 1465 1466 case VM_RESUME_CPU: 1467 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1468 error = EFAULT; 1469 } else { 1470 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1471 } 1472 break; 1473 1474 case VM_GET_CPUS: { 1475 struct vm_cpuset vm_cpuset; 1476 cpuset_t tempset; 1477 void *srcp = &tempset; 1478 int size; 1479 1480 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1481 error = EFAULT; 1482 break; 1483 } 1484 1485 /* Be more generous about sizing since our cpuset_t is large. */ 1486 size = vm_cpuset.cpusetsize; 1487 if (size <= 0 || size > sizeof (cpuset_t)) { 1488 error = ERANGE; 1489 } 1490 /* 1491 * If they want a ulong_t or less, make sure they receive the 1492 * low bits with all the useful information. 
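 * (A caller passing a cpusetsize of sizeof (ulong_t) or smaller is served
 * from tempset.cpub[0] rather than from the full cpuset_t.)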
1493 */ 1494 if (size <= sizeof (tempset.cpub[0])) { 1495 srcp = &tempset.cpub[0]; 1496 } 1497 1498 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1499 tempset = vm_active_cpus(sc->vmm_vm); 1500 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1501 tempset = vm_suspended_cpus(sc->vmm_vm); 1502 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1503 tempset = vm_debug_cpus(sc->vmm_vm); 1504 } else { 1505 error = EINVAL; 1506 } 1507 1508 ASSERT(size > 0 && size <= sizeof (tempset)); 1509 if (error == 0 && 1510 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1511 error = EFAULT; 1512 break; 1513 } 1514 break; 1515 } 1516 case VM_SET_INTINFO: { 1517 struct vm_intinfo vmii; 1518 1519 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1520 error = EFAULT; 1521 break; 1522 } 1523 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1524 break; 1525 } 1526 case VM_GET_INTINFO: { 1527 struct vm_intinfo vmii; 1528 1529 vmii.vcpuid = vcpu; 1530 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1531 &vmii.info2); 1532 if (error == 0 && 1533 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1534 error = EFAULT; 1535 break; 1536 } 1537 break; 1538 } 1539 case VM_RTC_WRITE: { 1540 struct vm_rtc_data rtcdata; 1541 1542 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1543 error = EFAULT; 1544 break; 1545 } 1546 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1547 rtcdata.value); 1548 break; 1549 } 1550 case VM_RTC_READ: { 1551 struct vm_rtc_data rtcdata; 1552 1553 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1554 error = EFAULT; 1555 break; 1556 } 1557 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1558 &rtcdata.value); 1559 if (error == 0 && 1560 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1561 error = EFAULT; 1562 break; 1563 } 1564 break; 1565 } 1566 case VM_RTC_SETTIME: { 1567 struct vm_rtc_time rtctime; 1568 1569 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1570 error = EFAULT; 1571 break; 1572 } 1573 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1574 break; 1575 } 1576 case VM_RTC_GETTIME: { 1577 struct vm_rtc_time rtctime; 1578 1579 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1580 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1581 error = EFAULT; 1582 break; 1583 } 1584 break; 1585 } 1586 1587 case VM_PMTMR_LOCATE: { 1588 uint16_t port = arg; 1589 error = vpmtmr_set_location(sc->vmm_vm, port); 1590 break; 1591 } 1592 1593 case VM_RESTART_INSTRUCTION: 1594 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1595 break; 1596 1597 case VM_SET_TOPOLOGY: { 1598 struct vm_cpu_topology topo; 1599 1600 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1601 error = EFAULT; 1602 break; 1603 } 1604 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1605 topo.threads, topo.maxcpus); 1606 break; 1607 } 1608 case VM_GET_TOPOLOGY: { 1609 struct vm_cpu_topology topo; 1610 1611 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1612 &topo.threads, &topo.maxcpus); 1613 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1614 error = EFAULT; 1615 break; 1616 } 1617 break; 1618 } 1619 case VM_DEVMEM_GETOFFSET: { 1620 struct vm_devmem_offset vdo; 1621 vmm_devmem_entry_t *de; 1622 1623 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1624 error = EFAULT; 1625 break; 1626 } 1627 1628 de = vmmdev_devmem_find(sc, vdo.segid); 1629 if (de != NULL) { 1630 vdo.offset = de->vde_off; 1631 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1632 error = EFAULT; 1633 } 1634 } else { 1635 error = ENOENT; 1636 } 1637 break; 1638 } 1639 case 
VM_TRACK_DIRTY_PAGES: { 1640 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1641 struct vmm_dirty_tracker tracker; 1642 uint8_t *bitmap; 1643 size_t len; 1644 1645 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1646 error = EFAULT; 1647 break; 1648 } 1649 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1650 error = EINVAL; 1651 break; 1652 } 1653 if (tracker.vdt_len == 0) { 1654 break; 1655 } 1656 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1657 error = EINVAL; 1658 break; 1659 } 1660 if (tracker.vdt_len > max_track_region_len) { 1661 error = EINVAL; 1662 break; 1663 } 1664 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1665 bitmap = kmem_zalloc(len, KM_SLEEP); 1666 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1667 tracker.vdt_len, bitmap); 1668 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1669 error = EFAULT; 1670 } 1671 kmem_free(bitmap, len); 1672 1673 break; 1674 } 1675 case VM_WRLOCK_CYCLE: { 1676 /* 1677 * Present a test mechanism to acquire/release the write lock 1678 * on the VM without any other effects. 1679 */ 1680 break; 1681 } 1682 case VM_DATA_READ: { 1683 struct vm_data_xfer vdx; 1684 1685 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1686 error = EFAULT; 1687 break; 1688 } 1689 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1690 error = EINVAL; 1691 break; 1692 } 1693 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1694 error = EFBIG; 1695 break; 1696 } 1697 1698 const size_t len = vdx.vdx_len; 1699 void *buf = NULL; 1700 if (len != 0) { 1701 buf = kmem_alloc(len, KM_SLEEP); 1702 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1703 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1704 kmem_free(buf, len); 1705 error = EFAULT; 1706 break; 1707 } else { 1708 bzero(buf, len); 1709 } 1710 } 1711 1712 vdx.vdx_result_len = 0; 1713 vmm_data_req_t req = { 1714 .vdr_class = vdx.vdx_class, 1715 .vdr_version = vdx.vdx_version, 1716 .vdr_flags = vdx.vdx_flags, 1717 .vdr_len = len, 1718 .vdr_data = buf, 1719 .vdr_result_len = &vdx.vdx_result_len, 1720 }; 1721 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1722 1723 if (error == 0 && buf != NULL) { 1724 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1725 error = EFAULT; 1726 } 1727 } 1728 1729 /* 1730 * Copy out the transfer request so that the value of 1731 * vdx_result_len can be made available, regardless of any 1732 * error(s) which may have occurred. 1733 */ 1734 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1735 error = (error != 0) ? 
error : EFAULT; 1736 } 1737 1738 if (buf != NULL) { 1739 kmem_free(buf, len); 1740 } 1741 break; 1742 } 1743 case VM_DATA_WRITE: { 1744 struct vm_data_xfer vdx; 1745 1746 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1747 error = EFAULT; 1748 break; 1749 } 1750 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1751 error = EINVAL; 1752 break; 1753 } 1754 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1755 error = EFBIG; 1756 break; 1757 } 1758 1759 const size_t len = vdx.vdx_len; 1760 void *buf = NULL; 1761 if (len != 0) { 1762 buf = kmem_alloc(len, KM_SLEEP); 1763 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1764 kmem_free(buf, len); 1765 error = EFAULT; 1766 break; 1767 } 1768 } 1769 1770 vdx.vdx_result_len = 0; 1771 vmm_data_req_t req = { 1772 .vdr_class = vdx.vdx_class, 1773 .vdr_version = vdx.vdx_version, 1774 .vdr_flags = vdx.vdx_flags, 1775 .vdr_len = len, 1776 .vdr_data = buf, 1777 .vdr_result_len = &vdx.vdx_result_len, 1778 }; 1779 if (vmm_allow_state_writes == 0) { 1780 /* XXX: Play it safe for now */ 1781 error = EPERM; 1782 } else { 1783 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1784 &req); 1785 } 1786 1787 if (error == 0 && buf != NULL && 1788 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1789 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1790 error = EFAULT; 1791 } 1792 } 1793 1794 /* 1795 * Copy out the transfer request so that the value of 1796 * vdx_result_len can be made available, regardless of any 1797 * error(s) which may have occurred. 1798 */ 1799 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1800 error = (error != 0) ? error : EFAULT; 1801 } 1802 1803 if (buf != NULL) { 1804 kmem_free(buf, len); 1805 } 1806 break; 1807 } 1808 1809 default: 1810 error = ENOTTY; 1811 break; 1812 } 1813 1814 /* Release exclusion resources */ 1815 switch (lock_type) { 1816 case LOCK_NONE: 1817 break; 1818 case LOCK_VCPU: 1819 vcpu_unlock_one(sc, vcpu); 1820 break; 1821 case LOCK_READ_HOLD: 1822 vmm_read_unlock(sc); 1823 break; 1824 case LOCK_WRITE_HOLD: 1825 vmm_write_unlock(sc); 1826 break; 1827 default: 1828 panic("unexpected lock type"); 1829 break; 1830 } 1831 1832 return (error); 1833 } 1834 1835 static vmm_softc_t * 1836 vmm_lookup(const char *name) 1837 { 1838 list_t *vml = &vmm_list; 1839 vmm_softc_t *sc; 1840 1841 ASSERT(MUTEX_HELD(&vmm_mtx)); 1842 1843 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1844 if (strcmp(sc->vmm_name, name) == 0) { 1845 break; 1846 } 1847 } 1848 1849 return (sc); 1850 } 1851 1852 /* 1853 * Acquire an HMA registration if not already held. 1854 */ 1855 static boolean_t 1856 vmm_hma_acquire(void) 1857 { 1858 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1859 1860 mutex_enter(&vmmdev_mtx); 1861 1862 if (vmmdev_hma_reg == NULL) { 1863 VERIFY3U(vmmdev_hma_ref, ==, 0); 1864 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1865 if (vmmdev_hma_reg == NULL) { 1866 cmn_err(CE_WARN, "%s HMA registration failed.", 1867 vmmdev_hvm_name); 1868 mutex_exit(&vmmdev_mtx); 1869 return (B_FALSE); 1870 } 1871 } 1872 1873 vmmdev_hma_ref++; 1874 1875 mutex_exit(&vmmdev_mtx); 1876 1877 return (B_TRUE); 1878 } 1879 1880 /* 1881 * Release the HMA registration if held and there are no remaining VMs. 
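 * Callers must pair this with a successful vmm_hma_acquire(); the
 * registration itself is only torn down once vmmdev_hma_ref drops to zero.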
1882 */
1883 static void
1884 vmm_hma_release(void)
1885 {
1886 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1887
1888 mutex_enter(&vmmdev_mtx);
1889
1890 VERIFY3U(vmmdev_hma_ref, !=, 0);
1891
1892 vmmdev_hma_ref--;
1893
1894 if (vmmdev_hma_ref == 0) {
1895 VERIFY(vmmdev_hma_reg != NULL);
1896 hma_unregister(vmmdev_hma_reg);
1897 vmmdev_hma_reg = NULL;
1898 }
1899 mutex_exit(&vmmdev_mtx);
1900 }
1901
1902 static int
1903 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1904 {
1905 vmm_softc_t *sc = NULL;
1906 minor_t minor;
1907 int error = ENOMEM;
1908 size_t len;
1909 const char *name = req->name;
1910
1911 len = strnlen(name, VM_MAX_NAMELEN);
1912 if (len == 0) {
1913 return (EINVAL);
1914 }
1915 if (len >= VM_MAX_NAMELEN) {
1916 return (ENAMETOOLONG);
1917 }
1918 if (strchr(name, '/') != NULL) {
1919 return (EINVAL);
1920 }
1921
1922 if (!vmm_hma_acquire())
1923 return (ENXIO);
1924
1925 mutex_enter(&vmm_mtx);
1926
1927 /* Look for duplicate names */
1928 if (vmm_lookup(name) != NULL) {
1929 mutex_exit(&vmm_mtx);
1930 vmm_hma_release();
1931 return (EEXIST);
1932 }
1933
1934 /* Allow only one instance per non-global zone. */
1935 if (!INGLOBALZONE(curproc)) {
1936 for (sc = list_head(&vmm_list); sc != NULL;
1937 sc = list_next(&vmm_list, sc)) {
1938 if (sc->vmm_zone == curzone) {
1939 mutex_exit(&vmm_mtx);
1940 vmm_hma_release();
1941 return (EINVAL);
1942 }
1943 }
1944 }
1945
1946 minor = id_alloc(vmm_minors);
1947 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1948 goto fail;
1949 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1950 ddi_soft_state_free(vmm_statep, minor);
1951 goto fail;
1952 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1953 DDI_PSEUDO, 0) != DDI_SUCCESS) {
1954 goto fail;
1955 }
1956
1957 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1958 goto fail;
1959 }
1960
1961 error = vm_create(req->flags, &sc->vmm_vm);
1962 if (error == 0) {
1963 /* Complete VM initialization and report success. */
1964 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1965 sc->vmm_minor = minor;
1966 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1967 offsetof(vmm_devmem_entry_t, vde_node));
1968
1969 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1970 offsetof(vmm_hold_t, vmh_node));
1971 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1972
1973 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1974 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1975 offsetof(vmm_lease_t, vml_node));
1976 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1977 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1978
1979 sc->vmm_zone = crgetzone(cr);
1980 zone_hold(sc->vmm_zone);
1981 vmm_zsd_add_vm(sc);
1982 vmm_kstat_init(sc);
1983
1984 list_insert_tail(&vmm_list, sc);
1985 mutex_exit(&vmm_mtx);
1986 return (0);
1987 }
1988
1989 vmm_kstat_fini(sc);
1990 ddi_remove_minor_node(vmmdev_dip, name);
1991 fail:
1992 id_free(vmm_minors, minor);
1993 if (sc != NULL) {
1994 ddi_soft_state_free(vmm_statep, minor);
1995 }
1996 mutex_exit(&vmm_mtx);
1997 vmm_hma_release();
1998
1999 return (error);
2000 }
2001
2002 /*
2003 * Bhyve 'Driver' Interface
2004 *
2005 * While many devices are emulated in the bhyve userspace process, there are
2006 * others with performance constraints which require that they run mostly or
2007 * entirely in-kernel. For those not integrated directly into bhyve, an API is
2008 * needed so they can query/manipulate the portions of VM state needed to
2009 * fulfill their purpose.
2010 * 2011 * This includes: 2012 * - Translating guest-physical addresses to host-virtual pointers 2013 * - Injecting MSIs 2014 * - Hooking IO port addresses 2015 * 2016 * The vmm_drv interface exists to provide that functionality to its consumers. 2017 * (At this time, 'viona' is the only user) 2018 */ 2019 int 2020 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2021 { 2022 vnode_t *vp = fp->f_vnode; 2023 const dev_t dev = vp->v_rdev; 2024 vmm_softc_t *sc; 2025 vmm_hold_t *hold; 2026 int err = 0; 2027 2028 if (vp->v_type != VCHR) { 2029 return (ENXIO); 2030 } 2031 const major_t major = getmajor(dev); 2032 const minor_t minor = getminor(dev); 2033 2034 mutex_enter(&vmmdev_mtx); 2035 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2036 mutex_exit(&vmmdev_mtx); 2037 return (ENOENT); 2038 } 2039 mutex_enter(&vmm_mtx); 2040 mutex_exit(&vmmdev_mtx); 2041 2042 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2043 err = ENOENT; 2044 goto out; 2045 } 2046 /* XXXJOY: check cred permissions against instance */ 2047 2048 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { 2049 err = EBUSY; 2050 goto out; 2051 } 2052 2053 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2054 hold->vmh_sc = sc; 2055 hold->vmh_release_req = B_FALSE; 2056 2057 list_insert_tail(&sc->vmm_holds, hold); 2058 sc->vmm_flags |= VMM_HELD; 2059 *holdp = hold; 2060 2061 out: 2062 mutex_exit(&vmm_mtx); 2063 return (err); 2064 } 2065 2066 void 2067 vmm_drv_rele(vmm_hold_t *hold) 2068 { 2069 vmm_softc_t *sc; 2070 boolean_t hma_release = B_FALSE; 2071 2072 ASSERT(hold != NULL); 2073 ASSERT(hold->vmh_sc != NULL); 2074 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2075 2076 mutex_enter(&vmm_mtx); 2077 sc = hold->vmh_sc; 2078 list_remove(&sc->vmm_holds, hold); 2079 if (list_is_empty(&sc->vmm_holds)) { 2080 sc->vmm_flags &= ~VMM_HELD; 2081 cv_broadcast(&sc->vmm_cv); 2082 2083 /* 2084 * If pending hold(s) had prevented an auto-destruct of the 2085 * instance when it was closed, finish that clean-up now. 
2086 */ 2087 if (sc->vmm_autodestruct && !sc->vmm_is_open) { 2088 int err = vmm_destroy_locked(sc, 2089 VDO_NO_PURGE_WAIT, &hma_release); 2090 2091 VERIFY0(err); 2092 VERIFY(hma_release); 2093 } 2094 } 2095 mutex_exit(&vmm_mtx); 2096 kmem_free(hold, sizeof (*hold)); 2097 2098 if (hma_release) { 2099 vmm_hma_release(); 2100 } 2101 } 2102 2103 boolean_t 2104 vmm_drv_release_reqd(vmm_hold_t *hold) 2105 { 2106 ASSERT(hold != NULL); 2107 2108 return (hold->vmh_release_req); 2109 } 2110 2111 vmm_lease_t * 2112 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2113 { 2114 vmm_softc_t *sc = hold->vmh_sc; 2115 vmm_lease_t *lease; 2116 2117 ASSERT3P(expiref, !=, NULL); 2118 2119 if (hold->vmh_release_req) { 2120 return (NULL); 2121 } 2122 2123 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2124 list_link_init(&lease->vml_node); 2125 lease->vml_expire_func = expiref; 2126 lease->vml_expire_arg = arg; 2127 lease->vml_expired = B_FALSE; 2128 lease->vml_break_deferred = B_FALSE; 2129 lease->vml_hold = hold; 2130 /* cache the VM pointer for one less pointer chase */ 2131 lease->vml_vm = sc->vmm_vm; 2132 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2133 2134 mutex_enter(&sc->vmm_lease_lock); 2135 while (sc->vmm_lease_blocker != 0) { 2136 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2137 } 2138 list_insert_tail(&sc->vmm_lease_list, lease); 2139 vmm_read_lock(sc); 2140 mutex_exit(&sc->vmm_lease_lock); 2141 2142 return (lease); 2143 } 2144 2145 static void 2146 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2147 { 2148 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2149 2150 list_remove(&sc->vmm_lease_list, lease); 2151 vmm_read_unlock(sc); 2152 vmc_destroy(lease->vml_vmclient); 2153 kmem_free(lease, sizeof (*lease)); 2154 } 2155 2156 static void 2157 vmm_lease_block(vmm_softc_t *sc) 2158 { 2159 mutex_enter(&sc->vmm_lease_lock); 2160 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2161 sc->vmm_lease_blocker++; 2162 if (sc->vmm_lease_blocker == 1) { 2163 list_t *list = &sc->vmm_lease_list; 2164 vmm_lease_t *lease = list_head(list); 2165 2166 while (lease != NULL) { 2167 void *arg = lease->vml_expire_arg; 2168 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2169 boolean_t sync_break = B_FALSE; 2170 2171 /* 2172 * Since the lease expiration notification may 2173 * need to take locks which would deadlock with 2174 * vmm_lease_lock, drop it across the call. 2175 * 2176 * We are the only one allowed to manipulate 2177 * vmm_lease_list right now, so it is safe to 2178 * continue iterating through it after 2179 * reacquiring the lock. 2180 */ 2181 lease->vml_expired = B_TRUE; 2182 mutex_exit(&sc->vmm_lease_lock); 2183 sync_break = expiref(arg); 2184 mutex_enter(&sc->vmm_lease_lock); 2185 2186 if (sync_break) { 2187 vmm_lease_t *next; 2188 2189 /* 2190 * These leases which are synchronously broken 2191 * result in vmm_read_unlock() calls from a 2192 * different thread than the corresponding 2193 * vmm_read_lock(). This is acceptable, given 2194 * that the rwlock underpinning the whole 2195 * mechanism tolerates the behavior. This 2196 * flexibility is _only_ afforded to VM read 2197 * lock (RW_READER) holders. 2198 */ 2199 next = list_next(list, lease); 2200 vmm_lease_break_locked(sc, lease); 2201 lease = next; 2202 } else { 2203 lease = list_next(list, lease); 2204 } 2205 } 2206 2207 /* Process leases which were not broken synchronously. 
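 * Leases whose holders have marked them vml_break_deferred (via
 * vmm_drv_lease_break()) are reaped here; for the rest we wait on
 * vmm_lease_cv until their holders break them.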
		 */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				vmm_lease_t *next = list_next(list, lease);
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}

static void
vmm_lease_unblock(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

void
vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
{
	vmm_softc_t *sc = hold->vmh_sc;

	VERIFY3P(hold, ==, lease->vml_hold);
	VERIFY(!lease->vml_break_deferred);

	mutex_enter(&sc->vmm_lease_lock);
	if (sc->vmm_lease_blocker == 0) {
		vmm_lease_break_locked(sc, lease);
	} else {
		/*
		 * Defer the lease-breaking to whichever thread is currently
		 * cleaning up all leases as part of a vmm_lease_block() call.
		 */
		lease->vml_break_deferred = B_TRUE;
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

boolean_t
vmm_drv_lease_expired(vmm_lease_t *lease)
{
	return (lease->vml_expired);
}

vmm_page_t *
vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
{
	ASSERT(lease != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
}

void
vmm_drv_page_release(vmm_page_t *vmmp)
{
	(void) vmp_release((vm_page_t *)vmmp);
}

void
vmm_drv_page_release_chain(vmm_page_t *vmmp)
{
	(void) vmp_release_chain((vm_page_t *)vmmp);
}

const void *
vmm_drv_page_readable(const vmm_page_t *vmmp)
{
	return (vmp_get_readable((const vm_page_t *)vmmp));
}

void *
vmm_drv_page_writable(const vmm_page_t *vmmp)
{
	return (vmp_get_writable((const vm_page_t *)vmmp));
}

void
vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
{
	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
}

vmm_page_t *
vmm_drv_page_next(const vmm_page_t *vmmp)
{
	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
}

int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
    void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if
	    ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
	    arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}

static int
vmm_drv_purge(vmm_softc_t *sc, boolean_t no_wait)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		sc->vmm_flags |= VMM_CLEANUP;
		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}

		/*
		 * Require that all leases on the instance be broken, now that
		 * all associated holds have been marked as needing release.
		 *
		 * Dropping vmm_mtx is not strictly necessary, but if any of
		 * the lessees are slow to respond, it would be nice to leave
		 * it available for other parties.
		 */
		mutex_exit(&vmm_mtx);
		vmm_lease_block(sc);
		vmm_lease_unblock(sc);
		mutex_enter(&vmm_mtx);

		/*
		 * With all of the leases broken, we can proceed in an orderly
		 * fashion to waiting for any lingering holds to be dropped.
		 */
		while ((sc->vmm_flags & VMM_HELD) != 0) {
			/*
			 * Some holds remain, so wait (if acceptable) for them
			 * to be cleaned up.
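			 * Holders are expected to notice the release request
			 * (via vmm_drv_release_reqd()) and call
			 * vmm_drv_rele(); when the last hold is dropped,
			 * VMM_HELD is cleared and vmm_cv is signaled, ending
			 * this wait.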
			 */
			if (no_wait ||
			    cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				sc->vmm_flags &= ~VMM_CLEANUP;
				return (EINTR);
			}
		}
		sc->vmm_flags &= ~VMM_CLEANUP;
	}

	VERIFY(list_is_empty(&sc->vmm_holds));
	sc->vmm_flags |= VMM_PURGED;
	return (0);
}

static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}

static int
vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
    boolean_t *hma_release)
{
	dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
	minor_t minor;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = B_FALSE;

	if (vmm_drv_purge(sc, (opts & VDO_NO_PURGE_WAIT) != 0) != 0) {
		return (EINTR);
	}

	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
		vmm_zsd_rem_vm(sc);
	}

	/* Clean up devmem entries */
	vmmdev_devmem_purge(sc);

	list_remove(&vmm_list, sc);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	minor = sc->vmm_minor;
	zone_rele(sc->vmm_zone);
	if (sc->vmm_is_open) {
		list_insert_tail(&vmm_destroy_list, sc);
		sc->vmm_flags |= VMM_DESTROY;
	} else {
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		*hma_release = B_TRUE;
	}
	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);

	return (0);
}

int
vmm_zone_vm_destroy(vmm_softc_t *sc)
{
	boolean_t hma_release = B_FALSE;
	int err;

	mutex_enter(&vmm_mtx);
	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
	boolean_t hma_release = B_FALSE;
	vmm_softc_t *sc;
	int err;

	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(req->name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also used
	 * for validation during create and currently vmm names must be unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
		    KSTAT_DATA_UINT64);
		ksp->ks_private = sc->vmm_vm;
		ksp->ks_update = vmm_kstat_update_vcpu;
	}

	kstat_install(sc->vmm_kstat_vm);
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		kstat_install(sc->vmm_kstat_vcpu[i]);
	}
}

static void
vmm_kstat_fini(vmm_softc_t *sc)
{
	ASSERT(sc->vmm_kstat_vm != NULL);

	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		kstat_delete(sc->vmm_kstat_vcpu[i]);
		sc->vmm_kstat_vcpu[i] = NULL;
	}
}

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested
	 * and verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_is_open = B_TRUE;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;
	boolean_t hma_release = B_FALSE;

	minor = getminor(dev);
	if (minor == VMM_CTL_MINOR)
		return (0);

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY(sc->vmm_is_open);
	sc->vmm_is_open = B_FALSE;

	/*
	 * If this VM was destroyed while the vmm device was open, then
	 * clean it up now that it is closed.
	 */
	if (sc->vmm_flags & VMM_DESTROY) {
		list_remove(&vmm_destroy_list, sc);
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		hma_release = B_TRUE;
	} else if (sc->vmm_autodestruct) {
		/*
		 * Attempt auto-destruct on instance if requested.
		 *
		 * Do not wait for existing holds to be purged from the
		 * instance, since there is no guarantee that will happen in a
		 * timely manner. Auto-destruction will resume when the last
		 * hold is released. (See: vmm_drv_rele)
		 */
		(void) vmm_destroy_locked(sc, VDO_NO_PURGE_WAIT, &hma_release);
	}
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
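		 * (On Intel, vmx_x86_supported() above performs the analogous
		 * check and provides the reason string which is copied out to
		 * userspace below when the check fails.)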
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

static int
vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	void *argp = (void *)arg;

	switch (cmd) {
	case VMM_CREATE_VM: {
		struct vm_create_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_create(&req, cr));
	}
	case VMM_DESTROY_VM: {
		struct vm_destroy_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_destroy(&req, cr));
	}
	case VMM_VM_SUPPORTED:
		return (vmm_is_supported(arg));
	case VMM_INTERFACE_VERSION:
		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
		return (0);
	case VMM_CHECK_IOMMU:
		if (!vmm_check_iommu()) {
			return (ENXIO);
		}
		return (0);
	case VMM_RESV_QUERY:
	case VMM_RESV_ADD:
	case VMM_RESV_REMOVE:
		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
	default:
		break;
	}
	/* No other actions are legal on ctl device */
	return (ENOTTY);
}

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested
	 * and verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	int err;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t segoff;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
			err = ENODEV;
		} else {
			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
			    addrp, prot, maxprot, flags);
		}
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
		    maxprot, flags);
	}

	vmm_read_unlock(sc);
	return (err);
}

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node. Other nodes will be created on demand.
	 */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	if (!vmmr_is_empty()) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node.
	 */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_reg, ==, NULL);
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();
	vmmr_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
		vmmr_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();
	vmmr_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
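
/*
 * Illustrative sketch (not part of the driver): the rough shape of how a
 * vmm_drv consumer such as viona uses the interfaces defined above.  Error
 * handling is elided and the variable names are placeholders; functions
 * prefixed with "example_" are hypothetical.
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		// Returning B_FALSE defers the break; the consumer must call
 *		// vmm_drv_lease_break() itself once it has quiesced.
 *		return (B_FALSE);
 *	}
 *
 *	// Establish a hold on the VM behind an open vmm device file, then
 *	// sign a lease for access to guest memory and interrupt injection.
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *	vmm_page_t *vmp;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) != 0)
 *		return;
 *	lease = vmm_drv_lease_sign(hold, example_lease_expired, example_arg);
 *
 *	// Hold a (page-aligned) guest-physical page and access it, or post
 *	// an MSI into the guest.
 *	vmp = vmm_drv_page_hold(lease, gpa, PROT_READ | PROT_WRITE);
 *	bcopy(buf, vmm_drv_page_writable(vmp), copylen);
 *	(void) vmm_drv_msi(lease, msi_addr, msi_msg);
 *
 *	vmm_drv_page_release(vmp);
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 *
 * Userspace reaches guest memory through vmm_segmap() instead: an mmap() of
 * the per-instance vmm device at a guest-physical offset below
 * VM_DEVMEM_START maps guest memory, while offsets at or above
 * VM_DEVMEM_START map named devmem segments.  A minimal, hypothetical
 * example:
 *
 *	void *p = mmap(NULL, maplen, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    vm_fd, (off_t)gpa);
 */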