/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

static kmutex_t vmmdev_mtx;
static dev_info_t *vmmdev_dip;
static hma_reg_t *vmmdev_hma_reg;
static uint_t vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t vmm_mtx;
static list_t vmm_list;
static id_space_t *vmm_minors;
static void *vmm_statep;

/*
 * Until device emulation in bhyve had been adequately scrutinized and tested,
 * there was (justified) concern that unusual or corrupt device state payloads
 * could crash the host when loaded via the vmm-data interface.
 *
 * Now that those concerns have been mitigated, this protection is loosened to
 * default-allow, but the switch is left in place, in case there is a need to
 * once again clamp down on vmm-data writes.
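 *
 * Should that need arise, the switch can be thrown back at runtime (an
 * illustrative operator action, not part of any stable interface):
 *
 *     # echo 'vmm_allow_state_writes/W 0' | mdb -kw
 *
 * after which VM_DATA_WRITE requests are rejected with EPERM (see the
 * handling in vmmdev_do_ioctl() below).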
91 */ 92 int vmm_allow_state_writes = 1; 93 94 static const char *vmmdev_hvm_name = "bhyve"; 95 96 /* For sdev plugin (/dev) */ 97 #define VMM_SDEV_ROOT "/dev/vmm" 98 99 /* From uts/intel/io/vmm/intel/vmx.c */ 100 extern int vmx_x86_supported(const char **); 101 102 /* Holds and hooks from drivers external to vmm */ 103 struct vmm_hold { 104 list_node_t vmh_node; 105 vmm_softc_t *vmh_sc; 106 boolean_t vmh_release_req; 107 uint_t vmh_ioport_hook_cnt; 108 }; 109 110 struct vmm_lease { 111 list_node_t vml_node; 112 struct vm *vml_vm; 113 vm_client_t *vml_vmclient; 114 boolean_t vml_expired; 115 boolean_t vml_break_deferred; 116 boolean_t (*vml_expire_func)(void *); 117 void *vml_expire_arg; 118 struct vmm_hold *vml_hold; 119 }; 120 121 /* Options for vmm_destroy_locked */ 122 typedef enum vmm_destroy_opts { 123 VDO_DEFAULT = 0, 124 /* 125 * Indicate that zone-specific-data associated with this VM not be 126 * cleaned up as part of the destroy. Skipping ZSD clean-up is 127 * necessary when VM is being destroyed as part of zone destruction, 128 * when said ZSD is already being cleaned up. 129 */ 130 VDO_NO_CLEAN_ZSD = (1 << 0), 131 /* 132 * Attempt to wait for VM destruction to complete. This is opt-in, 133 * since there are many normal conditions which could lead to 134 * destruction being stalled pending other clean-up. 135 */ 136 VDO_ATTEMPT_WAIT = (1 << 1), 137 } vmm_destroy_opts_t; 138 139 static void vmm_hma_release(void); 140 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *); 141 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 142 static void vmm_lease_block(vmm_softc_t *); 143 static void vmm_lease_unblock(vmm_softc_t *); 144 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 145 static void vmm_kstat_init(vmm_softc_t *); 146 static void vmm_kstat_fini(vmm_softc_t *); 147 148 /* 149 * The 'devmem' hack: 150 * 151 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 152 * in the vm which appear with their own name related to the vm under /dev. 153 * Since this would be a hassle from an sdev perspective and would require a 154 * new cdev interface (or complicate the existing one), we choose to implement 155 * this in a different manner. Direct access to the underlying vm memory 156 * segments is exposed by placing them in a range of offsets beyond the normal 157 * guest memory space. Userspace can query the appropriate offset to mmap() 158 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 
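 *
 * By way of illustration (a hypothetical userspace sketch, with 'vmfd' an
 * open descriptor for the VM and 'segid'/'seg_len' describing an
 * already-created devmem segment):
 *
 *     struct vm_devmem_offset vdo = { .segid = segid };
 *
 *     if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *             void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *                 MAP_SHARED, vmfd, vdo.offset);
 *     }
 *
 * The offsets handed out this way all live at or above VM_DEVMEM_START,
 * keeping them clear of mappings for normal guest-physical memory.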
159 */ 160 161 static vmm_devmem_entry_t * 162 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 163 { 164 vmm_devmem_entry_t *ent = NULL; 165 list_t *dl = &sc->vmm_devmem_list; 166 167 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 168 if (ent->vde_segid == segid) { 169 return (ent); 170 } 171 } 172 return (NULL); 173 } 174 175 static int 176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 177 { 178 int error; 179 bool sysmem; 180 181 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 182 NULL); 183 if (error || mseg->len == 0) 184 return (error); 185 186 if (!sysmem) { 187 vmm_devmem_entry_t *de; 188 189 de = vmmdev_devmem_find(sc, mseg->segid); 190 if (de != NULL) { 191 (void) strlcpy(mseg->name, de->vde_name, 192 sizeof (mseg->name)); 193 } 194 } else { 195 bzero(mseg->name, sizeof (mseg->name)); 196 } 197 198 return (error); 199 } 200 201 static int 202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 203 { 204 off_t map_offset; 205 vmm_devmem_entry_t *entry; 206 207 if (list_is_empty(&sc->vmm_devmem_list)) { 208 map_offset = VM_DEVMEM_START; 209 } else { 210 entry = list_tail(&sc->vmm_devmem_list); 211 map_offset = entry->vde_off + entry->vde_len; 212 if (map_offset < entry->vde_off) { 213 /* Do not tolerate overflow */ 214 return (ERANGE); 215 } 216 /* 217 * XXXJOY: We could choose to search the list for duplicate 218 * names and toss an error. Since we're using the offset 219 * method for now, it does not make much of a difference. 220 */ 221 } 222 223 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 224 entry->vde_segid = mseg->segid; 225 entry->vde_len = mseg->len; 226 entry->vde_off = map_offset; 227 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 228 list_insert_tail(&sc->vmm_devmem_list, entry); 229 230 return (0); 231 } 232 233 static boolean_t 234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 235 off_t *map_offp) 236 { 237 list_t *dl = &sc->vmm_devmem_list; 238 vmm_devmem_entry_t *de = NULL; 239 const off_t map_end = off + len; 240 241 VERIFY(off >= VM_DEVMEM_START); 242 243 if (map_end < off) { 244 /* No match on overflow */ 245 return (B_FALSE); 246 } 247 248 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 249 const off_t item_end = de->vde_off + de->vde_len; 250 251 if (de->vde_off <= off && item_end >= map_end) { 252 *segidp = de->vde_segid; 253 *map_offp = off - de->vde_off; 254 return (B_TRUE); 255 } 256 } 257 return (B_FALSE); 258 } 259 260 /* 261 * When an instance is being destroyed, the devmem list of named memory objects 262 * can be torn down, as no new mappings are allowed. 263 */ 264 static void 265 vmmdev_devmem_purge(vmm_softc_t *sc) 266 { 267 vmm_devmem_entry_t *entry; 268 269 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 270 kmem_free(entry, sizeof (*entry)); 271 } 272 } 273 274 static int 275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 276 { 277 int error; 278 bool sysmem = true; 279 280 if (VM_MEMSEG_NAME(mseg)) { 281 sysmem = false; 282 } 283 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 284 285 if (error == 0) { 286 /* 287 * Rather than create a whole fresh device from which userspace 288 * can mmap this segment, instead make it available at an 289 * offset above where the main guest memory resides. 
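 *
 * The chosen offset is VM_DEVMEM_START for the first devmem segment and
 * the end of the preceding entry for later ones; see
 * vmmdev_devmem_create() above.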
290 */ 291 error = vmmdev_devmem_create(sc, mseg, mseg->name); 292 if (error != 0) { 293 vm_free_memseg(sc->vmm_vm, mseg->segid); 294 } 295 } 296 return (error); 297 } 298 299 /* 300 * Resource Locking and Exclusion 301 * 302 * Much of bhyve depends on key portions of VM state, such as the guest memory 303 * map, to remain unchanged while the guest is running. As ported from 304 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 305 * access to the instance vCPUs. Threads acting on a single vCPU, like those 306 * performing the work of actually running the guest in VMX/SVM, would lock 307 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 308 * state, all of the vCPUs would be first locked, ensuring that the 309 * operation(s) could complete without any other threads stumbling into 310 * intermediate states. 311 * 312 * This approach is largely effective for bhyve. Common operations, such as 313 * running the vCPUs, steer clear of lock contention. The model begins to 314 * break down for operations which do not occur in the context of a specific 315 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 316 * thread in the bhyve process. In order to properly protect those vCPU-less 317 * operations from encountering invalid states, additional locking is required. 318 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 319 * It does mean that class of operations will be serialized on locking the 320 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 321 * undue contention on the VM_MAXCPU-1 vCPU. 322 * 323 * In order to address the shortcomings of this model, the concept of a 324 * read/write lock has been added to bhyve. Operations which change 325 * fundamental aspects of a VM (such as the memory map) must acquire the write 326 * lock, which also implies locking all of the vCPUs and waiting for all read 327 * lock holders to release. While it increases the cost and waiting time for 328 * those few operations, it allows most hot-path operations on the VM (which 329 * depend on its configuration remaining stable) to occur with minimal locking. 330 * 331 * Consumers of the Driver API (see below) are a special case when it comes to 332 * this locking, since they may hold a read lock via the drv_lease mechanism 333 * for an extended period of time. Rather than forcing those consumers to 334 * continuously poll for a write lock attempt, the lease system forces them to 335 * provide a release callback to trigger their clean-up (and potential later 336 * reacquisition) of the read lock. 337 */ 338 339 static void 340 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 341 { 342 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 343 344 /* 345 * Since this state transition is utilizing from_idle=true, it should 346 * not fail, but rather block until it can be successful. 
347 */ 348 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 349 } 350 351 static void 352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 353 { 354 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 355 356 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 357 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 358 } 359 360 static void 361 vmm_read_lock(vmm_softc_t *sc) 362 { 363 rw_enter(&sc->vmm_rwlock, RW_READER); 364 } 365 366 static void 367 vmm_read_unlock(vmm_softc_t *sc) 368 { 369 rw_exit(&sc->vmm_rwlock); 370 } 371 372 static void 373 vmm_write_lock(vmm_softc_t *sc) 374 { 375 int maxcpus; 376 377 /* First lock all the vCPUs */ 378 maxcpus = vm_get_maxcpus(sc->vmm_vm); 379 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 380 vcpu_lock_one(sc, vcpu); 381 } 382 383 /* 384 * Block vmm_drv leases from being acquired or held while the VM write 385 * lock is held. 386 */ 387 vmm_lease_block(sc); 388 389 rw_enter(&sc->vmm_rwlock, RW_WRITER); 390 /* 391 * For now, the 'maxcpus' value for an instance is fixed at the 392 * compile-time constant of VM_MAXCPU at creation. If this changes in 393 * the future, allowing for dynamic vCPU resource sizing, acquisition 394 * of the write lock will need to be wary of such changes. 395 */ 396 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 397 } 398 399 static void 400 vmm_write_unlock(vmm_softc_t *sc) 401 { 402 int maxcpus; 403 404 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 405 vmm_lease_unblock(sc); 406 407 /* 408 * The VM write lock _must_ be released from the same thread it was 409 * acquired in, unlike the read lock. 410 */ 411 VERIFY(rw_write_held(&sc->vmm_rwlock)); 412 rw_exit(&sc->vmm_rwlock); 413 414 /* Unlock all the vCPUs */ 415 maxcpus = vm_get_maxcpus(sc->vmm_vm); 416 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 417 vcpu_unlock_one(sc, vcpu); 418 } 419 } 420 421 static int 422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 423 cred_t *credp, int *rvalp) 424 { 425 int error = 0, vcpu = -1; 426 void *datap = (void *)arg; 427 enum vm_lock_type { 428 LOCK_NONE = 0, 429 LOCK_VCPU, 430 LOCK_READ_HOLD, 431 LOCK_WRITE_HOLD 432 } lock_type = LOCK_NONE; 433 434 /* Acquire any exclusion resources needed for the operation. */ 435 switch (cmd) { 436 case VM_RUN: 437 case VM_GET_REGISTER: 438 case VM_SET_REGISTER: 439 case VM_GET_SEGMENT_DESCRIPTOR: 440 case VM_SET_SEGMENT_DESCRIPTOR: 441 case VM_GET_REGISTER_SET: 442 case VM_SET_REGISTER_SET: 443 case VM_INJECT_EXCEPTION: 444 case VM_GET_CAPABILITY: 445 case VM_SET_CAPABILITY: 446 case VM_PPTDEV_MSI: 447 case VM_PPTDEV_MSIX: 448 case VM_SET_X2APIC_STATE: 449 case VM_GLA2GPA: 450 case VM_GLA2GPA_NOFAULT: 451 case VM_ACTIVATE_CPU: 452 case VM_SET_INTINFO: 453 case VM_GET_INTINFO: 454 case VM_RESTART_INSTRUCTION: 455 case VM_SET_KERNEMU_DEV: 456 case VM_GET_KERNEMU_DEV: 457 case VM_RESET_CPU: 458 case VM_GET_RUN_STATE: 459 case VM_SET_RUN_STATE: 460 case VM_GET_FPU: 461 case VM_SET_FPU: 462 case VM_GET_CPUID: 463 case VM_SET_CPUID: 464 case VM_LEGACY_CPUID: 465 /* 466 * Copy in the ID of the vCPU chosen for this operation. 467 * Since a nefarious caller could update their struct between 468 * this locking and when the rest of the ioctl data is copied 469 * in, it is _critical_ that this local 'vcpu' variable be used 470 * rather than the in-struct one when performing the ioctl. 
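 *
 * Were the ID re-read from the user-supplied struct after
 * vcpu_lock_one(), a concurrent userspace write could redirect the
 * operation to a vCPU which was never locked.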
471 */ 472 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 473 return (EFAULT); 474 } 475 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 476 return (EINVAL); 477 } 478 vcpu_lock_one(sc, vcpu); 479 lock_type = LOCK_VCPU; 480 break; 481 482 case VM_REINIT: 483 case VM_BIND_PPTDEV: 484 case VM_UNBIND_PPTDEV: 485 case VM_MAP_PPTDEV_MMIO: 486 case VM_UNMAP_PPTDEV_MMIO: 487 case VM_ALLOC_MEMSEG: 488 case VM_MMAP_MEMSEG: 489 case VM_MUNMAP_MEMSEG: 490 case VM_WRLOCK_CYCLE: 491 case VM_PMTMR_LOCATE: 492 case VM_PAUSE: 493 case VM_RESUME: 494 vmm_write_lock(sc); 495 lock_type = LOCK_WRITE_HOLD; 496 break; 497 498 case VM_GET_MEMSEG: 499 case VM_MMAP_GETNEXT: 500 case VM_LAPIC_IRQ: 501 case VM_INJECT_NMI: 502 case VM_IOAPIC_ASSERT_IRQ: 503 case VM_IOAPIC_DEASSERT_IRQ: 504 case VM_IOAPIC_PULSE_IRQ: 505 case VM_LAPIC_MSI: 506 case VM_LAPIC_LOCAL_IRQ: 507 case VM_GET_X2APIC_STATE: 508 case VM_RTC_READ: 509 case VM_RTC_WRITE: 510 case VM_RTC_SETTIME: 511 case VM_RTC_GETTIME: 512 case VM_PPTDEV_DISABLE_MSIX: 513 case VM_DEVMEM_GETOFFSET: 514 case VM_TRACK_DIRTY_PAGES: 515 vmm_read_lock(sc); 516 lock_type = LOCK_READ_HOLD; 517 break; 518 519 case VM_DATA_READ: 520 case VM_DATA_WRITE: 521 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 522 return (EFAULT); 523 } 524 if (vcpu == -1) { 525 /* Access data for VM-wide devices */ 526 vmm_write_lock(sc); 527 lock_type = LOCK_WRITE_HOLD; 528 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 529 /* Access data associated with a specific vCPU */ 530 vcpu_lock_one(sc, vcpu); 531 lock_type = LOCK_VCPU; 532 } else { 533 return (EINVAL); 534 } 535 break; 536 537 case VM_GET_GPA_PMAP: 538 case VM_IOAPIC_PINCOUNT: 539 case VM_SUSPEND: 540 case VM_DESC_FPU_AREA: 541 case VM_SET_AUTODESTRUCT: 542 case VM_DESTROY_SELF: 543 case VM_DESTROY_PENDING: 544 default: 545 break; 546 } 547 548 /* Execute the primary logic for the ioctl. */ 549 switch (cmd) { 550 case VM_RUN: { 551 struct vm_entry entry; 552 553 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 554 error = EFAULT; 555 break; 556 } 557 558 if (!(curthread->t_schedflag & TS_VCPU)) 559 smt_mark_as_vcpu(); 560 561 error = vm_run(sc->vmm_vm, vcpu, &entry); 562 563 /* 564 * Unexpected states in vm_run() are expressed through positive 565 * errno-oriented return values. VM states which expect further 566 * processing in userspace (necessary context via exitinfo) are 567 * expressed through negative return values. For the time being 568 * a return value of 0 is not expected from vm_run(). 569 */ 570 ASSERT(error != 0); 571 if (error < 0) { 572 const struct vm_exit *vme; 573 void *outp = entry.exit_data; 574 575 error = 0; 576 vme = vm_exitinfo(sc->vmm_vm, vcpu); 577 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 578 error = EFAULT; 579 } 580 } 581 break; 582 } 583 case VM_SUSPEND: { 584 struct vm_suspend vmsuspend; 585 586 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 587 error = EFAULT; 588 break; 589 } 590 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 591 break; 592 } 593 case VM_REINIT: { 594 struct vm_reinit reinit; 595 596 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 597 error = EFAULT; 598 break; 599 } 600 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 601 /* 602 * The VM instance should be free of driver-attached 603 * hooks during the reinitialization process. 
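 *
 * ("Hooks" here are the ioport hooks installed via the vmm_drv
 * interface, e.g. by viona; see vmm_drv_ioport_hook() below.)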
604 */ 605 break; 606 } 607 error = vm_reinit(sc->vmm_vm, reinit.flags); 608 (void) vmm_drv_block_hook(sc, B_FALSE); 609 break; 610 } 611 case VM_STAT_DESC: { 612 struct vm_stat_desc statdesc; 613 614 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 615 error = EFAULT; 616 break; 617 } 618 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 619 sizeof (statdesc.desc)); 620 if (error == 0 && 621 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 622 error = EFAULT; 623 break; 624 } 625 break; 626 } 627 case VM_STATS_IOC: { 628 struct vm_stats vmstats; 629 630 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 631 error = EFAULT; 632 break; 633 } 634 hrt2tv(gethrtime(), &vmstats.tv); 635 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 636 nitems(vmstats.statbuf), 637 &vmstats.num_entries, vmstats.statbuf); 638 if (error == 0 && 639 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 640 error = EFAULT; 641 break; 642 } 643 break; 644 } 645 646 case VM_PPTDEV_MSI: { 647 struct vm_pptdev_msi pptmsi; 648 649 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 650 error = EFAULT; 651 break; 652 } 653 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 654 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 655 break; 656 } 657 case VM_PPTDEV_MSIX: { 658 struct vm_pptdev_msix pptmsix; 659 660 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 661 error = EFAULT; 662 break; 663 } 664 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 665 pptmsix.idx, pptmsix.addr, pptmsix.msg, 666 pptmsix.vector_control); 667 break; 668 } 669 case VM_PPTDEV_DISABLE_MSIX: { 670 struct vm_pptdev pptdev; 671 672 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 673 error = EFAULT; 674 break; 675 } 676 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 677 break; 678 } 679 case VM_MAP_PPTDEV_MMIO: { 680 struct vm_pptdev_mmio pptmmio; 681 682 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 683 error = EFAULT; 684 break; 685 } 686 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 687 pptmmio.len, pptmmio.hpa); 688 break; 689 } 690 case VM_UNMAP_PPTDEV_MMIO: { 691 struct vm_pptdev_mmio pptmmio; 692 693 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 694 error = EFAULT; 695 break; 696 } 697 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 698 pptmmio.len); 699 break; 700 } 701 case VM_BIND_PPTDEV: { 702 struct vm_pptdev pptdev; 703 704 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 705 error = EFAULT; 706 break; 707 } 708 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 709 break; 710 } 711 case VM_UNBIND_PPTDEV: { 712 struct vm_pptdev pptdev; 713 714 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 715 error = EFAULT; 716 break; 717 } 718 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 719 break; 720 } 721 case VM_GET_PPTDEV_LIMITS: { 722 struct vm_pptdev_limits pptlimits; 723 724 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 725 error = EFAULT; 726 break; 727 } 728 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 729 &pptlimits.msi_limit, &pptlimits.msix_limit); 730 if (error == 0 && 731 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 732 error = EFAULT; 733 break; 734 } 735 break; 736 } 737 case VM_INJECT_EXCEPTION: { 738 struct vm_exception vmexc; 739 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 740 error = EFAULT; 741 break; 742 } 743 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 744 vmexc.error_code_valid != 0, vmexc.error_code, 745 
vmexc.restart_instruction != 0); 746 break; 747 } 748 case VM_INJECT_NMI: { 749 struct vm_nmi vmnmi; 750 751 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 752 error = EFAULT; 753 break; 754 } 755 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 756 break; 757 } 758 case VM_LAPIC_IRQ: { 759 struct vm_lapic_irq vmirq; 760 761 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 762 error = EFAULT; 763 break; 764 } 765 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 766 break; 767 } 768 case VM_LAPIC_LOCAL_IRQ: { 769 struct vm_lapic_irq vmirq; 770 771 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 772 error = EFAULT; 773 break; 774 } 775 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 776 vmirq.vector); 777 break; 778 } 779 case VM_LAPIC_MSI: { 780 struct vm_lapic_msi vmmsi; 781 782 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 783 error = EFAULT; 784 break; 785 } 786 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 787 break; 788 } 789 790 case VM_IOAPIC_ASSERT_IRQ: { 791 struct vm_ioapic_irq ioapic_irq; 792 793 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 794 error = EFAULT; 795 break; 796 } 797 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 798 break; 799 } 800 case VM_IOAPIC_DEASSERT_IRQ: { 801 struct vm_ioapic_irq ioapic_irq; 802 803 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 804 error = EFAULT; 805 break; 806 } 807 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 808 break; 809 } 810 case VM_IOAPIC_PULSE_IRQ: { 811 struct vm_ioapic_irq ioapic_irq; 812 813 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 814 error = EFAULT; 815 break; 816 } 817 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 818 break; 819 } 820 case VM_IOAPIC_PINCOUNT: { 821 int pincount; 822 823 pincount = vioapic_pincount(sc->vmm_vm); 824 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 825 error = EFAULT; 826 break; 827 } 828 break; 829 } 830 case VM_DESC_FPU_AREA: { 831 struct vm_fpu_desc desc; 832 void *buf = NULL; 833 834 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 835 error = EFAULT; 836 break; 837 } 838 if (desc.vfd_num_entries > 64) { 839 error = EINVAL; 840 break; 841 } 842 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 843 desc.vfd_num_entries; 844 if (buf_sz != 0) { 845 buf = kmem_zalloc(buf_sz, KM_SLEEP); 846 } 847 848 /* 849 * For now, we are depending on vm_fpu_desc_entry and 850 * hma_xsave_state_desc_t having the same format. 851 */ 852 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 853 sizeof (hma_xsave_state_desc_t)); 854 855 size_t req_size; 856 const uint_t max_entries = hma_fpu_describe_xsave_state( 857 (hma_xsave_state_desc_t *)buf, 858 desc.vfd_num_entries, 859 &req_size); 860 861 desc.vfd_req_size = req_size; 862 desc.vfd_num_entries = max_entries; 863 if (buf_sz != 0) { 864 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 865 error = EFAULT; 866 } 867 kmem_free(buf, buf_sz); 868 } 869 870 if (error == 0) { 871 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 872 error = EFAULT; 873 } 874 } 875 break; 876 } 877 case VM_SET_AUTODESTRUCT: { 878 /* 879 * Since this has to do with controlling the lifetime of the 880 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 881 * than the vcpu-centric or rwlock exclusion mechanisms. 
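 *
 * Setting the flag arranges for the instance to be torn down once
 * userspace drops its last reference to it, rather than requiring an
 * explicit destroy request.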
882 */ 883 mutex_enter(&vmm_mtx); 884 if (arg != 0) { 885 sc->vmm_flags |= VMM_AUTODESTROY; 886 } else { 887 sc->vmm_flags &= ~VMM_AUTODESTROY; 888 } 889 mutex_exit(&vmm_mtx); 890 break; 891 } 892 case VM_DESTROY_SELF: { 893 bool hma_release = false; 894 895 /* 896 * Just like VMM_DESTROY_VM, but on the instance file descriptor 897 * itself, rather than having to perform a racy name lookup as 898 * part of the destroy process. 899 * 900 * Since vmm_destroy_locked() performs vCPU lock acquisition in 901 * order to kick the vCPUs out of guest context as part of any 902 * destruction, we do not need to worry about it ourself using 903 * the `lock_type` logic here. 904 */ 905 mutex_enter(&vmm_mtx); 906 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 907 mutex_exit(&vmm_mtx); 908 if (hma_release) { 909 vmm_hma_release(); 910 } 911 break; 912 } 913 case VM_DESTROY_PENDING: { 914 /* 915 * If we have made it this far, then destruction of the instance 916 * has not been initiated. 917 */ 918 *rvalp = 0; 919 break; 920 } 921 922 case VM_ISA_ASSERT_IRQ: { 923 struct vm_isa_irq isa_irq; 924 925 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 926 error = EFAULT; 927 break; 928 } 929 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 930 if (error == 0 && isa_irq.ioapic_irq != -1) { 931 error = vioapic_assert_irq(sc->vmm_vm, 932 isa_irq.ioapic_irq); 933 } 934 break; 935 } 936 case VM_ISA_DEASSERT_IRQ: { 937 struct vm_isa_irq isa_irq; 938 939 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 940 error = EFAULT; 941 break; 942 } 943 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 944 if (error == 0 && isa_irq.ioapic_irq != -1) { 945 error = vioapic_deassert_irq(sc->vmm_vm, 946 isa_irq.ioapic_irq); 947 } 948 break; 949 } 950 case VM_ISA_PULSE_IRQ: { 951 struct vm_isa_irq isa_irq; 952 953 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 954 error = EFAULT; 955 break; 956 } 957 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 958 if (error == 0 && isa_irq.ioapic_irq != -1) { 959 error = vioapic_pulse_irq(sc->vmm_vm, 960 isa_irq.ioapic_irq); 961 } 962 break; 963 } 964 case VM_ISA_SET_IRQ_TRIGGER: { 965 struct vm_isa_irq_trigger isa_irq_trigger; 966 967 if (ddi_copyin(datap, &isa_irq_trigger, 968 sizeof (isa_irq_trigger), md)) { 969 error = EFAULT; 970 break; 971 } 972 error = vatpic_set_irq_trigger(sc->vmm_vm, 973 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 974 break; 975 } 976 977 case VM_MMAP_GETNEXT: { 978 struct vm_memmap mm; 979 980 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 981 error = EFAULT; 982 break; 983 } 984 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 985 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 986 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 987 error = EFAULT; 988 break; 989 } 990 break; 991 } 992 case VM_MMAP_MEMSEG: { 993 struct vm_memmap mm; 994 995 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 996 error = EFAULT; 997 break; 998 } 999 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 1000 mm.len, mm.prot, mm.flags); 1001 break; 1002 } 1003 case VM_MUNMAP_MEMSEG: { 1004 struct vm_munmap mu; 1005 1006 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 1007 error = EFAULT; 1008 break; 1009 } 1010 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1011 break; 1012 } 1013 case VM_ALLOC_MEMSEG: { 1014 struct vm_memseg vmseg; 1015 1016 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1017 error = EFAULT; 1018 break; 1019 } 1020 error = vmmdev_alloc_memseg(sc, &vmseg); 1021 
break; 1022 } 1023 case VM_GET_MEMSEG: { 1024 struct vm_memseg vmseg; 1025 1026 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1027 error = EFAULT; 1028 break; 1029 } 1030 error = vmmdev_get_memseg(sc, &vmseg); 1031 if (error == 0 && 1032 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 1033 error = EFAULT; 1034 break; 1035 } 1036 break; 1037 } 1038 case VM_GET_REGISTER: { 1039 struct vm_register vmreg; 1040 1041 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1042 error = EFAULT; 1043 break; 1044 } 1045 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 1046 &vmreg.regval); 1047 if (error == 0 && 1048 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 1049 error = EFAULT; 1050 break; 1051 } 1052 break; 1053 } 1054 case VM_SET_REGISTER: { 1055 struct vm_register vmreg; 1056 1057 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1058 error = EFAULT; 1059 break; 1060 } 1061 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 1062 vmreg.regval); 1063 break; 1064 } 1065 case VM_SET_SEGMENT_DESCRIPTOR: { 1066 struct vm_seg_desc vmsegd; 1067 1068 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1069 error = EFAULT; 1070 break; 1071 } 1072 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1073 &vmsegd.desc); 1074 break; 1075 } 1076 case VM_GET_SEGMENT_DESCRIPTOR: { 1077 struct vm_seg_desc vmsegd; 1078 1079 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1080 error = EFAULT; 1081 break; 1082 } 1083 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1084 &vmsegd.desc); 1085 if (error == 0 && 1086 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 1087 error = EFAULT; 1088 break; 1089 } 1090 break; 1091 } 1092 case VM_GET_REGISTER_SET: { 1093 struct vm_register_set vrs; 1094 int regnums[VM_REG_LAST]; 1095 uint64_t regvals[VM_REG_LAST]; 1096 1097 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1098 error = EFAULT; 1099 break; 1100 } 1101 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1102 error = EINVAL; 1103 break; 1104 } 1105 if (ddi_copyin(vrs.regnums, regnums, 1106 sizeof (int) * vrs.count, md)) { 1107 error = EFAULT; 1108 break; 1109 } 1110 1111 error = 0; 1112 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1113 if (regnums[i] < 0) { 1114 error = EINVAL; 1115 break; 1116 } 1117 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1118 ®vals[i]); 1119 } 1120 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1121 sizeof (uint64_t) * vrs.count, md)) { 1122 error = EFAULT; 1123 } 1124 break; 1125 } 1126 case VM_SET_REGISTER_SET: { 1127 struct vm_register_set vrs; 1128 int regnums[VM_REG_LAST]; 1129 uint64_t regvals[VM_REG_LAST]; 1130 1131 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1132 error = EFAULT; 1133 break; 1134 } 1135 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1136 error = EINVAL; 1137 break; 1138 } 1139 if (ddi_copyin(vrs.regnums, regnums, 1140 sizeof (int) * vrs.count, md)) { 1141 error = EFAULT; 1142 break; 1143 } 1144 if (ddi_copyin(vrs.regvals, regvals, 1145 sizeof (uint64_t) * vrs.count, md)) { 1146 error = EFAULT; 1147 break; 1148 } 1149 1150 error = 0; 1151 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1152 /* 1153 * Setting registers in a set is not atomic, since a 1154 * failure in the middle of the set will cause a 1155 * bail-out and inconsistent register state. Callers 1156 * should be wary of this. 
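 *
 * A caller which cares can re-query the same list via
 * VM_GET_REGISTER_SET to determine how far the update proceeded.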
1157 */ 1158 if (regnums[i] < 0) { 1159 error = EINVAL; 1160 break; 1161 } 1162 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1163 regvals[i]); 1164 } 1165 break; 1166 } 1167 case VM_RESET_CPU: { 1168 struct vm_vcpu_reset vvr; 1169 1170 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1171 error = EFAULT; 1172 break; 1173 } 1174 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1175 error = EINVAL; 1176 } 1177 1178 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1179 break; 1180 } 1181 case VM_GET_RUN_STATE: { 1182 struct vm_run_state vrs; 1183 1184 bzero(&vrs, sizeof (vrs)); 1185 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1186 &vrs.sipi_vector); 1187 if (error == 0) { 1188 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1189 error = EFAULT; 1190 break; 1191 } 1192 } 1193 break; 1194 } 1195 case VM_SET_RUN_STATE: { 1196 struct vm_run_state vrs; 1197 1198 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1199 error = EFAULT; 1200 break; 1201 } 1202 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1203 vrs.sipi_vector); 1204 break; 1205 } 1206 case VM_GET_FPU: { 1207 struct vm_fpu_state req; 1208 const size_t max_len = (PAGESIZE * 2); 1209 void *kbuf; 1210 1211 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1212 error = EFAULT; 1213 break; 1214 } 1215 if (req.len > max_len || req.len == 0) { 1216 error = EINVAL; 1217 break; 1218 } 1219 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1220 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1221 if (error == 0) { 1222 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1223 error = EFAULT; 1224 } 1225 } 1226 kmem_free(kbuf, req.len); 1227 break; 1228 } 1229 case VM_SET_FPU: { 1230 struct vm_fpu_state req; 1231 const size_t max_len = (PAGESIZE * 2); 1232 void *kbuf; 1233 1234 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1235 error = EFAULT; 1236 break; 1237 } 1238 if (req.len > max_len || req.len == 0) { 1239 error = EINVAL; 1240 break; 1241 } 1242 kbuf = kmem_alloc(req.len, KM_SLEEP); 1243 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1244 error = EFAULT; 1245 } else { 1246 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1247 } 1248 kmem_free(kbuf, req.len); 1249 break; 1250 } 1251 case VM_GET_CPUID: { 1252 struct vm_vcpu_cpuid_config cfg; 1253 struct vcpu_cpuid_entry *entries = NULL; 1254 1255 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1256 error = EFAULT; 1257 break; 1258 } 1259 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1260 error = EINVAL; 1261 break; 1262 } 1263 1264 const size_t entries_size = 1265 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1266 if (entries_size != 0) { 1267 entries = kmem_zalloc(entries_size, KM_SLEEP); 1268 } 1269 1270 vcpu_cpuid_config_t vm_cfg = { 1271 .vcc_nent = cfg.vvcc_nent, 1272 .vcc_entries = entries, 1273 }; 1274 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1275 1276 /* 1277 * Only attempt to copy out the resultant entries if we were 1278 * able to query them from the instance. The flags and number 1279 * of entries are emitted regardless. 
1280 */ 1281 cfg.vvcc_flags = vm_cfg.vcc_flags; 1282 cfg.vvcc_nent = vm_cfg.vcc_nent; 1283 if (entries != NULL) { 1284 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1285 entries_size, md) != 0) { 1286 error = EFAULT; 1287 } 1288 1289 kmem_free(entries, entries_size); 1290 } 1291 1292 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1293 error = EFAULT; 1294 } 1295 break; 1296 } 1297 case VM_SET_CPUID: { 1298 struct vm_vcpu_cpuid_config cfg; 1299 struct vcpu_cpuid_entry *entries = NULL; 1300 size_t entries_size = 0; 1301 1302 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1303 error = EFAULT; 1304 break; 1305 } 1306 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1307 error = EFBIG; 1308 break; 1309 } 1310 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1311 /* 1312 * If we are being instructed to use "legacy" handling, 1313 * then no entries should be provided, since the static 1314 * in-kernel masking will be used. 1315 */ 1316 if (cfg.vvcc_nent != 0) { 1317 error = EINVAL; 1318 break; 1319 } 1320 } else if (cfg.vvcc_nent != 0) { 1321 entries_size = 1322 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1323 entries = kmem_alloc(entries_size, KM_SLEEP); 1324 1325 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1326 md) != 0) { 1327 error = EFAULT; 1328 kmem_free(entries, entries_size); 1329 break; 1330 } 1331 } 1332 1333 vcpu_cpuid_config_t vm_cfg = { 1334 .vcc_flags = cfg.vvcc_flags, 1335 .vcc_nent = cfg.vvcc_nent, 1336 .vcc_entries = entries, 1337 }; 1338 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1339 1340 if (entries != NULL) { 1341 kmem_free(entries, entries_size); 1342 } 1343 break; 1344 } 1345 case VM_LEGACY_CPUID: { 1346 struct vm_legacy_cpuid vlc; 1347 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1348 error = EFAULT; 1349 break; 1350 } 1351 vlc.vlc_vcpuid = vcpu; 1352 1353 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1354 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1355 1356 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1357 error = EFAULT; 1358 break; 1359 } 1360 break; 1361 } 1362 1363 case VM_SET_KERNEMU_DEV: 1364 case VM_GET_KERNEMU_DEV: { 1365 struct vm_readwrite_kernemu_device kemu; 1366 size_t size = 0; 1367 1368 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1369 error = EFAULT; 1370 break; 1371 } 1372 1373 if (kemu.access_width > 3) { 1374 error = EINVAL; 1375 break; 1376 } 1377 size = (1 << kemu.access_width); 1378 ASSERT(size >= 1 && size <= 8); 1379 1380 if (cmd == VM_SET_KERNEMU_DEV) { 1381 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1382 kemu.gpa, kemu.value, size); 1383 } else { 1384 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1385 kemu.gpa, &kemu.value, size); 1386 } 1387 1388 if (error == 0) { 1389 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1390 error = EFAULT; 1391 break; 1392 } 1393 } 1394 break; 1395 } 1396 1397 case VM_GET_CAPABILITY: { 1398 struct vm_capability vmcap; 1399 1400 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1401 error = EFAULT; 1402 break; 1403 } 1404 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1405 &vmcap.capval); 1406 if (error == 0 && 1407 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1408 error = EFAULT; 1409 break; 1410 } 1411 break; 1412 } 1413 case VM_SET_CAPABILITY: { 1414 struct vm_capability vmcap; 1415 1416 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1417 error = EFAULT; 1418 break; 1419 } 1420 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1421 vmcap.capval); 1422 break; 1423 } 1424 case VM_SET_X2APIC_STATE: { 
1425 struct vm_x2apic x2apic; 1426 1427 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1428 error = EFAULT; 1429 break; 1430 } 1431 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1432 break; 1433 } 1434 case VM_GET_X2APIC_STATE: { 1435 struct vm_x2apic x2apic; 1436 1437 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1438 error = EFAULT; 1439 break; 1440 } 1441 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1442 &x2apic.state); 1443 if (error == 0 && 1444 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1445 error = EFAULT; 1446 break; 1447 } 1448 break; 1449 } 1450 case VM_GET_GPA_PMAP: { 1451 /* 1452 * Until there is a necessity to leak EPT/RVI PTE values to 1453 * userspace, this will remain unimplemented 1454 */ 1455 error = EINVAL; 1456 break; 1457 } 1458 case VM_GET_HPET_CAPABILITIES: { 1459 struct vm_hpet_cap hpetcap; 1460 1461 error = vhpet_getcap(&hpetcap); 1462 if (error == 0 && 1463 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1464 error = EFAULT; 1465 break; 1466 } 1467 break; 1468 } 1469 case VM_GLA2GPA: { 1470 struct vm_gla2gpa gg; 1471 1472 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1473 error = EFAULT; 1474 break; 1475 } 1476 gg.vcpuid = vcpu; 1477 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1478 gg.prot, &gg.gpa, &gg.fault); 1479 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1480 error = EFAULT; 1481 break; 1482 } 1483 break; 1484 } 1485 case VM_GLA2GPA_NOFAULT: { 1486 struct vm_gla2gpa gg; 1487 1488 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1489 error = EFAULT; 1490 break; 1491 } 1492 gg.vcpuid = vcpu; 1493 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1494 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1495 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1496 error = EFAULT; 1497 break; 1498 } 1499 break; 1500 } 1501 1502 case VM_ACTIVATE_CPU: 1503 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1504 break; 1505 1506 case VM_SUSPEND_CPU: 1507 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1508 error = EFAULT; 1509 } else { 1510 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1511 } 1512 break; 1513 1514 case VM_RESUME_CPU: 1515 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1516 error = EFAULT; 1517 } else { 1518 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1519 } 1520 break; 1521 1522 case VM_GET_CPUS: { 1523 struct vm_cpuset vm_cpuset; 1524 cpuset_t tempset; 1525 void *srcp = &tempset; 1526 int size; 1527 1528 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1529 error = EFAULT; 1530 break; 1531 } 1532 1533 /* Be more generous about sizing since our cpuset_t is large. */ 1534 size = vm_cpuset.cpusetsize; 1535 if (size <= 0 || size > sizeof (cpuset_t)) { 1536 error = ERANGE; 1537 } 1538 /* 1539 * If they want a ulong_t or less, make sure they receive the 1540 * low bits with all the useful information. 
1541 */ 1542 if (size <= sizeof (tempset.cpub[0])) { 1543 srcp = &tempset.cpub[0]; 1544 } 1545 1546 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1547 tempset = vm_active_cpus(sc->vmm_vm); 1548 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1549 tempset = vm_suspended_cpus(sc->vmm_vm); 1550 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1551 tempset = vm_debug_cpus(sc->vmm_vm); 1552 } else { 1553 error = EINVAL; 1554 } 1555 1556 ASSERT(size > 0 && size <= sizeof (tempset)); 1557 if (error == 0 && 1558 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1559 error = EFAULT; 1560 break; 1561 } 1562 break; 1563 } 1564 case VM_SET_INTINFO: { 1565 struct vm_intinfo vmii; 1566 1567 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1568 error = EFAULT; 1569 break; 1570 } 1571 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1572 break; 1573 } 1574 case VM_GET_INTINFO: { 1575 struct vm_intinfo vmii; 1576 1577 vmii.vcpuid = vcpu; 1578 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1579 &vmii.info2); 1580 if (error == 0 && 1581 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1582 error = EFAULT; 1583 break; 1584 } 1585 break; 1586 } 1587 case VM_RTC_WRITE: { 1588 struct vm_rtc_data rtcdata; 1589 1590 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1591 error = EFAULT; 1592 break; 1593 } 1594 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1595 rtcdata.value); 1596 break; 1597 } 1598 case VM_RTC_READ: { 1599 struct vm_rtc_data rtcdata; 1600 1601 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1602 error = EFAULT; 1603 break; 1604 } 1605 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1606 &rtcdata.value); 1607 if (error == 0 && 1608 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1609 error = EFAULT; 1610 break; 1611 } 1612 break; 1613 } 1614 case VM_RTC_SETTIME: { 1615 timespec_t ts; 1616 1617 if (ddi_copyin(datap, &ts, sizeof (ts), md)) { 1618 error = EFAULT; 1619 break; 1620 } 1621 error = vrtc_set_time(sc->vmm_vm, &ts); 1622 break; 1623 } 1624 case VM_RTC_GETTIME: { 1625 timespec_t ts; 1626 1627 vrtc_get_time(sc->vmm_vm, &ts); 1628 if (ddi_copyout(&ts, datap, sizeof (ts), md)) { 1629 error = EFAULT; 1630 break; 1631 } 1632 break; 1633 } 1634 1635 case VM_PMTMR_LOCATE: { 1636 uint16_t port = arg; 1637 error = vpmtmr_set_location(sc->vmm_vm, port); 1638 break; 1639 } 1640 1641 case VM_RESTART_INSTRUCTION: 1642 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1643 break; 1644 1645 case VM_SET_TOPOLOGY: { 1646 struct vm_cpu_topology topo; 1647 1648 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1649 error = EFAULT; 1650 break; 1651 } 1652 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1653 topo.threads, topo.maxcpus); 1654 break; 1655 } 1656 case VM_GET_TOPOLOGY: { 1657 struct vm_cpu_topology topo; 1658 1659 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1660 &topo.threads, &topo.maxcpus); 1661 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1662 error = EFAULT; 1663 break; 1664 } 1665 break; 1666 } 1667 case VM_DEVMEM_GETOFFSET: { 1668 struct vm_devmem_offset vdo; 1669 vmm_devmem_entry_t *de; 1670 1671 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1672 error = EFAULT; 1673 break; 1674 } 1675 1676 de = vmmdev_devmem_find(sc, vdo.segid); 1677 if (de != NULL) { 1678 vdo.offset = de->vde_off; 1679 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1680 error = EFAULT; 1681 } 1682 } else { 1683 error = ENOENT; 1684 } 1685 break; 1686 } 1687 case VM_TRACK_DIRTY_PAGES: { 1688 const size_t max_track_region_len = 
8 * PAGESIZE * 8 * PAGESIZE; 1689 struct vmm_dirty_tracker tracker; 1690 uint8_t *bitmap; 1691 size_t len; 1692 1693 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1694 error = EFAULT; 1695 break; 1696 } 1697 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1698 error = EINVAL; 1699 break; 1700 } 1701 if (tracker.vdt_len == 0) { 1702 break; 1703 } 1704 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1705 error = EINVAL; 1706 break; 1707 } 1708 if (tracker.vdt_len > max_track_region_len) { 1709 error = EINVAL; 1710 break; 1711 } 1712 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1713 bitmap = kmem_zalloc(len, KM_SLEEP); 1714 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1715 tracker.vdt_len, bitmap); 1716 if (error == 0 && 1717 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1718 error = EFAULT; 1719 } 1720 kmem_free(bitmap, len); 1721 1722 break; 1723 } 1724 case VM_WRLOCK_CYCLE: { 1725 /* 1726 * Present a test mechanism to acquire/release the write lock 1727 * on the VM without any other effects. 1728 */ 1729 break; 1730 } 1731 case VM_DATA_READ: { 1732 struct vm_data_xfer vdx; 1733 1734 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1735 error = EFAULT; 1736 break; 1737 } 1738 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1739 error = EINVAL; 1740 break; 1741 } 1742 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1743 error = EFBIG; 1744 break; 1745 } 1746 1747 const size_t len = vdx.vdx_len; 1748 void *buf = NULL; 1749 if (len != 0) { 1750 const void *udata = vdx.vdx_data; 1751 1752 buf = kmem_alloc(len, KM_SLEEP); 1753 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) { 1754 bzero(buf, len); 1755 } else if (ddi_copyin(udata, buf, len, md) != 0) { 1756 kmem_free(buf, len); 1757 error = EFAULT; 1758 break; 1759 } 1760 } 1761 1762 vdx.vdx_result_len = 0; 1763 vmm_data_req_t req = { 1764 .vdr_class = vdx.vdx_class, 1765 .vdr_version = vdx.vdx_version, 1766 .vdr_flags = vdx.vdx_flags, 1767 .vdr_len = len, 1768 .vdr_data = buf, 1769 .vdr_result_len = &vdx.vdx_result_len, 1770 }; 1771 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1772 1773 if (error == 0 && buf != NULL) { 1774 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1775 error = EFAULT; 1776 } 1777 } 1778 1779 /* 1780 * Copy out the transfer request so that the value of 1781 * vdx_result_len can be made available, regardless of any 1782 * error(s) which may have occurred. 1783 */ 1784 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1785 error = (error != 0) ? 
error : EFAULT; 1786 } 1787 1788 if (buf != NULL) { 1789 kmem_free(buf, len); 1790 } 1791 break; 1792 } 1793 case VM_DATA_WRITE: { 1794 struct vm_data_xfer vdx; 1795 1796 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1797 error = EFAULT; 1798 break; 1799 } 1800 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1801 error = EINVAL; 1802 break; 1803 } 1804 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1805 error = EFBIG; 1806 break; 1807 } 1808 1809 const size_t len = vdx.vdx_len; 1810 void *buf = NULL; 1811 if (len != 0) { 1812 buf = kmem_alloc(len, KM_SLEEP); 1813 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1814 kmem_free(buf, len); 1815 error = EFAULT; 1816 break; 1817 } 1818 } 1819 1820 vdx.vdx_result_len = 0; 1821 vmm_data_req_t req = { 1822 .vdr_class = vdx.vdx_class, 1823 .vdr_version = vdx.vdx_version, 1824 .vdr_flags = vdx.vdx_flags, 1825 .vdr_len = len, 1826 .vdr_data = buf, 1827 .vdr_result_len = &vdx.vdx_result_len, 1828 }; 1829 if (vmm_allow_state_writes != 0) { 1830 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1831 &req); 1832 } else { 1833 /* 1834 * Reject the write if somone has thrown the switch back 1835 * into the "disallow" position. 1836 */ 1837 error = EPERM; 1838 } 1839 1840 if (error == 0 && buf != NULL && 1841 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1842 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1843 error = EFAULT; 1844 } 1845 } 1846 1847 /* 1848 * Copy out the transfer request so that the value of 1849 * vdx_result_len can be made available, regardless of any 1850 * error(s) which may have occurred. 1851 */ 1852 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1853 error = (error != 0) ? error : EFAULT; 1854 } 1855 1856 if (buf != NULL) { 1857 kmem_free(buf, len); 1858 } 1859 break; 1860 } 1861 1862 case VM_PAUSE: { 1863 error = vm_pause_instance(sc->vmm_vm); 1864 break; 1865 } 1866 case VM_RESUME: { 1867 error = vm_resume_instance(sc->vmm_vm); 1868 break; 1869 } 1870 1871 default: 1872 error = ENOTTY; 1873 break; 1874 } 1875 1876 /* Release exclusion resources */ 1877 switch (lock_type) { 1878 case LOCK_NONE: 1879 break; 1880 case LOCK_VCPU: 1881 vcpu_unlock_one(sc, vcpu); 1882 break; 1883 case LOCK_READ_HOLD: 1884 vmm_read_unlock(sc); 1885 break; 1886 case LOCK_WRITE_HOLD: 1887 vmm_write_unlock(sc); 1888 break; 1889 default: 1890 panic("unexpected lock type"); 1891 break; 1892 } 1893 1894 return (error); 1895 } 1896 1897 static vmm_softc_t * 1898 vmm_lookup(const char *name) 1899 { 1900 list_t *vml = &vmm_list; 1901 vmm_softc_t *sc; 1902 1903 ASSERT(MUTEX_HELD(&vmm_mtx)); 1904 1905 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1906 if (strcmp(sc->vmm_name, name) == 0) { 1907 break; 1908 } 1909 } 1910 1911 return (sc); 1912 } 1913 1914 /* 1915 * Acquire an HMA registration if not already held. 1916 */ 1917 static boolean_t 1918 vmm_hma_acquire(void) 1919 { 1920 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1921 1922 mutex_enter(&vmmdev_mtx); 1923 1924 if (vmmdev_hma_reg == NULL) { 1925 VERIFY3U(vmmdev_hma_ref, ==, 0); 1926 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1927 if (vmmdev_hma_reg == NULL) { 1928 cmn_err(CE_WARN, "%s HMA registration failed.", 1929 vmmdev_hvm_name); 1930 mutex_exit(&vmmdev_mtx); 1931 return (B_FALSE); 1932 } 1933 } 1934 1935 vmmdev_hma_ref++; 1936 1937 mutex_exit(&vmmdev_mtx); 1938 1939 return (B_TRUE); 1940 } 1941 1942 /* 1943 * Release the HMA registration if held and there are no remaining VMs. 
1944 */ 1945 static void 1946 vmm_hma_release(void) 1947 { 1948 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1949 1950 mutex_enter(&vmmdev_mtx); 1951 1952 VERIFY3U(vmmdev_hma_ref, !=, 0); 1953 1954 vmmdev_hma_ref--; 1955 1956 if (vmmdev_hma_ref == 0) { 1957 VERIFY(vmmdev_hma_reg != NULL); 1958 hma_unregister(vmmdev_hma_reg); 1959 vmmdev_hma_reg = NULL; 1960 } 1961 mutex_exit(&vmmdev_mtx); 1962 } 1963 1964 static int 1965 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1966 { 1967 vmm_softc_t *sc = NULL; 1968 minor_t minor; 1969 int error = ENOMEM; 1970 size_t len; 1971 const char *name = req->name; 1972 1973 len = strnlen(name, VM_MAX_NAMELEN); 1974 if (len == 0) { 1975 return (EINVAL); 1976 } 1977 if (len >= VM_MAX_NAMELEN) { 1978 return (ENAMETOOLONG); 1979 } 1980 if (strchr(name, '/') != NULL) { 1981 return (EINVAL); 1982 } 1983 1984 if (!vmm_hma_acquire()) 1985 return (ENXIO); 1986 1987 mutex_enter(&vmm_mtx); 1988 1989 /* Look for duplicate names */ 1990 if (vmm_lookup(name) != NULL) { 1991 mutex_exit(&vmm_mtx); 1992 vmm_hma_release(); 1993 return (EEXIST); 1994 } 1995 1996 /* Allow only one instance per non-global zone. */ 1997 if (!INGLOBALZONE(curproc)) { 1998 for (sc = list_head(&vmm_list); sc != NULL; 1999 sc = list_next(&vmm_list, sc)) { 2000 if (sc->vmm_zone == curzone) { 2001 mutex_exit(&vmm_mtx); 2002 vmm_hma_release(); 2003 return (EINVAL); 2004 } 2005 } 2006 } 2007 2008 minor = id_alloc(vmm_minors); 2009 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 2010 goto fail; 2011 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2012 ddi_soft_state_free(vmm_statep, minor); 2013 goto fail; 2014 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 2015 DDI_PSEUDO, 0) != DDI_SUCCESS) { 2016 goto fail; 2017 } 2018 2019 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 2020 goto fail; 2021 } 2022 2023 error = vm_create(req->flags, &sc->vmm_vm); 2024 if (error == 0) { 2025 /* Complete VM intialization and report success. */ 2026 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 2027 sc->vmm_minor = minor; 2028 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 2029 offsetof(vmm_devmem_entry_t, vde_node)); 2030 2031 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 2032 offsetof(vmm_hold_t, vmh_node)); 2033 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 2034 2035 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 2036 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 2037 offsetof(vmm_lease_t, vml_node)); 2038 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 2039 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 2040 2041 sc->vmm_zone = crgetzone(cr); 2042 zone_hold(sc->vmm_zone); 2043 vmm_zsd_add_vm(sc); 2044 vmm_kstat_init(sc); 2045 2046 list_insert_tail(&vmm_list, sc); 2047 mutex_exit(&vmm_mtx); 2048 return (0); 2049 } 2050 2051 vmm_kstat_fini(sc); 2052 ddi_remove_minor_node(vmmdev_dip, name); 2053 fail: 2054 id_free(vmm_minors, minor); 2055 if (sc != NULL) { 2056 ddi_soft_state_free(vmm_statep, minor); 2057 } 2058 mutex_exit(&vmm_mtx); 2059 vmm_hma_release(); 2060 2061 return (error); 2062 } 2063 2064 /* 2065 * Bhyve 'Driver' Interface 2066 * 2067 * While many devices are emulated in the bhyve userspace process, there are 2068 * others with performance constraints which require that they run mostly or 2069 * entirely in-kernel. For those not integrated directly into bhyve, an API is 2070 * needed so they can query/manipulate the portions of VM state needed to 2071 * fulfill their purpose. 
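 *
 * Before enumerating what the interface covers, a rough sketch of the
 * expected call pattern (hypothetical consumer code; error handling and
 * the consumer-supplied 'expire_cb'/'arg' are elided):
 *
 *     vmm_hold_t *hold;
 *     vmm_lease_t *lease;
 *     vmm_page_t *pg;
 *
 *     if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *             lease = vmm_drv_lease_sign(hold, expire_cb, arg);
 *             pg = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *             (void) vmm_drv_msi(lease, msi_addr, msi_msg);
 *             vmm_drv_page_release(pg);
 *             vmm_drv_lease_break(hold, lease);
 *             vmm_drv_rele(hold);
 *     }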
2072 * 2073 * This includes: 2074 * - Translating guest-physical addresses to host-virtual pointers 2075 * - Injecting MSIs 2076 * - Hooking IO port addresses 2077 * 2078 * The vmm_drv interface exists to provide that functionality to its consumers. 2079 * (At this time, 'viona' is the only user) 2080 */ 2081 int 2082 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2083 { 2084 vnode_t *vp = fp->f_vnode; 2085 const dev_t dev = vp->v_rdev; 2086 vmm_softc_t *sc; 2087 vmm_hold_t *hold; 2088 int err = 0; 2089 2090 if (vp->v_type != VCHR) { 2091 return (ENXIO); 2092 } 2093 const major_t major = getmajor(dev); 2094 const minor_t minor = getminor(dev); 2095 2096 mutex_enter(&vmmdev_mtx); 2097 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2098 mutex_exit(&vmmdev_mtx); 2099 return (ENOENT); 2100 } 2101 mutex_enter(&vmm_mtx); 2102 mutex_exit(&vmmdev_mtx); 2103 2104 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2105 err = ENOENT; 2106 goto out; 2107 } 2108 /* XXXJOY: check cred permissions against instance */ 2109 2110 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2111 err = EBUSY; 2112 goto out; 2113 } 2114 2115 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2116 hold->vmh_sc = sc; 2117 hold->vmh_release_req = B_FALSE; 2118 2119 list_insert_tail(&sc->vmm_holds, hold); 2120 sc->vmm_flags |= VMM_HELD; 2121 *holdp = hold; 2122 2123 out: 2124 mutex_exit(&vmm_mtx); 2125 return (err); 2126 } 2127 2128 void 2129 vmm_drv_rele(vmm_hold_t *hold) 2130 { 2131 vmm_softc_t *sc; 2132 bool hma_release = false; 2133 2134 ASSERT(hold != NULL); 2135 ASSERT(hold->vmh_sc != NULL); 2136 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2137 2138 mutex_enter(&vmm_mtx); 2139 sc = hold->vmh_sc; 2140 list_remove(&sc->vmm_holds, hold); 2141 kmem_free(hold, sizeof (*hold)); 2142 2143 if (list_is_empty(&sc->vmm_holds)) { 2144 sc->vmm_flags &= ~VMM_HELD; 2145 2146 /* 2147 * Since outstanding holds would prevent instance destruction 2148 * from completing, attempt to finish it now if it was already 2149 * set in motion. 
2150 */ 2151 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2152 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2153 &hma_release)); 2154 } 2155 } 2156 mutex_exit(&vmm_mtx); 2157 2158 if (hma_release) { 2159 vmm_hma_release(); 2160 } 2161 } 2162 2163 boolean_t 2164 vmm_drv_release_reqd(vmm_hold_t *hold) 2165 { 2166 ASSERT(hold != NULL); 2167 2168 return (hold->vmh_release_req); 2169 } 2170 2171 vmm_lease_t * 2172 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2173 { 2174 vmm_softc_t *sc = hold->vmh_sc; 2175 vmm_lease_t *lease; 2176 2177 ASSERT3P(expiref, !=, NULL); 2178 2179 if (hold->vmh_release_req) { 2180 return (NULL); 2181 } 2182 2183 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2184 list_link_init(&lease->vml_node); 2185 lease->vml_expire_func = expiref; 2186 lease->vml_expire_arg = arg; 2187 lease->vml_expired = B_FALSE; 2188 lease->vml_break_deferred = B_FALSE; 2189 lease->vml_hold = hold; 2190 /* cache the VM pointer for one less pointer chase */ 2191 lease->vml_vm = sc->vmm_vm; 2192 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2193 2194 mutex_enter(&sc->vmm_lease_lock); 2195 while (sc->vmm_lease_blocker != 0) { 2196 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2197 } 2198 list_insert_tail(&sc->vmm_lease_list, lease); 2199 vmm_read_lock(sc); 2200 mutex_exit(&sc->vmm_lease_lock); 2201 2202 return (lease); 2203 } 2204 2205 static void 2206 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2207 { 2208 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2209 2210 list_remove(&sc->vmm_lease_list, lease); 2211 vmm_read_unlock(sc); 2212 vmc_destroy(lease->vml_vmclient); 2213 kmem_free(lease, sizeof (*lease)); 2214 } 2215 2216 static void 2217 vmm_lease_block(vmm_softc_t *sc) 2218 { 2219 mutex_enter(&sc->vmm_lease_lock); 2220 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2221 sc->vmm_lease_blocker++; 2222 if (sc->vmm_lease_blocker == 1) { 2223 list_t *list = &sc->vmm_lease_list; 2224 vmm_lease_t *lease = list_head(list); 2225 2226 while (lease != NULL) { 2227 void *arg = lease->vml_expire_arg; 2228 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2229 boolean_t sync_break = B_FALSE; 2230 2231 /* 2232 * Since the lease expiration notification may 2233 * need to take locks which would deadlock with 2234 * vmm_lease_lock, drop it across the call. 2235 * 2236 * We are the only one allowed to manipulate 2237 * vmm_lease_list right now, so it is safe to 2238 * continue iterating through it after 2239 * reacquiring the lock. 2240 */ 2241 lease->vml_expired = B_TRUE; 2242 mutex_exit(&sc->vmm_lease_lock); 2243 sync_break = expiref(arg); 2244 mutex_enter(&sc->vmm_lease_lock); 2245 2246 if (sync_break) { 2247 vmm_lease_t *next; 2248 2249 /* 2250 * These leases which are synchronously broken 2251 * result in vmm_read_unlock() calls from a 2252 * different thread than the corresponding 2253 * vmm_read_lock(). This is acceptable, given 2254 * that the rwlock underpinning the whole 2255 * mechanism tolerates the behavior. This 2256 * flexibility is _only_ afforded to VM read 2257 * lock (RW_READER) holders. 2258 */ 2259 next = list_next(list, lease); 2260 vmm_lease_break_locked(sc, lease); 2261 lease = next; 2262 } else { 2263 lease = list_next(list, lease); 2264 } 2265 } 2266 2267 /* Process leases which were not broken synchronously. */ 2268 while (!list_is_empty(list)) { 2269 /* 2270 * Although the nested loops are quadratic, the number 2271 * of leases is small. 
2272 */ 2273 lease = list_head(list); 2274 while (lease != NULL) { 2275 vmm_lease_t *next = list_next(list, lease); 2276 if (lease->vml_break_deferred) { 2277 vmm_lease_break_locked(sc, lease); 2278 } 2279 lease = next; 2280 } 2281 if (list_is_empty(list)) { 2282 break; 2283 } 2284 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2285 } 2286 /* Wake anyone else waiting for the lease list to be empty */ 2287 cv_broadcast(&sc->vmm_lease_cv); 2288 } else { 2289 list_t *list = &sc->vmm_lease_list; 2290 2291 /* 2292 * Some other thread beat us to the duty of lease cleanup. 2293 * Wait until that is complete. 2294 */ 2295 while (!list_is_empty(list)) { 2296 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2297 } 2298 } 2299 mutex_exit(&sc->vmm_lease_lock); 2300 } 2301 2302 static void 2303 vmm_lease_unblock(vmm_softc_t *sc) 2304 { 2305 mutex_enter(&sc->vmm_lease_lock); 2306 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2307 sc->vmm_lease_blocker--; 2308 if (sc->vmm_lease_blocker == 0) { 2309 cv_broadcast(&sc->vmm_lease_cv); 2310 } 2311 mutex_exit(&sc->vmm_lease_lock); 2312 } 2313 2314 void 2315 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2316 { 2317 vmm_softc_t *sc = hold->vmh_sc; 2318 2319 VERIFY3P(hold, ==, lease->vml_hold); 2320 VERIFY(!lease->vml_break_deferred); 2321 2322 mutex_enter(&sc->vmm_lease_lock); 2323 if (sc->vmm_lease_blocker == 0) { 2324 vmm_lease_break_locked(sc, lease); 2325 } else { 2326 /* 2327 * Defer the lease-breaking to whichever thread is currently 2328 * cleaning up all leases as part of a vmm_lease_block() call. 2329 */ 2330 lease->vml_break_deferred = B_TRUE; 2331 cv_broadcast(&sc->vmm_lease_cv); 2332 } 2333 mutex_exit(&sc->vmm_lease_lock); 2334 } 2335 2336 boolean_t 2337 vmm_drv_lease_expired(vmm_lease_t *lease) 2338 { 2339 return (lease->vml_expired); 2340 } 2341 2342 vmm_page_t * 2343 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2344 { 2345 ASSERT(lease != NULL); 2346 ASSERT0(gpa & PAGEOFFSET); 2347 2348 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2349 } 2350 2351 2352 /* Ensure that flags mirrored by vmm_drv interface properly match up */ 2353 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY); 2354 2355 vmm_page_t * 2356 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags) 2357 { 2358 ASSERT(lease != NULL); 2359 ASSERT0(gpa & PAGEOFFSET); 2360 2361 vmm_page_t *page = 2362 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags); 2363 return (page); 2364 } 2365 2366 void 2367 vmm_drv_page_release(vmm_page_t *vmmp) 2368 { 2369 (void) vmp_release((vm_page_t *)vmmp); 2370 } 2371 2372 void 2373 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2374 { 2375 (void) vmp_release_chain((vm_page_t *)vmmp); 2376 } 2377 2378 const void * 2379 vmm_drv_page_readable(const vmm_page_t *vmmp) 2380 { 2381 return (vmp_get_readable((const vm_page_t *)vmmp)); 2382 } 2383 2384 void * 2385 vmm_drv_page_writable(const vmm_page_t *vmmp) 2386 { 2387 return (vmp_get_writable((const vm_page_t *)vmmp)); 2388 } 2389 2390 void 2391 vmm_drv_page_mark_dirty(vmm_page_t *vmmp) 2392 { 2393 return (vmp_mark_dirty((vm_page_t *)vmmp)); 2394 } 2395 2396 void 2397 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2398 { 2399 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2400 } 2401 2402 vmm_page_t * 2403 vmm_drv_page_next(const vmm_page_t *vmmp) 2404 { 2405 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2406 } 2407 2408 int 2409 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2410 { 2411 ASSERT(lease 
!= NULL); 2412 2413 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2414 } 2415 2416 int 2417 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2418 void *arg, void **cookie) 2419 { 2420 vmm_softc_t *sc; 2421 int err; 2422 2423 ASSERT(hold != NULL); 2424 ASSERT(cookie != NULL); 2425 2426 sc = hold->vmh_sc; 2427 mutex_enter(&vmm_mtx); 2428 /* Confirm that hook installation is not blocked */ 2429 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2430 mutex_exit(&vmm_mtx); 2431 return (EBUSY); 2432 } 2433 /* 2434 * Optimistically record an installed hook which will prevent a block 2435 * from being asserted while the mutex is dropped. 2436 */ 2437 hold->vmh_ioport_hook_cnt++; 2438 mutex_exit(&vmm_mtx); 2439 2440 vmm_write_lock(sc); 2441 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2442 arg, cookie); 2443 vmm_write_unlock(sc); 2444 2445 if (err != 0) { 2446 mutex_enter(&vmm_mtx); 2447 /* Walk back optimism about the hook installation */ 2448 hold->vmh_ioport_hook_cnt--; 2449 mutex_exit(&vmm_mtx); 2450 } 2451 return (err); 2452 } 2453 2454 void 2455 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2456 { 2457 vmm_softc_t *sc; 2458 2459 ASSERT(hold != NULL); 2460 ASSERT(cookie != NULL); 2461 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2462 2463 sc = hold->vmh_sc; 2464 vmm_write_lock(sc); 2465 vm_ioport_unhook(sc->vmm_vm, cookie); 2466 vmm_write_unlock(sc); 2467 2468 mutex_enter(&vmm_mtx); 2469 hold->vmh_ioport_hook_cnt--; 2470 mutex_exit(&vmm_mtx); 2471 } 2472 2473 static void 2474 vmm_drv_purge(vmm_softc_t *sc) 2475 { 2476 ASSERT(MUTEX_HELD(&vmm_mtx)); 2477 2478 if ((sc->vmm_flags & VMM_HELD) != 0) { 2479 vmm_hold_t *hold; 2480 2481 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2482 hold = list_next(&sc->vmm_holds, hold)) { 2483 hold->vmh_release_req = B_TRUE; 2484 } 2485 2486 /* 2487 * Require that all leases on the instance be broken, now that 2488 * all associated holds have been marked as needing release. 2489 * 2490 * Dropping vmm_mtx is not strictly necessary, but if any of the 2491 * lessees are slow to respond, it would be nice to leave it 2492 * available for other parties. 2493 */ 2494 mutex_exit(&vmm_mtx); 2495 vmm_lease_block(sc); 2496 vmm_lease_unblock(sc); 2497 mutex_enter(&vmm_mtx); 2498 } 2499 } 2500 2501 static int 2502 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2503 { 2504 int err = 0; 2505 2506 mutex_enter(&vmm_mtx); 2507 if (!enable_block) { 2508 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2509 2510 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2511 goto done; 2512 } 2513 2514 /* If any holds have hooks installed, the block is a failure */ 2515 if (!list_is_empty(&sc->vmm_holds)) { 2516 vmm_hold_t *hold; 2517 2518 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2519 hold = list_next(&sc->vmm_holds, hold)) { 2520 if (hold->vmh_ioport_hook_cnt != 0) { 2521 err = EBUSY; 2522 goto done; 2523 } 2524 } 2525 } 2526 sc->vmm_flags |= VMM_BLOCK_HOOK; 2527 2528 done: 2529 mutex_exit(&vmm_mtx); 2530 return (err); 2531 } 2532 2533 2534 static void 2535 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts) 2536 { 2537 ASSERT(MUTEX_HELD(&vmm_mtx)); 2538 ASSERT0(sc->vmm_flags & VMM_DESTROY); 2539 2540 sc->vmm_flags |= VMM_DESTROY; 2541 2542 /* 2543 * Lock and unlock all of the vCPUs to ensure that they are kicked out 2544 * of guest context, being unable to return now that the instance is 2545 * marked for destruction. 
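 *
 * Later in this function, vmm_drv_purge() flags every outstanding
 * hold for release and expires all leases, so a well-behaved
 * consumer is expected to notice and back out, roughly (hypothetical
 * sketch, 'cookie' being whatever vmm_drv_ioport_hook() returned):
 *
 *	if (vmm_drv_release_reqd(hold)) {
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *		vmm_drv_rele(hold);
 *	}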
2546 */ 2547 const int maxcpus = vm_get_maxcpus(sc->vmm_vm); 2548 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 2549 vcpu_lock_one(sc, vcpu); 2550 vcpu_unlock_one(sc, vcpu); 2551 } 2552 2553 vmmdev_devmem_purge(sc); 2554 if ((opts & VDO_NO_CLEAN_ZSD) == 0) { 2555 /* 2556 * The ZSD should be cleaned up now, unless destruction of the 2557 * instance was initiated by destruction of the containing zone, 2558 * in which case the ZSD has already been removed. 2559 */ 2560 vmm_zsd_rem_vm(sc); 2561 } 2562 zone_rele(sc->vmm_zone); 2563 2564 vmm_drv_purge(sc); 2565 } 2566 2567 static bool 2568 vmm_destroy_ready(vmm_softc_t *sc) 2569 { 2570 ASSERT(MUTEX_HELD(&vmm_mtx)); 2571 2572 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) { 2573 VERIFY(list_is_empty(&sc->vmm_holds)); 2574 return (true); 2575 } 2576 2577 return (false); 2578 } 2579 2580 static void 2581 vmm_destroy_finish(vmm_softc_t *sc) 2582 { 2583 ASSERT(MUTEX_HELD(&vmm_mtx)); 2584 ASSERT(vmm_destroy_ready(sc)); 2585 2586 list_remove(&vmm_list, sc); 2587 vmm_kstat_fini(sc); 2588 vm_destroy(sc->vmm_vm); 2589 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2590 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE); 2591 2592 const minor_t minor = sc->vmm_minor; 2593 ddi_soft_state_free(vmm_statep, minor); 2594 id_free(vmm_minors, minor); 2595 } 2596 2597 /* 2598 * Initiate or attempt to finish destruction of a VMM instance. 2599 * 2600 * This is called from several contexts: 2601 * - An explicit destroy ioctl is made 2602 * - A vmm_drv consumer releases its hold (being the last on the instance) 2603 * - The vmm device is closed, and auto-destruct is enabled 2604 */ 2605 static int 2606 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts, 2607 bool *hma_release) 2608 { 2609 ASSERT(MUTEX_HELD(&vmm_mtx)); 2610 2611 *hma_release = false; 2612 2613 /* 2614 * When instance destruction begins, it is marked such that any 2615 * further requests to operate on the instance will fail. 2616 */ 2617 if ((sc->vmm_flags & VMM_DESTROY) == 0) { 2618 vmm_destroy_begin(sc, opts); 2619 } 2620 2621 if (vmm_destroy_ready(sc)) { 2622 2623 /* 2624 * Notify anyone waiting for the destruction to finish. They 2625 * must be clear before we can safely tear down the softc. 2626 */ 2627 if (sc->vmm_destroy_waiters != 0) { 2628 cv_broadcast(&sc->vmm_cv); 2629 while (sc->vmm_destroy_waiters != 0) { 2630 cv_wait(&sc->vmm_cv, &vmm_mtx); 2631 } 2632 } 2633 2634 /* 2635 * Finish destruction of instance. After this point, the softc 2636 * is freed and cannot be accessed again. 2637 * 2638 * With destruction complete, the HMA hold can be released. 2639 */ 2640 vmm_destroy_finish(sc); 2641 *hma_release = true; 2642 return (0); 2643 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) { 2644 int err = 0; 2645 2646 sc->vmm_destroy_waiters++; 2647 while (!vmm_destroy_ready(sc) && err == 0) { 2648 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2649 err = EINTR; 2650 } 2651 } 2652 sc->vmm_destroy_waiters--; 2653 2654 if (sc->vmm_destroy_waiters == 0) { 2655 /* 2656 * If we were the last waiter, it could be that VM 2657 * destruction is waiting on _us_ to proceed with the 2658 * final clean-up. 2659 */ 2660 cv_signal(&sc->vmm_cv); 2661 } 2662 return (err); 2663 } else { 2664 /* 2665 * Since the instance is not ready for destruction, and the 2666 * caller did not ask to wait, consider it a success for now.
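 *
 * (The destroy will instead be completed by whichever event removes
 * the final obstacle: vmm_drv_rele() dropping the last hold, or
 * vmm_close() clearing VMM_IS_OPEN, each of which re-enters
 * vmm_destroy_locked() to finish the job.)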
2667 */ 2668 return (0); 2669 } 2670 } 2671 2672 void 2673 vmm_zone_vm_destroy(vmm_softc_t *sc) 2674 { 2675 bool hma_release = false; 2676 int err; 2677 2678 mutex_enter(&vmm_mtx); 2679 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2680 mutex_exit(&vmm_mtx); 2681 2682 VERIFY0(err); 2683 2684 if (hma_release) { 2685 vmm_hma_release(); 2686 } 2687 } 2688 2689 static int 2690 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2691 { 2692 vmm_softc_t *sc; 2693 bool hma_release = false; 2694 int err; 2695 2696 if (crgetuid(cr) != 0) { 2697 return (EPERM); 2698 } 2699 2700 mutex_enter(&vmm_mtx); 2701 sc = vmm_lookup(req->name); 2702 if (sc == NULL) { 2703 mutex_exit(&vmm_mtx); 2704 return (ENOENT); 2705 } 2706 /* 2707 * We don't check this in vmm_lookup() since that function is also used 2708 * for validation during create and currently vmm names must be unique. 2709 */ 2710 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2711 mutex_exit(&vmm_mtx); 2712 return (EPERM); 2713 } 2714 2715 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2716 mutex_exit(&vmm_mtx); 2717 2718 if (hma_release) { 2719 vmm_hma_release(); 2720 } 2721 2722 return (err); 2723 } 2724 2725 #define VCPU_NAME_BUFLEN 32 2726 2727 static int 2728 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2729 { 2730 zoneid_t zid = crgetzoneid(cr); 2731 int instance = minor; 2732 kstat_t *ksp; 2733 2734 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2735 2736 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2737 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2738 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2739 2740 if (ksp == NULL) { 2741 return (-1); 2742 } 2743 sc->vmm_kstat_vm = ksp; 2744 2745 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2746 char namebuf[VCPU_NAME_BUFLEN]; 2747 2748 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2749 2750 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2751 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2752 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2753 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2754 0, zid); 2755 if (ksp == NULL) { 2756 goto fail; 2757 } 2758 2759 sc->vmm_kstat_vcpu[i] = ksp; 2760 } 2761 2762 /* 2763 * If this instance is associated with a non-global zone, make its 2764 * kstats visible from the GZ. 
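 *
 * (Illustration, under the assumption that VMM_MODULE_NAME expands
 * to "vmm": once installed, the per-vCPU counters created here can
 * be read from the global zone with something like
 *
 *	kstat -m vmm -i <instance> -n vcpu0
 *
 * even when the instance itself belongs to a non-global zone.)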
2765 */ 2766 if (zid != GLOBAL_ZONEID) { 2767 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2768 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2769 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2770 } 2771 } 2772 2773 return (0); 2774 2775 fail: 2776 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2777 if (sc->vmm_kstat_vcpu[i] != NULL) { 2778 kstat_delete(sc->vmm_kstat_vcpu[i]); 2779 sc->vmm_kstat_vcpu[i] = NULL; 2780 } else { 2781 break; 2782 } 2783 } 2784 kstat_delete(sc->vmm_kstat_vm); 2785 sc->vmm_kstat_vm = NULL; 2786 return (-1); 2787 } 2788 2789 static void 2790 vmm_kstat_init(vmm_softc_t *sc) 2791 { 2792 kstat_t *ksp; 2793 2794 ASSERT3P(sc->vmm_vm, !=, NULL); 2795 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2796 2797 ksp = sc->vmm_kstat_vm; 2798 vmm_kstats_t *vk = ksp->ks_data; 2799 ksp->ks_private = sc->vmm_vm; 2800 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2801 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2802 2803 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2804 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2805 2806 ksp = sc->vmm_kstat_vcpu[i]; 2807 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2808 2809 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2810 vvk->vvk_vcpu.value.ui32 = i; 2811 kstat_named_init(&vvk->vvk_time_init, "time_init", 2812 KSTAT_DATA_UINT64); 2813 kstat_named_init(&vvk->vvk_time_run, "time_run", 2814 KSTAT_DATA_UINT64); 2815 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2816 KSTAT_DATA_UINT64); 2817 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2818 KSTAT_DATA_UINT64); 2819 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2820 KSTAT_DATA_UINT64); 2821 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2822 KSTAT_DATA_UINT64); 2823 ksp->ks_private = sc->vmm_vm; 2824 ksp->ks_update = vmm_kstat_update_vcpu; 2825 } 2826 2827 kstat_install(sc->vmm_kstat_vm); 2828 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2829 kstat_install(sc->vmm_kstat_vcpu[i]); 2830 } 2831 } 2832 2833 static void 2834 vmm_kstat_fini(vmm_softc_t *sc) 2835 { 2836 ASSERT(sc->vmm_kstat_vm != NULL); 2837 2838 kstat_delete(sc->vmm_kstat_vm); 2839 sc->vmm_kstat_vm = NULL; 2840 2841 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2842 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2843 2844 kstat_delete(sc->vmm_kstat_vcpu[i]); 2845 sc->vmm_kstat_vcpu[i] = NULL; 2846 } 2847 } 2848 2849 static int 2850 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2851 { 2852 minor_t minor; 2853 vmm_softc_t *sc; 2854 2855 /* 2856 * Forbid running bhyve in a 32-bit process until it has been tested and 2857 * verified to be safe. 2858 */ 2859 if (curproc->p_model != DATAMODEL_LP64) { 2860 return (EFBIG); 2861 } 2862 2863 minor = getminor(*devp); 2864 if (minor == VMM_CTL_MINOR) { 2865 /* 2866 * Master control device must be opened exclusively. 
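 *
 * (A hedged usage sketch, assuming the control node is exposed to
 * userspace as /dev/vmmctl:
 *
 *	int ctl = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *
 * Anything short of an exclusive character-device open is rejected
 * by the FEXCL/OTYP_CHR check below.)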
2867 */ 2868 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2869 return (EINVAL); 2870 } 2871 2872 return (0); 2873 } 2874 2875 mutex_enter(&vmm_mtx); 2876 sc = ddi_get_soft_state(vmm_statep, minor); 2877 if (sc == NULL) { 2878 mutex_exit(&vmm_mtx); 2879 return (ENXIO); 2880 } 2881 2882 sc->vmm_flags |= VMM_IS_OPEN; 2883 mutex_exit(&vmm_mtx); 2884 2885 return (0); 2886 } 2887 2888 static int 2889 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2890 { 2891 const minor_t minor = getminor(dev); 2892 vmm_softc_t *sc; 2893 bool hma_release = false; 2894 2895 if (minor == VMM_CTL_MINOR) { 2896 return (0); 2897 } 2898 2899 mutex_enter(&vmm_mtx); 2900 sc = ddi_get_soft_state(vmm_statep, minor); 2901 if (sc == NULL) { 2902 mutex_exit(&vmm_mtx); 2903 return (ENXIO); 2904 } 2905 2906 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0); 2907 sc->vmm_flags &= ~VMM_IS_OPEN; 2908 2909 /* 2910 * If the instance was marked for auto-destruction, begin that now. 2911 * Instance destruction may have been initiated already, so try to make 2912 * progress in that case, since closure of the device is one of its requirements. 2913 */ 2914 if ((sc->vmm_flags & VMM_DESTROY) != 0 || 2915 (sc->vmm_flags & VMM_AUTODESTROY) != 0) { 2916 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 2917 } 2918 mutex_exit(&vmm_mtx); 2919 2920 if (hma_release) { 2921 vmm_hma_release(); 2922 } 2923 2924 return (0); 2925 } 2926 2927 static int 2928 vmm_is_supported(intptr_t arg) 2929 { 2930 int r; 2931 const char *msg; 2932 2933 if (vmm_is_intel()) { 2934 r = vmx_x86_supported(&msg); 2935 } else if (vmm_is_svm()) { 2936 /* 2937 * HMA already ensured that the features necessary for SVM 2938 * operation were present and online during vmm_attach(). 2939 */ 2940 r = 0; 2941 } else { 2942 r = ENXIO; 2943 msg = "Unsupported CPU vendor"; 2944 } 2945 2946 if (r != 0 && arg != (intptr_t)NULL) { 2947 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2948 return (EFAULT); 2949 } 2950 return (r); 2951 } 2952 2953 static int 2954 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2955 { 2956 void *argp = (void *)arg; 2957 2958 switch (cmd) { 2959 case VMM_CREATE_VM: { 2960 struct vm_create_req req; 2961 2962 if ((md & FWRITE) == 0) { 2963 return (EPERM); 2964 } 2965 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2966 return (EFAULT); 2967 } 2968 return (vmmdev_do_vm_create(&req, cr)); 2969 } 2970 case VMM_DESTROY_VM: { 2971 struct vm_destroy_req req; 2972 2973 if ((md & FWRITE) == 0) { 2974 return (EPERM); 2975 } 2976 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2977 return (EFAULT); 2978 } 2979 return (vmmdev_do_vm_destroy(&req, cr)); 2980 } 2981 case VMM_VM_SUPPORTED: 2982 return (vmm_is_supported(arg)); 2983 case VMM_CHECK_IOMMU: 2984 if (!vmm_check_iommu()) { 2985 return (ENXIO); 2986 } 2987 return (0); 2988 case VMM_RESV_QUERY: 2989 case VMM_RESV_SET_TARGET: 2990 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2991 default: 2992 break; 2993 } 2994 /* No other actions are legal on ctl device */ 2995 return (ENOTTY); 2996 } 2997 2998 static int 2999 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 3000 int *rvalp) 3001 { 3002 vmm_softc_t *sc; 3003 minor_t minor; 3004 3005 /* 3006 * Forbid running bhyve in a 32-bit process until it has been tested and 3007 * verified to be safe.
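 *
 * (As an aside, an illustrative sketch of driving the control-device
 * ioctls handled by vmm_ctl_ioctl() above; the request-structure
 * field names are assumptions for illustration, not verified here:
 *
 *	struct vm_create_req req = { 0 };
 *	(void) strlcpy(req.name, "testvm", sizeof (req.name));
 *	if (ioctl(ctl, VMM_CREATE_VM, &req) != 0)
 *		err(EXIT_FAILURE, "VMM_CREATE_VM");
 *
 * The descriptor must have been opened for writing, per the FWRITE
 * checks above.)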
3008 */ 3009 if (curproc->p_model != DATAMODEL_LP64) { 3010 return (EFBIG); 3011 } 3012 3013 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 3014 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 3015 return (ENOTSUP); 3016 } 3017 3018 /* 3019 * Regardless of minor (vmmctl or instance), we respond to queries of 3020 * the interface version. 3021 */ 3022 if (cmd == VMM_INTERFACE_VERSION) { 3023 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 3024 return (0); 3025 } 3026 3027 minor = getminor(dev); 3028 3029 if (minor == VMM_CTL_MINOR) { 3030 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 3031 } 3032 3033 sc = ddi_get_soft_state(vmm_statep, minor); 3034 ASSERT(sc != NULL); 3035 3036 /* 3037 * Turn away any ioctls against an instance when it is being destroyed. 3038 * (Except for the ioctl inquiring about that destroy-in-progress.) 3039 */ 3040 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3041 if (cmd == VM_DESTROY_PENDING) { 3042 *rvalp = 1; 3043 return (0); 3044 } 3045 return (ENXIO); 3046 } 3047 3048 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3049 } 3050 3051 static int 3052 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3053 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3054 { 3055 vmm_softc_t *sc; 3056 const minor_t minor = getminor(dev); 3057 int err; 3058 3059 if (minor == VMM_CTL_MINOR) { 3060 return (ENODEV); 3061 } 3062 if (off < 0 || (off + len) <= 0) { 3063 return (EINVAL); 3064 } 3065 if ((prot & PROT_USER) == 0) { 3066 return (EACCES); 3067 } 3068 3069 sc = ddi_get_soft_state(vmm_statep, minor); 3070 ASSERT(sc); 3071 3072 if (sc->vmm_flags & VMM_DESTROY) 3073 return (ENXIO); 3074 3075 /* Grab read lock on the VM to prevent any changes to the memory map */ 3076 vmm_read_lock(sc); 3077 3078 if (off >= VM_DEVMEM_START) { 3079 int segid; 3080 off_t segoff; 3081 3082 /* Mapping a devmem "device" */ 3083 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3084 err = ENODEV; 3085 } else { 3086 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3087 addrp, prot, maxprot, flags); 3088 } 3089 } else { 3090 /* Mapping a part of the guest physical space */ 3091 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3092 maxprot, flags); 3093 } 3094 3095 vmm_read_unlock(sc); 3096 return (err); 3097 } 3098 3099 static sdev_plugin_validate_t 3100 vmm_sdev_validate(sdev_ctx_t ctx) 3101 { 3102 const char *name = sdev_ctx_name(ctx); 3103 vmm_softc_t *sc; 3104 sdev_plugin_validate_t ret; 3105 minor_t minor; 3106 3107 if (sdev_ctx_vtype(ctx) != VCHR) 3108 return (SDEV_VTOR_INVALID); 3109 3110 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3111 3112 mutex_enter(&vmm_mtx); 3113 if ((sc = vmm_lookup(name)) == NULL) 3114 ret = SDEV_VTOR_INVALID; 3115 else if (sc->vmm_minor != minor) 3116 ret = SDEV_VTOR_STALE; 3117 else 3118 ret = SDEV_VTOR_VALID; 3119 mutex_exit(&vmm_mtx); 3120 3121 return (ret); 3122 } 3123 3124 static int 3125 vmm_sdev_filldir(sdev_ctx_t ctx) 3126 { 3127 vmm_softc_t *sc; 3128 int ret; 3129 3130 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3131 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3132 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3133 return (EINVAL); 3134 } 3135 3136 mutex_enter(&vmm_mtx); 3137 ASSERT(vmmdev_dip != NULL); 3138 for (sc = list_head(&vmm_list); sc != NULL; 3139 sc = list_next(&vmm_list, sc)) { 3140 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3141 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3142 S_IFCHR | 0600, 3143 
makedevice(ddi_driver_major(vmmdev_dip), 3144 sc->vmm_minor)); 3145 } else { 3146 continue; 3147 } 3148 if (ret != 0 && ret != EEXIST) 3149 goto out; 3150 } 3151 3152 ret = 0; 3153 3154 out: 3155 mutex_exit(&vmm_mtx); 3156 return (ret); 3157 } 3158 3159 /* ARGSUSED */ 3160 static void 3161 vmm_sdev_inactive(sdev_ctx_t ctx) 3162 { 3163 } 3164 3165 static sdev_plugin_ops_t vmm_sdev_ops = { 3166 .spo_version = SDEV_PLUGIN_VERSION, 3167 .spo_flags = SDEV_PLUGIN_SUBDIR, 3168 .spo_validate = vmm_sdev_validate, 3169 .spo_filldir = vmm_sdev_filldir, 3170 .spo_inactive = vmm_sdev_inactive 3171 }; 3172 3173 /* ARGSUSED */ 3174 static int 3175 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3176 { 3177 int error; 3178 3179 switch (cmd) { 3180 case DDI_INFO_DEVT2DEVINFO: 3181 *result = (void *)vmmdev_dip; 3182 error = DDI_SUCCESS; 3183 break; 3184 case DDI_INFO_DEVT2INSTANCE: 3185 *result = (void *)0; 3186 error = DDI_SUCCESS; 3187 break; 3188 default: 3189 error = DDI_FAILURE; 3190 break; 3191 } 3192 return (error); 3193 } 3194 3195 static int 3196 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3197 { 3198 sdev_plugin_hdl_t sph; 3199 hma_reg_t *reg = NULL; 3200 boolean_t vmm_loaded = B_FALSE; 3201 3202 if (cmd != DDI_ATTACH) { 3203 return (DDI_FAILURE); 3204 } 3205 3206 mutex_enter(&vmmdev_mtx); 3207 /* Ensure we are not already attached. */ 3208 if (vmmdev_dip != NULL) { 3209 mutex_exit(&vmmdev_mtx); 3210 return (DDI_FAILURE); 3211 } 3212 3213 vmm_sol_glue_init(); 3214 3215 /* 3216 * Perform temporary HMA registration to determine if the system 3217 * is capable. 3218 */ 3219 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3220 goto fail; 3221 } else if (vmm_mod_load() != 0) { 3222 goto fail; 3223 } 3224 vmm_loaded = B_TRUE; 3225 hma_unregister(reg); 3226 reg = NULL; 3227 3228 /* Create control node. Other nodes will be created on demand. */ 3229 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3230 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3231 goto fail; 3232 } 3233 3234 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3235 if (sph == (sdev_plugin_hdl_t)NULL) { 3236 ddi_remove_minor_node(dip, NULL); 3237 goto fail; 3238 } 3239 3240 ddi_report_dev(dip); 3241 vmmdev_sdev_hdl = sph; 3242 vmmdev_dip = dip; 3243 mutex_exit(&vmmdev_mtx); 3244 return (DDI_SUCCESS); 3245 3246 fail: 3247 if (vmm_loaded) { 3248 VERIFY0(vmm_mod_unload()); 3249 } 3250 if (reg != NULL) { 3251 hma_unregister(reg); 3252 } 3253 vmm_sol_glue_cleanup(); 3254 mutex_exit(&vmmdev_mtx); 3255 return (DDI_FAILURE); 3256 } 3257 3258 static int 3259 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3260 { 3261 if (cmd != DDI_DETACH) { 3262 return (DDI_FAILURE); 3263 } 3264 3265 /* 3266 * Ensure that all resources have been cleaned up. 3267 * 3268 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3269 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3270 * devinfo locked as iommu_cleanup() tries to recursively lock each 3271 * devinfo, including our own, while holding vmmdev_mtx. 
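 *
 * Failing the detach with DDI_FAILURE in that case is harmless: the
 * driver simply remains attached, and a later detach attempt can
 * retry once vmmdev_mtx is no longer contended.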
3272 */ 3273 if (mutex_tryenter(&vmmdev_mtx) == 0) 3274 return (DDI_FAILURE); 3275 3276 mutex_enter(&vmm_mtx); 3277 if (!list_is_empty(&vmm_list)) { 3278 mutex_exit(&vmm_mtx); 3279 mutex_exit(&vmmdev_mtx); 3280 return (DDI_FAILURE); 3281 } 3282 mutex_exit(&vmm_mtx); 3283 3284 if (!vmmr_is_empty()) { 3285 mutex_exit(&vmmdev_mtx); 3286 return (DDI_FAILURE); 3287 } 3288 3289 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3290 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3291 mutex_exit(&vmmdev_mtx); 3292 return (DDI_FAILURE); 3293 } 3294 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3295 3296 /* Remove the control node. */ 3297 ddi_remove_minor_node(dip, "ctl"); 3298 vmmdev_dip = NULL; 3299 3300 VERIFY0(vmm_mod_unload()); 3301 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3302 vmm_sol_glue_cleanup(); 3303 3304 mutex_exit(&vmmdev_mtx); 3305 3306 return (DDI_SUCCESS); 3307 } 3308 3309 static struct cb_ops vmm_cb_ops = { 3310 vmm_open, 3311 vmm_close, 3312 nodev, /* strategy */ 3313 nodev, /* print */ 3314 nodev, /* dump */ 3315 nodev, /* read */ 3316 nodev, /* write */ 3317 vmm_ioctl, 3318 nodev, /* devmap */ 3319 nodev, /* mmap */ 3320 vmm_segmap, 3321 nochpoll, /* poll */ 3322 ddi_prop_op, 3323 NULL, 3324 D_NEW | D_MP | D_DEVMAP 3325 }; 3326 3327 static struct dev_ops vmm_ops = { 3328 DEVO_REV, 3329 0, 3330 vmm_info, 3331 nulldev, /* identify */ 3332 nulldev, /* probe */ 3333 vmm_attach, 3334 vmm_detach, 3335 nodev, /* reset */ 3336 &vmm_cb_ops, 3337 (struct bus_ops *)NULL 3338 }; 3339 3340 static struct modldrv modldrv = { 3341 &mod_driverops, 3342 "bhyve vmm", 3343 &vmm_ops 3344 }; 3345 3346 static struct modlinkage modlinkage = { 3347 MODREV_1, 3348 &modldrv, 3349 NULL 3350 }; 3351 3352 int 3353 _init(void) 3354 { 3355 int error; 3356 3357 sysinit(); 3358 3359 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3360 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3361 list_create(&vmm_list, sizeof (vmm_softc_t), 3362 offsetof(vmm_softc_t, vmm_node)); 3363 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3364 3365 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3366 if (error) { 3367 return (error); 3368 } 3369 3370 error = vmmr_init(); 3371 if (error) { 3372 ddi_soft_state_fini(&vmm_statep); 3373 return (error); 3374 } 3375 3376 vmm_zsd_init(); 3377 3378 error = mod_install(&modlinkage); 3379 if (error) { 3380 ddi_soft_state_fini(&vmm_statep); 3381 vmm_zsd_fini(); 3382 vmmr_fini(); 3383 } 3384 3385 return (error); 3386 } 3387 3388 int 3389 _fini(void) 3390 { 3391 int error; 3392 3393 error = mod_remove(&modlinkage); 3394 if (error) { 3395 return (error); 3396 } 3397 3398 vmm_zsd_fini(); 3399 vmmr_fini(); 3400 3401 ddi_soft_state_fini(&vmm_statep); 3402 3403 return (0); 3404 } 3405 3406 int 3407 _info(struct modinfo *modinfop) 3408 { 3409 return (mod_info(&modlinkage, modinfop)); 3410 } 3411
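
/*
 * For orientation only, a hedged end-to-end sketch of how userspace is
 * expected to drive this driver (device paths and request structures
 * are assumptions for illustration, not definitions from this file):
 *
 *	ctl = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	ioctl(ctl, VMM_CREATE_VM, &create_req);	    create the instance
 *	vmfd = open("/dev/vmm/<name>", O_RDWR);	    per-instance node
 *	...					    VM_* ioctls and mmap()
 *						    of guest memory/devmem
 *	ioctl(ctl, VMM_DESTROY_VM, &destroy_req);   tear the instance down
 *
 * The per-instance nodes under /dev/vmm are supplied by the sdev
 * plugin registered in vmm_attach(), while creation and destruction
 * are handled by vmm_ctl_ioctl() above.
 */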