/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */
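/*
 * Illustrative sketch (editor's addition, not driver logic): a path which
 * needs both locks follows the ordering described above, mirroring the way
 * vmm_drv_hold() below transitions between them:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	... examine driver-wide state (vmmdev_dip, etc.) ...
 *	mutex_enter(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *	... act on per-instance (vmm_softc_t) state ...
 *	mutex_exit(&vmm_mtx);
 */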

static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

/* temporary safety switch */
int vmm_allow_state_writes;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

struct vmm_lease {
	list_node_t		vml_node;
	struct vm		*vml_vm;
	vm_client_t		*vml_vmclient;
	boolean_t		vml_expired;
	boolean_t		vml_break_deferred;
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	struct vmm_hold		*vml_hold;
};

/* Options for vmm_destroy_locked */
typedef enum vmm_destroy_opts {
	VDO_DEFAULT		= 0,
	/*
	 * Indicate that zone-specific data associated with this VM should not
	 * be cleaned up as part of the destroy. Skipping ZSD clean-up is
	 * necessary when the VM is being destroyed as part of zone
	 * destruction, when said ZSD is already being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),
	/*
	 * Attempt to wait for VM destruction to complete. This is opt-in,
	 * since there are many normal conditions which could lead to
	 * destruction being stalled pending other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= (1 << 1),
} vmm_destroy_opts_t;

static void vmm_hma_release(void);
static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner. Direct access to the underlying vm memory
 * segments is exposed by placing them in a range of offsets beyond the normal
 * guest memory space. Userspace can query the appropriate offset to mmap()
 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
 */

static vmm_devmem_entry_t *
vmmdev_devmem_find(vmm_softc_t *sc, int segid)
{
	vmm_devmem_entry_t *ent = NULL;
	list_t *dl = &sc->vmm_devmem_list;

	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
		if (ent->vde_segid == segid) {
			return (ent);
		}
	}
	return (NULL);
}

static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
	    NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		vmm_devmem_entry_t *de;

		de = vmmdev_devmem_find(sc, mseg->segid);
		if (de != NULL) {
			(void) strlcpy(mseg->name, de->vde_name,
			    sizeof (mseg->name));
		}
	} else {
		bzero(mseg->name, sizeof (mseg->name));
	}

	return (error);
}

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
	off_t map_offset;
	vmm_devmem_entry_t *entry;

	if (list_is_empty(&sc->vmm_devmem_list)) {
		map_offset = VM_DEVMEM_START;
	} else {
		entry = list_tail(&sc->vmm_devmem_list);
		map_offset = entry->vde_off + entry->vde_len;
		if (map_offset < entry->vde_off) {
			/* Do not tolerate overflow */
			return (ERANGE);
		}
		/*
		 * XXXJOY: We could choose to search the list for duplicate
		 * names and toss an error. Since we're using the offset
		 * method for now, it does not make much of a difference.
		 */
	}

	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
	entry->vde_segid = mseg->segid;
	entry->vde_len = mseg->len;
	entry->vde_off = map_offset;
	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
	list_insert_tail(&sc->vmm_devmem_list, entry);

	return (0);
}
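/*
 * Illustrative sketch (editor's addition): from userspace, a devmem segment
 * created via VM_ALLOC_MEMSEG is mapped by first asking for its offset and
 * then mmap()ing the instance fd at that offset (struct fields per the
 * VM_DEVMEM_GETOFFSET handling below):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */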

static boolean_t
vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
    off_t *map_offp)
{
	list_t *dl = &sc->vmm_devmem_list;
	vmm_devmem_entry_t *de = NULL;
	const off_t map_end = off + len;

	VERIFY(off >= VM_DEVMEM_START);

	if (map_end < off) {
		/* No match on overflow */
		return (B_FALSE);
	}

	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
		const off_t item_end = de->vde_off + de->vde_len;

		if (de->vde_off <= off && item_end >= map_end) {
			*segidp = de->vde_segid;
			*map_offp = off - de->vde_off;
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

/*
 * When an instance is being destroyed, the devmem list of named memory objects
 * can be torn down, as no new mappings are allowed.
 */
static void
vmmdev_devmem_purge(vmm_softc_t *sc)
{
	vmm_devmem_entry_t *entry;

	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
		kmem_free(entry, sizeof (*entry));
	}
}

static int
vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem = true;

	if (VM_MEMSEG_NAME(mseg)) {
		sysmem = false;
	}
	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);

	if (error == 0) {
		/*
		 * Rather than create a whole fresh device from which userspace
		 * can mmap this segment, instead make it available at an
		 * offset above where the main guest memory resides.
		 */
		error = vmmdev_devmem_create(sc, mseg, mseg->name);
		if (error != 0) {
			vm_free_memseg(sc->vmm_vm, mseg->segid);
		}
	}
	return (error);
}

/*
 * Resource Locking and Exclusion
 *
 * Much of bhyve depends on key portions of VM state, such as the guest memory
 * map, to remain unchanged while the guest is running. As ported from
 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
 * access to the instance vCPUs. Threads acting on a single vCPU, like those
 * performing the work of actually running the guest in VMX/SVM, would lock
 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
 * state, all of the vCPUs would be first locked, ensuring that the
 * operation(s) could complete without any other threads stumbling into
 * intermediate states.
 *
 * This approach is largely effective for bhyve. Common operations, such as
 * running the vCPUs, steer clear of lock contention. The model begins to
 * break down for operations which do not occur in the context of a specific
 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
 * thread in the bhyve process. In order to properly protect those vCPU-less
 * operations from encountering invalid states, additional locking is required.
 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
 * It does mean that class of operations will be serialized on locking the
 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
 * undue contention on the VM_MAXCPU-1 vCPU.
 *
 * In order to address the shortcomings of this model, the concept of a
 * read/write lock has been added to bhyve. Operations which change
 * fundamental aspects of a VM (such as the memory map) must acquire the write
 * lock, which also implies locking all of the vCPUs and waiting for all read
 * lock holders to release. While it increases the cost and waiting time for
 * those few operations, it allows most hot-path operations on the VM (which
 * depend on its configuration remaining stable) to occur with minimal locking.
 *
 * Consumers of the Driver API (see below) are a special case when it comes to
 * this locking, since they may hold a read lock via the drv_lease mechanism
 * for an extended period of time. Rather than forcing those consumers to
 * continuously poll for a write lock attempt, the lease system forces them to
 * provide a release callback to trigger their clean-up (and potential later
 * reacquisition) of the read lock.
 */
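/*
 * Illustrative sketch (editor's addition): the pattern used by
 * vmmdev_do_ioctl() below. A VM-wide configuration change takes the write
 * lock (which implies freezing all vCPUs), while hot-path queries take only
 * the read lock:
 *
 *	vmm_write_lock(sc);
 *	error = vm_mmap_memseg(sc->vmm_vm, ...);
 *	vmm_write_unlock(sc);
 *
 *	vmm_read_lock(sc);
 *	error = vm_mmap_getnext(sc->vmm_vm, ...);
 *	vmm_read_unlock(sc);
 */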

static void
vcpu_lock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	/*
	 * Since this state transition is utilizing from_idle=true, it should
	 * not fail, but rather block until it can be successful.
	 */
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
}

static void
vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
{
	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);

	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
}

static void
vmm_read_lock(vmm_softc_t *sc)
{
	rw_enter(&sc->vmm_rwlock, RW_READER);
}

static void
vmm_read_unlock(vmm_softc_t *sc)
{
	rw_exit(&sc->vmm_rwlock);
}

static void
vmm_write_lock(vmm_softc_t *sc)
{
	int maxcpus;

	/* First lock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
	}

	/*
	 * Block vmm_drv leases from being acquired or held while the VM write
	 * lock is held.
	 */
	vmm_lease_block(sc);

	rw_enter(&sc->vmm_rwlock, RW_WRITER);
	/*
	 * For now, the 'maxcpus' value for an instance is fixed at the
	 * compile-time constant of VM_MAXCPU at creation. If this changes in
	 * the future, allowing for dynamic vCPU resource sizing, acquisition
	 * of the write lock will need to be wary of such changes.
	 */
	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
}

static void
vmm_write_unlock(vmm_softc_t *sc)
{
	int maxcpus;

	/* Allow vmm_drv leases to be acquired once write lock is dropped */
	vmm_lease_unblock(sc);

	/*
	 * The VM write lock _must_ be released from the same thread it was
	 * acquired in, unlike the read lock.
	 */
	VERIFY(rw_write_held(&sc->vmm_rwlock));
	rw_exit(&sc->vmm_rwlock);

	/* Unlock all the vCPUs */
	maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_unlock_one(sc, vcpu);
	}
}

static int
vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
    cred_t *credp, int *rvalp)
{
	int error = 0, vcpu = -1;
	void *datap = (void *)arg;
	enum vm_lock_type {
		LOCK_NONE = 0,
		LOCK_VCPU,
		LOCK_READ_HOLD,
		LOCK_WRITE_HOLD
	} lock_type = LOCK_NONE;

	/* Acquire any exclusion resources needed for the operation. */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV:
	case VM_RESET_CPU:
	case VM_GET_RUN_STATE:
	case VM_SET_RUN_STATE:
	case VM_GET_FPU:
	case VM_SET_FPU:
	case VM_GET_CPUID:
	case VM_SET_CPUID:
	case VM_LEGACY_CPUID:
		/*
		 * Copy in the ID of the vCPU chosen for this operation.
		 * Since a nefarious caller could update their struct between
		 * this locking and when the rest of the ioctl data is copied
		 * in, it is _critical_ that this local 'vcpu' variable be used
		 * rather than the in-struct one when performing the ioctl.
		 */
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
		vcpu_lock_one(sc, vcpu);
		lock_type = LOCK_VCPU;
		break;

	case VM_REINIT:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_WRLOCK_CYCLE:
	case VM_PMTMR_LOCATE:
	case VM_PAUSE:
	case VM_RESUME:
		vmm_write_lock(sc);
		lock_type = LOCK_WRITE_HOLD;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
	case VM_LAPIC_IRQ:
	case VM_INJECT_NMI:
	case VM_IOAPIC_ASSERT_IRQ:
	case VM_IOAPIC_DEASSERT_IRQ:
	case VM_IOAPIC_PULSE_IRQ:
	case VM_LAPIC_MSI:
	case VM_LAPIC_LOCAL_IRQ:
	case VM_GET_X2APIC_STATE:
	case VM_RTC_READ:
	case VM_RTC_WRITE:
	case VM_RTC_SETTIME:
	case VM_RTC_GETTIME:
	case VM_PPTDEV_DISABLE_MSIX:
	case VM_DEVMEM_GETOFFSET:
	case VM_TRACK_DIRTY_PAGES:
		vmm_read_lock(sc);
		lock_type = LOCK_READ_HOLD;
		break;

	case VM_DATA_READ:
	case VM_DATA_WRITE:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu == -1) {
			/* Access data for VM-wide devices */
			vmm_write_lock(sc);
			lock_type = LOCK_WRITE_HOLD;
		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
			/* Access data associated with a specific vCPU */
			vcpu_lock_one(sc, vcpu);
			lock_type = LOCK_VCPU;
		} else {
			return (EINVAL);
		}
		break;

	case VM_GET_GPA_PMAP:
	case VM_IOAPIC_PINCOUNT:
	case VM_SUSPEND:
	case VM_DESC_FPU_AREA:
	case VM_SET_AUTODESTRUCT:
	case VM_DESTROY_SELF:
	case VM_DESTROY_PENDING:
	default:
		break;
	}

	/* Execute the primary logic for the ioctl. */
	switch (cmd) {
	case VM_RUN: {
		struct vm_entry entry;

		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
			error = EFAULT;
			break;
		}

		if (!(curthread->t_schedflag & TS_VCPU))
			smt_mark_as_vcpu();

		error = vm_run(sc->vmm_vm, vcpu, &entry);

		/*
		 * Unexpected states in vm_run() are expressed through positive
		 * errno-oriented return values. VM states which expect further
		 * processing in userspace (necessary context via exitinfo) are
		 * expressed through negative return values. For the time being
		 * a return value of 0 is not expected from vm_run().
		 */
		ASSERT(error != 0);
		if (error < 0) {
			const struct vm_exit *vme;
			void *outp = entry.exit_data;

			error = 0;
			vme = vm_exitinfo(sc->vmm_vm, vcpu);
			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
				error = EFAULT;
			}
		}
		break;
	}
	case VM_SUSPEND: {
		struct vm_suspend vmsuspend;

		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
			error = EFAULT;
			break;
		}
		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
		break;
	}
	case VM_REINIT: {
		struct vm_reinit reinit;

		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
			error = EFAULT;
			break;
		}
		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
			/*
			 * The VM instance should be free of driver-attached
			 * hooks during the reinitialization process.
			 */
			break;
		}
		error = vm_reinit(sc->vmm_vm, reinit.flags);
		(void) vmm_drv_block_hook(sc, B_FALSE);
		break;
	}
	case VM_STAT_DESC: {
		struct vm_stat_desc statdesc;

		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
		    sizeof (statdesc.desc));
		if (error == 0 &&
		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_STATS_IOC: {
		struct vm_stats vmstats;

		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		hrt2tv(gethrtime(), &vmstats.tv);
		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
		    nitems(vmstats.statbuf),
		    &vmstats.num_entries, vmstats.statbuf);
		if (error == 0 &&
		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PPTDEV_MSI: {
		struct vm_pptdev_msi pptmsi;

		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
		break;
	}
	case VM_PPTDEV_MSIX: {
		struct vm_pptdev_msix pptmsix;

		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
		    pptmsix.vector_control);
		break;
	}
	case VM_PPTDEV_DISABLE_MSIX: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_MAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len, pptmmio.hpa);
		break;
	}
	case VM_UNMAP_PPTDEV_MMIO: {
		struct vm_pptdev_mmio pptmmio;

		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
		    pptmmio.len);
		break;
	}
	case VM_BIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_UNBIND_PPTDEV: {
		struct vm_pptdev pptdev;

		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
			error = EFAULT;
			break;
		}
		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
		break;
	}
	case VM_GET_PPTDEV_LIMITS: {
		struct vm_pptdev_limits pptlimits;

		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
		    &pptlimits.msi_limit, &pptlimits.msix_limit);
		if (error == 0 &&
		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_INJECT_EXCEPTION: {
		struct vm_exception vmexc;
		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
		    vmexc.error_code_valid != 0, vmexc.error_code,
		    vmexc.restart_instruction != 0);
		break;
	}
	case VM_INJECT_NMI: {
		struct vm_nmi vmnmi;

		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
			error = EFAULT;
			break;
		}
		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
		break;
	}
	case VM_LAPIC_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
		break;
	}
	case VM_LAPIC_LOCAL_IRQ: {
		struct vm_lapic_irq vmirq;

		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
		    vmirq.vector);
		break;
	}
	case VM_LAPIC_MSI: {
		struct vm_lapic_msi vmmsi;

		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
			error = EFAULT;
			break;
		}
		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
		break;
	}

	case VM_IOAPIC_ASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_DEASSERT_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PULSE_IRQ: {
		struct vm_ioapic_irq ioapic_irq;

		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
		break;
	}
	case VM_IOAPIC_PINCOUNT: {
		int pincount;

		pincount = vioapic_pincount(sc->vmm_vm);
		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_DESC_FPU_AREA: {
		struct vm_fpu_desc desc;
		void *buf = NULL;

		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
			error = EFAULT;
			break;
		}
		if (desc.vfd_num_entries > 64) {
			error = EINVAL;
			break;
		}
		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
		    desc.vfd_num_entries;
		if (buf_sz != 0) {
			buf = kmem_zalloc(buf_sz, KM_SLEEP);
		}

		/*
		 * For now, we are depending on vm_fpu_desc_entry and
		 * hma_xsave_state_desc_t having the same format.
		 */
		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
		    sizeof (hma_xsave_state_desc_t));

		size_t req_size;
		const uint_t max_entries = hma_fpu_describe_xsave_state(
		    (hma_xsave_state_desc_t *)buf,
		    desc.vfd_num_entries,
		    &req_size);

		desc.vfd_req_size = req_size;
		desc.vfd_num_entries = max_entries;
		if (buf_sz != 0) {
			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
				error = EFAULT;
			}
			kmem_free(buf, buf_sz);
		}

		if (error == 0) {
			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
				error = EFAULT;
			}
		}
		break;
	}
	case VM_SET_AUTODESTRUCT: {
		/*
		 * Since this has to do with controlling the lifetime of the
		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
		 * than the vcpu-centric or rwlock exclusion mechanisms.
		 */
		mutex_enter(&vmm_mtx);
		if (arg != 0) {
			sc->vmm_flags |= VMM_AUTODESTROY;
		} else {
			sc->vmm_flags &= ~VMM_AUTODESTROY;
		}
		mutex_exit(&vmm_mtx);
		break;
	}
	case VM_DESTROY_SELF: {
		bool hma_release = false;

		/*
		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
		 * itself, rather than having to perform a racy name lookup as
		 * part of the destroy process.
		 *
		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
		 * order to kick the vCPUs out of guest context as part of any
		 * destruction, we do not need to worry about it ourselves using
		 * the `lock_type` logic here.
		 */
		mutex_enter(&vmm_mtx);
		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
		mutex_exit(&vmm_mtx);
		if (hma_release) {
			vmm_hma_release();
		}
		break;
	}
	case VM_DESTROY_PENDING: {
		/*
		 * If we have made it this far, then destruction of the instance
		 * has not been initiated.
		 */
		*rvalp = 0;
		break;
	}

	case VM_ISA_ASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_assert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_DEASSERT_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_deassert_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_PULSE_IRQ: {
		struct vm_isa_irq isa_irq;

		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
		if (error == 0 && isa_irq.ioapic_irq != -1) {
			error = vioapic_pulse_irq(sc->vmm_vm,
			    isa_irq.ioapic_irq);
		}
		break;
	}
	case VM_ISA_SET_IRQ_TRIGGER: {
		struct vm_isa_irq_trigger isa_irq_trigger;

		if (ddi_copyin(datap, &isa_irq_trigger,
		    sizeof (isa_irq_trigger), md)) {
			error = EFAULT;
			break;
		}
		error = vatpic_set_irq_trigger(sc->vmm_vm,
		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
		break;
	}

	case VM_MMAP_GETNEXT: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_MMAP_MEMSEG: {
		struct vm_memmap mm;

		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
			error = EFAULT;
			break;
		}
		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
		    mm.len, mm.prot, mm.flags);
		break;
	}
	case VM_MUNMAP_MEMSEG: {
		struct vm_munmap mu;

		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
			error = EFAULT;
			break;
		}
		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
		break;
	}
	case VM_ALLOC_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_alloc_memseg(sc, &vmseg);
		break;
	}
	case VM_GET_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state. Callers
			 * should be wary of this.
			 */
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
			    regvals[i]);
		}
		break;
	}
	case VM_RESET_CPU: {
		struct vm_vcpu_reset vvr;

		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
			error = EFAULT;
			break;
		}
		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
		}

		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
		break;
	}
	case VM_GET_RUN_STATE: {
		struct vm_run_state vrs;

		bzero(&vrs, sizeof (vrs));
		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
		    &vrs.sipi_vector);
		if (error == 0) {
			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}
	case VM_SET_RUN_STATE: {
		struct vm_run_state vrs;

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
		    vrs.sipi_vector);
		break;
	}
	case VM_GET_FPU: {
		struct vm_fpu_state req;
		const size_t max_len = (PAGESIZE * 2);
		void *kbuf;

		if (ddi_copyin(datap, &req, sizeof (req), md)) {
			error = EFAULT;
			break;
		}
		if (req.len > max_len || req.len == 0) {
			error = EINVAL;
			break;
		}
		kbuf = kmem_zalloc(req.len, KM_SLEEP);
		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
		if (error == 0) {
			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
				error = EFAULT;
			}
		}
		kmem_free(kbuf, req.len);
		break;
	}
	case VM_SET_FPU: {
		struct vm_fpu_state req;
		const size_t max_len = (PAGESIZE * 2);
		void *kbuf;

		if (ddi_copyin(datap, &req, sizeof (req), md)) {
			error = EFAULT;
			break;
		}
		if (req.len > max_len || req.len == 0) {
			error = EINVAL;
			break;
		}
		kbuf = kmem_alloc(req.len, KM_SLEEP);
		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
			error = EFAULT;
		} else {
			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
		}
		kmem_free(kbuf, req.len);
		break;
	}
	case VM_GET_CPUID: {
		struct vm_vcpu_cpuid_config cfg;
		struct vcpu_cpuid_entry *entries = NULL;

		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
			error = EFAULT;
			break;
		}
		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
			error = EINVAL;
			break;
		}

		const size_t entries_size =
		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
		if (entries_size != 0) {
			entries = kmem_zalloc(entries_size, KM_SLEEP);
		}

		vcpu_cpuid_config_t vm_cfg = {
			.vcc_nent = cfg.vvcc_nent,
			.vcc_entries = entries,
		};
		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);

		/*
		 * Only attempt to copy out the resultant entries if we were
		 * able to query them from the instance. The flags and number
		 * of entries are emitted regardless.
		 */
		cfg.vvcc_flags = vm_cfg.vcc_flags;
		cfg.vvcc_nent = vm_cfg.vcc_nent;
		if (entries != NULL) {
			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
			    entries_size, md) != 0) {
				error = EFAULT;
			}

			kmem_free(entries, entries_size);
		}

		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_CPUID: {
		struct vm_vcpu_cpuid_config cfg;
		struct vcpu_cpuid_entry *entries = NULL;
		size_t entries_size = 0;

		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
			error = EFAULT;
			break;
		}
		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
			error = EFBIG;
			break;
		}
		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
			/*
			 * If we are being instructed to use "legacy" handling,
			 * then no entries should be provided, since the static
			 * in-kernel masking will be used.
			 */
			if (cfg.vvcc_nent != 0) {
				error = EINVAL;
				break;
			}
		} else if (cfg.vvcc_nent != 0) {
			entries_size =
			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
			entries = kmem_alloc(entries_size, KM_SLEEP);

			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
			    md) != 0) {
				error = EFAULT;
				kmem_free(entries, entries_size);
				break;
			}
		}

		vcpu_cpuid_config_t vm_cfg = {
			.vcc_flags = cfg.vvcc_flags,
			.vcc_nent = cfg.vvcc_nent,
			.vcc_entries = entries,
		};
		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);

		if (entries != NULL) {
			kmem_free(entries, entries_size);
		}
		break;
	}
	case VM_LEGACY_CPUID: {
		struct vm_legacy_cpuid vlc;
		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
			error = EFAULT;
			break;
		}
		vlc.vlc_vcpuid = vcpu;

		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);

		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV: {
		struct vm_readwrite_kernemu_device kemu;
		size_t size = 0;

		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
			error = EFAULT;
			break;
		}

		if (kemu.access_width > 3) {
			error = EINVAL;
			break;
		}
		size = (1 << kemu.access_width);
		ASSERT(size >= 1 && size <= 8);

		if (cmd == VM_SET_KERNEMU_DEV) {
			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
			    kemu.gpa, kemu.value, size);
		} else {
			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
			    kemu.gpa, &kemu.value, size);
		}

		if (error == 0) {
			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}

	case VM_GET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    &vmcap.capval);
		if (error == 0 &&
		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_CAPABILITY: {
		struct vm_capability vmcap;

		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    vmcap.capval);
		break;
	}
	case VM_SET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
		break;
	}
	case VM_GET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
		    &x2apic.state);
		if (error == 0 &&
		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_GPA_PMAP: {
		/*
		 * Until there is a necessity to leak EPT/RVI PTE values to
		 * userspace, this will remain unimplemented
		 */
		error = EINVAL;
		break;
	}
	case VM_GET_HPET_CAPABILITIES: {
		struct vm_hpet_cap hpetcap;

		error = vhpet_getcap(&hpetcap);
		if (error == 0 &&
		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
		    gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA_NOFAULT: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_ACTIVATE_CPU:
		error = vm_activate_cpu(sc->vmm_vm, vcpu);
		break;

	case VM_SUSPEND_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_RESUME_CPU:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			error = EFAULT;
		} else {
			error = vm_resume_cpu(sc->vmm_vm, vcpu);
		}
		break;

	case VM_GET_CPUS: {
		struct vm_cpuset vm_cpuset;
		cpuset_t tempset;
		void *srcp = &tempset;
		int size;

		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
			error = EFAULT;
			break;
		}

		/* Be more generous about sizing since our cpuset_t is large. */
		size = vm_cpuset.cpusetsize;
		if (size <= 0 || size > sizeof (cpuset_t)) {
			error = ERANGE;
		}
		/*
		 * If they want a ulong_t or less, make sure they receive the
		 * low bits with all the useful information.
		 */
		if (size <= sizeof (tempset.cpub[0])) {
			srcp = &tempset.cpub[0];
		}

		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
			tempset = vm_active_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
			tempset = vm_suspended_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
			tempset = vm_debug_cpus(sc->vmm_vm);
		} else {
			error = EINVAL;
		}

		ASSERT(size > 0 && size <= sizeof (tempset));
		if (error == 0 &&
		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_INTINFO: {
		struct vm_intinfo vmii;

		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
		break;
	}
	case VM_GET_INTINFO: {
		struct vm_intinfo vmii;

		vmii.vcpuid = vcpu;
		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
		    &vmii.info2);
		if (error == 0 &&
		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_WRITE: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
		    rtcdata.value);
		break;
	}
	case VM_RTC_READ: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
		    &rtcdata.value);
		if (error == 0 &&
		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_SETTIME: {
		struct vm_rtc_time rtctime;

		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
		break;
	}
	case VM_RTC_GETTIME: {
		struct vm_rtc_time rtctime;

		rtctime.secs = vrtc_get_time(sc->vmm_vm);
		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PMTMR_LOCATE: {
		uint16_t port = arg;
		error = vpmtmr_set_location(sc->vmm_vm, port);
		break;
	}

	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vmm_vm, vcpu);
		break;

	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
		    topo.threads, topo.maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
		    &topo.threads, &topo.maxcpus);
		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_DEVMEM_GETOFFSET: {
		struct vm_devmem_offset vdo;
		vmm_devmem_entry_t *de;

		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
			error = EFAULT;
			break;
		}

		de = vmmdev_devmem_find(sc, vdo.segid);
		if (de != NULL) {
			vdo.offset = de->vde_off;
			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
				error = EFAULT;
			}
		} else {
			error = ENOENT;
		}
		break;
	}
	case VM_TRACK_DIRTY_PAGES: {
		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
		struct vmm_dirty_tracker tracker;
		uint8_t *bitmap;
		size_t len;

		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
			error = EFAULT;
			break;
		}
		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
			error = EINVAL;
			break;
		}
		if (tracker.vdt_len == 0) {
			break;
		}
		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
			error = EINVAL;
			break;
		}
		if (tracker.vdt_len > max_track_region_len) {
			error = EINVAL;
			break;
		}
		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
		bitmap = kmem_zalloc(len, KM_SLEEP);
		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
		    tracker.vdt_len, bitmap);
		if (error == 0 &&
		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
			error = EFAULT;
		}
		kmem_free(bitmap, len);

		break;
	}
	case VM_WRLOCK_CYCLE: {
		/*
		 * Present a test mechanism to acquire/release the write lock
		 * on the VM without any other effects.
		 */
		break;
	}
	case VM_DATA_READ: {
		struct vm_data_xfer vdx;

		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
			error = EFAULT;
			break;
		}
		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
			error = EINVAL;
			break;
		}
		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
			error = EFBIG;
			break;
		}

		const size_t len = vdx.vdx_len;
		void *buf = NULL;
		if (len != 0) {
			const void *udata = vdx.vdx_data;

			buf = kmem_alloc(len, KM_SLEEP);
			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
				bzero(buf, len);
			} else if (ddi_copyin(udata, buf, len, md) != 0) {
				kmem_free(buf, len);
				error = EFAULT;
				break;
			}
		}

		vdx.vdx_result_len = 0;
		vmm_data_req_t req = {
			.vdr_class = vdx.vdx_class,
			.vdr_version = vdx.vdx_version,
			.vdr_flags = vdx.vdx_flags,
			.vdr_len = len,
			.vdr_data = buf,
			.vdr_result_len = &vdx.vdx_result_len,
		};
		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);

		if (error == 0 && buf != NULL) {
			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
				error = EFAULT;
			}
		}

		/*
		 * Copy out the transfer request so that the value of
		 * vdx_result_len can be made available, regardless of any
		 * error(s) which may have occurred.
		 */
		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
			error = (error != 0) ? error : EFAULT;
		}

		if (buf != NULL) {
			kmem_free(buf, len);
		}
		break;
	}
	case VM_DATA_WRITE: {
		struct vm_data_xfer vdx;

		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
			error = EFAULT;
			break;
		}
		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
			error = EINVAL;
			break;
		}
		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
			error = EFBIG;
			break;
		}

		const size_t len = vdx.vdx_len;
		void *buf = NULL;
		if (len != 0) {
			buf = kmem_alloc(len, KM_SLEEP);
			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
				kmem_free(buf, len);
				error = EFAULT;
				break;
			}
		}

		vdx.vdx_result_len = 0;
		vmm_data_req_t req = {
			.vdr_class = vdx.vdx_class,
			.vdr_version = vdx.vdx_version,
			.vdr_flags = vdx.vdx_flags,
			.vdr_len = len,
			.vdr_data = buf,
			.vdr_result_len = &vdx.vdx_result_len,
		};
		if (vmm_allow_state_writes == 0) {
			/* XXX: Play it safe for now */
			error = EPERM;
		} else {
			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
			    &req);
		}

		if (error == 0 && buf != NULL &&
		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
				error = EFAULT;
			}
		}

		/*
		 * Copy out the transfer request so that the value of
		 * vdx_result_len can be made available, regardless of any
		 * error(s) which may have occurred.
		 */
		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
			error = (error != 0) ? error : EFAULT;
		}

		if (buf != NULL) {
			kmem_free(buf, len);
		}
		break;
	}

	case VM_PAUSE: {
		error = vm_pause_instance(sc->vmm_vm);
		break;
	}
	case VM_RESUME: {
		error = vm_resume_instance(sc->vmm_vm);
		break;
	}

	default:
		error = ENOTTY;
		break;
	}

	/* Release exclusion resources */
	switch (lock_type) {
	case LOCK_NONE:
		break;
	case LOCK_VCPU:
		vcpu_unlock_one(sc, vcpu);
		break;
	case LOCK_READ_HOLD:
		vmm_read_unlock(sc);
		break;
	case LOCK_WRITE_HOLD:
		vmm_write_unlock(sc);
		break;
	default:
		panic("unexpected lock type");
		break;
	}

	return (error);
}
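/*
 * Illustrative sketch (editor's addition): from a userspace control process,
 * the ioctls handled above are issued against the open instance fd; field
 * values shown are purely illustrative:
 *
 *	struct vm_cpu_topology topo = {
 *		.sockets = 1, .cores = 4, .threads = 1,
 *	};
 *	if (ioctl(vmfd, VM_SET_TOPOLOGY, &topo) != 0)
 *		err(EXIT_FAILURE, "VM_SET_TOPOLOGY");
 */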

static vmm_softc_t *
vmm_lookup(const char *name)
{
	list_t *vml = &vmm_list;
	vmm_softc_t *sc;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
		if (strcmp(sc->vmm_name, name) == 0) {
			break;
		}
	}

	return (sc);
}

/*
 * Acquire an HMA registration if not already held.
 */
static boolean_t
vmm_hma_acquire(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	if (vmmdev_hma_reg == NULL) {
		VERIFY3U(vmmdev_hma_ref, ==, 0);
		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
		if (vmmdev_hma_reg == NULL) {
			cmn_err(CE_WARN, "%s HMA registration failed.",
			    vmmdev_hvm_name);
			mutex_exit(&vmmdev_mtx);
			return (B_FALSE);
		}
	}

	vmmdev_hma_ref++;

	mutex_exit(&vmmdev_mtx);

	return (B_TRUE);
}

/*
 * Release the HMA registration if held and there are no remaining VMs.
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
{
	vmm_softc_t *sc = NULL;
	minor_t minor;
	int error = ENOMEM;
	size_t len;
	const char *name = req->name;

	len = strnlen(name, VM_MAX_NAMELEN);
	if (len == 0) {
		return (EINVAL);
	}
	if (len >= VM_MAX_NAMELEN) {
		return (ENAMETOOLONG);
	}
	if (strchr(name, '/') != NULL) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone. */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
		goto fail;
	}

	error = vm_create(req->flags, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);
		vmm_kstat_init(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	vmm_kstat_fini(sc);
	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel. For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
 *
 * This includes:
 * - Translating guest-physical addresses to host-virtual pointers
 * - Injecting MSIs
 * - Hooking IO port addresses
 *
 * The vmm_drv interface exists to provide that functionality to its consumers.
 * (At this time, 'viona' is the only user)
 */
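/*
 * Illustrative sketch (editor's addition): the rough shape of a consumer such
 * as viona using the interfaces defined below. Error handling is elided; if
 * the expire callback does not break the lease synchronously (returns
 * B_FALSE), the consumer must later break it via vmm_drv_lease_break().
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) != 0)
 *		return;
 *	lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *	if (lease != NULL) {
 *		vmm_page_t *pg = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *		... consume vmm_drv_page_readable(pg) ...
 *		vmm_drv_page_release(pg);
 *		vmm_drv_lease_break(hold, lease);
 *	}
 *	vmm_drv_rele(hold);
 */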
int
vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
	vnode_t *vp = fp->f_vnode;
	const dev_t dev = vp->v_rdev;
	vmm_softc_t *sc;
	vmm_hold_t *hold;
	int err = 0;

	if (vp->v_type != VCHR) {
		return (ENXIO);
	}
	const major_t major = getmajor(dev);
	const minor_t minor = getminor(dev);

	mutex_enter(&vmmdev_mtx);
	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
		mutex_exit(&vmmdev_mtx);
		return (ENOENT);
	}
	mutex_enter(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);

	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		err = ENOENT;
		goto out;
	}
	/* XXXJOY: check cred permissions against instance */

	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
		err = EBUSY;
		goto out;
	}

	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
	hold->vmh_sc = sc;
	hold->vmh_release_req = B_FALSE;

	list_insert_tail(&sc->vmm_holds, hold);
	sc->vmm_flags |= VMM_HELD;
	*holdp = hold;

out:
	mutex_exit(&vmm_mtx);
	return (err);
}

void
vmm_drv_rele(vmm_hold_t *hold)
{
	vmm_softc_t *sc;
	bool hma_release = false;

	ASSERT(hold != NULL);
	ASSERT(hold->vmh_sc != NULL);
	VERIFY(hold->vmh_ioport_hook_cnt == 0);

	mutex_enter(&vmm_mtx);
	sc = hold->vmh_sc;
	list_remove(&sc->vmm_holds, hold);
	kmem_free(hold, sizeof (*hold));

	if (list_is_empty(&sc->vmm_holds)) {
		sc->vmm_flags &= ~VMM_HELD;

		/*
		 * Since outstanding holds would prevent instance destruction
		 * from completing, attempt to finish it now if it was already
		 * set in motion.
		 */
		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
			    &hma_release));
		}
	}
	mutex_exit(&vmm_mtx);

	if (hma_release) {
		vmm_hma_release();
	}
}

boolean_t
vmm_drv_release_reqd(vmm_hold_t *hold)
{
	ASSERT(hold != NULL);

	return (hold->vmh_release_req);
}

vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
{
	vmm_softc_t *sc = hold->vmh_sc;
	vmm_lease_t *lease;

	ASSERT3P(expiref, !=, NULL);

	if (hold->vmh_release_req) {
		return (NULL);
	}

	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
	list_link_init(&lease->vml_node);
	lease->vml_expire_func = expiref;
	lease->vml_expire_arg = arg;
	lease->vml_expired = B_FALSE;
	lease->vml_break_deferred = B_FALSE;
	lease->vml_hold = hold;
	/* cache the VM pointer for one less pointer chase */
	lease->vml_vm = sc->vmm_vm;
	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));

	mutex_enter(&sc->vmm_lease_lock);
	while (sc->vmm_lease_blocker != 0) {
		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
	}
	list_insert_tail(&sc->vmm_lease_list, lease);
	vmm_read_lock(sc);
	mutex_exit(&sc->vmm_lease_lock);

	return (lease);
}

static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	vmc_destroy(lease->vml_vmclient);
	kmem_free(lease, sizeof (*lease));
}

static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock(). This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior. This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
2261 */ 2262 lease = list_head(list); 2263 while (lease != NULL) { 2264 vmm_lease_t *next = list_next(list, lease); 2265 if (lease->vml_break_deferred) { 2266 vmm_lease_break_locked(sc, lease); 2267 } 2268 lease = next; 2269 } 2270 if (list_is_empty(list)) { 2271 break; 2272 } 2273 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2274 } 2275 /* Wake anyone else waiting for the lease list to be empty */ 2276 cv_broadcast(&sc->vmm_lease_cv); 2277 } else { 2278 list_t *list = &sc->vmm_lease_list; 2279 2280 /* 2281 * Some other thread beat us to the duty of lease cleanup. 2282 * Wait until that is complete. 2283 */ 2284 while (!list_is_empty(list)) { 2285 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2286 } 2287 } 2288 mutex_exit(&sc->vmm_lease_lock); 2289 } 2290 2291 static void 2292 vmm_lease_unblock(vmm_softc_t *sc) 2293 { 2294 mutex_enter(&sc->vmm_lease_lock); 2295 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2296 sc->vmm_lease_blocker--; 2297 if (sc->vmm_lease_blocker == 0) { 2298 cv_broadcast(&sc->vmm_lease_cv); 2299 } 2300 mutex_exit(&sc->vmm_lease_lock); 2301 } 2302 2303 void 2304 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2305 { 2306 vmm_softc_t *sc = hold->vmh_sc; 2307 2308 VERIFY3P(hold, ==, lease->vml_hold); 2309 VERIFY(!lease->vml_break_deferred); 2310 2311 mutex_enter(&sc->vmm_lease_lock); 2312 if (sc->vmm_lease_blocker == 0) { 2313 vmm_lease_break_locked(sc, lease); 2314 } else { 2315 /* 2316 * Defer the lease-breaking to whichever thread is currently 2317 * cleaning up all leases as part of a vmm_lease_block() call. 2318 */ 2319 lease->vml_break_deferred = B_TRUE; 2320 cv_broadcast(&sc->vmm_lease_cv); 2321 } 2322 mutex_exit(&sc->vmm_lease_lock); 2323 } 2324 2325 boolean_t 2326 vmm_drv_lease_expired(vmm_lease_t *lease) 2327 { 2328 return (lease->vml_expired); 2329 } 2330 2331 vmm_page_t * 2332 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2333 { 2334 ASSERT(lease != NULL); 2335 ASSERT0(gpa & PAGEOFFSET); 2336 2337 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2338 } 2339 2340 2341 /* Ensure that flags mirrored by vmm_drv interface properly match up */ 2342 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY); 2343 2344 vmm_page_t * 2345 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags) 2346 { 2347 ASSERT(lease != NULL); 2348 ASSERT0(gpa & PAGEOFFSET); 2349 2350 vmm_page_t *page = 2351 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags); 2352 return (page); 2353 } 2354 2355 void 2356 vmm_drv_page_release(vmm_page_t *vmmp) 2357 { 2358 (void) vmp_release((vm_page_t *)vmmp); 2359 } 2360 2361 void 2362 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2363 { 2364 (void) vmp_release_chain((vm_page_t *)vmmp); 2365 } 2366 2367 const void * 2368 vmm_drv_page_readable(const vmm_page_t *vmmp) 2369 { 2370 return (vmp_get_readable((const vm_page_t *)vmmp)); 2371 } 2372 2373 void * 2374 vmm_drv_page_writable(const vmm_page_t *vmmp) 2375 { 2376 return (vmp_get_writable((const vm_page_t *)vmmp)); 2377 } 2378 2379 void 2380 vmm_drv_page_mark_dirty(vmm_page_t *vmmp) 2381 { 2382 return (vmp_mark_dirty((vm_page_t *)vmmp)); 2383 } 2384 2385 void 2386 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2387 { 2388 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2389 } 2390 2391 vmm_page_t * 2392 vmm_drv_page_next(const vmm_page_t *vmmp) 2393 { 2394 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2395 } 2396 2397 int 2398 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2399 { 2400 ASSERT(lease 
!= NULL); 2401 2402 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2403 } 2404 2405 int 2406 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2407 void *arg, void **cookie) 2408 { 2409 vmm_softc_t *sc; 2410 int err; 2411 2412 ASSERT(hold != NULL); 2413 ASSERT(cookie != NULL); 2414 2415 sc = hold->vmh_sc; 2416 mutex_enter(&vmm_mtx); 2417 /* Confirm that hook installation is not blocked */ 2418 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2419 mutex_exit(&vmm_mtx); 2420 return (EBUSY); 2421 } 2422 /* 2423 * Optimistically record an installed hook which will prevent a block 2424 * from being asserted while the mutex is dropped. 2425 */ 2426 hold->vmh_ioport_hook_cnt++; 2427 mutex_exit(&vmm_mtx); 2428 2429 vmm_write_lock(sc); 2430 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2431 arg, cookie); 2432 vmm_write_unlock(sc); 2433 2434 if (err != 0) { 2435 mutex_enter(&vmm_mtx); 2436 /* Walk back optimism about the hook installation */ 2437 hold->vmh_ioport_hook_cnt--; 2438 mutex_exit(&vmm_mtx); 2439 } 2440 return (err); 2441 } 2442 2443 void 2444 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2445 { 2446 vmm_softc_t *sc; 2447 2448 ASSERT(hold != NULL); 2449 ASSERT(cookie != NULL); 2450 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2451 2452 sc = hold->vmh_sc; 2453 vmm_write_lock(sc); 2454 vm_ioport_unhook(sc->vmm_vm, cookie); 2455 vmm_write_unlock(sc); 2456 2457 mutex_enter(&vmm_mtx); 2458 hold->vmh_ioport_hook_cnt--; 2459 mutex_exit(&vmm_mtx); 2460 } 2461 2462 static void 2463 vmm_drv_purge(vmm_softc_t *sc) 2464 { 2465 ASSERT(MUTEX_HELD(&vmm_mtx)); 2466 2467 if ((sc->vmm_flags & VMM_HELD) != 0) { 2468 vmm_hold_t *hold; 2469 2470 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2471 hold = list_next(&sc->vmm_holds, hold)) { 2472 hold->vmh_release_req = B_TRUE; 2473 } 2474 2475 /* 2476 * Require that all leases on the instance be broken, now that 2477 * all associated holds have been marked as needing release. 2478 * 2479 * Dropping vmm_mtx is not strictly necessary, but if any of the 2480 * lessees are slow to respond, it would be nice to leave it 2481 * available for other parties. 2482 */ 2483 mutex_exit(&vmm_mtx); 2484 vmm_lease_block(sc); 2485 vmm_lease_unblock(sc); 2486 mutex_enter(&vmm_mtx); 2487 } 2488 } 2489 2490 static int 2491 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2492 { 2493 int err = 0; 2494 2495 mutex_enter(&vmm_mtx); 2496 if (!enable_block) { 2497 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2498 2499 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2500 goto done; 2501 } 2502 2503 /* If any holds have hooks installed, the block is a failure */ 2504 if (!list_is_empty(&sc->vmm_holds)) { 2505 vmm_hold_t *hold; 2506 2507 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2508 hold = list_next(&sc->vmm_holds, hold)) { 2509 if (hold->vmh_ioport_hook_cnt != 0) { 2510 err = EBUSY; 2511 goto done; 2512 } 2513 } 2514 } 2515 sc->vmm_flags |= VMM_BLOCK_HOOK; 2516 2517 done: 2518 mutex_exit(&vmm_mtx); 2519 return (err); 2520 } 2521 2522 2523 static void 2524 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts) 2525 { 2526 ASSERT(MUTEX_HELD(&vmm_mtx)); 2527 ASSERT0(sc->vmm_flags & VMM_DESTROY); 2528 2529 sc->vmm_flags |= VMM_DESTROY; 2530 2531 /* 2532 * Lock and unlock all of the vCPUs to ensure that they are kicked out 2533 * of guest context, being unable to return now that the instance is 2534 * marked for destruction. 
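	 * Any later attempt to re-enter (e.g. a VM_RUN ioctl) will be turned
	 * away by vmm_ioctl(), which rejects ioctls against an instance once
	 * VMM_DESTROY is set.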
2535 	 */
2536 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2537 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2538 		vcpu_lock_one(sc, vcpu);
2539 		vcpu_unlock_one(sc, vcpu);
2540 	}
2541 
2542 	vmmdev_devmem_purge(sc);
2543 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2544 		/*
2545 		 * The ZSD should be cleaned up now, unless destruction of the
2546 		 * instance was initiated by destruction of the containing
2547 		 * zone, in which case the ZSD has already been removed.
2548 		 */
2549 		vmm_zsd_rem_vm(sc);
2550 	}
2551 	zone_rele(sc->vmm_zone);
2552 
2553 	vmm_drv_purge(sc);
2554 }
2555 
2556 static bool
2557 vmm_destroy_ready(vmm_softc_t *sc)
2558 {
2559 	ASSERT(MUTEX_HELD(&vmm_mtx));
2560 
2561 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2562 		VERIFY(list_is_empty(&sc->vmm_holds));
2563 		return (true);
2564 	}
2565 
2566 	return (false);
2567 }
2568 
2569 static void
2570 vmm_destroy_finish(vmm_softc_t *sc)
2571 {
2572 	ASSERT(MUTEX_HELD(&vmm_mtx));
2573 	ASSERT(vmm_destroy_ready(sc));
2574 
2575 	list_remove(&vmm_list, sc);
2576 	vmm_kstat_fini(sc);
2577 	vm_destroy(sc->vmm_vm);
2578 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2579 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2580 
2581 	const minor_t minor = sc->vmm_minor;
2582 	ddi_soft_state_free(vmm_statep, minor);
2583 	id_free(vmm_minors, minor);
2584 }
2585 
2586 /*
2587  * Initiate or attempt to finish destruction of a VMM instance.
2588  *
2589  * This is called from several contexts:
2590  * - An explicit destroy ioctl is made
2591  * - A vmm_drv consumer releases its hold (being the last on the instance)
2592  * - The vmm device is closed, and auto-destruct is enabled
2593  */
2594 static int
2595 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2596     bool *hma_release)
2597 {
2598 	ASSERT(MUTEX_HELD(&vmm_mtx));
2599 
2600 	*hma_release = false;
2601 
2602 	/*
2603 	 * When instance destruction begins, the instance is marked so that
2604 	 * any further requests to operate on it will fail.
2605 	 */
2606 	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2607 		vmm_destroy_begin(sc, opts);
2608 	}
2609 
2610 	if (vmm_destroy_ready(sc)) {
2611 
2612 		/*
2613 		 * Notify anyone waiting for the destruction to finish. They
2614 		 * must be clear before we can safely tear down the softc.
2615 		 */
2616 		if (sc->vmm_destroy_waiters != 0) {
2617 			cv_broadcast(&sc->vmm_cv);
2618 			while (sc->vmm_destroy_waiters != 0) {
2619 				cv_wait(&sc->vmm_cv, &vmm_mtx);
2620 			}
2621 		}
2622 
2623 		/*
2624 		 * Finish destruction of the instance. After this point, the
2625 		 * softc is freed and cannot be accessed again.
2626 		 *
2627 		 * With destruction complete, the HMA hold can be released.
2628 		 */
2629 		vmm_destroy_finish(sc);
2630 		*hma_release = true;
2631 		return (0);
2632 	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2633 		int err = 0;
2634 
2635 		sc->vmm_destroy_waiters++;
2636 		while (!vmm_destroy_ready(sc) && err == 0) {
2637 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2638 				err = EINTR;
2639 			}
2640 		}
2641 		sc->vmm_destroy_waiters--;
2642 
2643 		if (sc->vmm_destroy_waiters == 0) {
2644 			/*
2645 			 * If we were the last waiter, it could be that VM
2646 			 * destruction is waiting on _us_ to proceed with the
2647 			 * final clean-up.
2648 			 */
2649 			cv_signal(&sc->vmm_cv);
2650 		}
2651 		return (err);
2652 	} else {
2653 		/*
2654 		 * Since the instance is not ready for destruction, and the
2655 		 * caller did not ask to wait, consider it a success for now.
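		 *
		 * In all of these cases the caller must act on *hma_release
		 * once vmm_mtx has been dropped, as vmm_zone_vm_destroy()
		 * below does:
		 *
		 *	mutex_enter(&vmm_mtx);
		 *	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD,
		 *	    &hma_release);
		 *	mutex_exit(&vmm_mtx);
		 *	if (hma_release) {
		 *		vmm_hma_release();
		 *	}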
2656 */ 2657 return (0); 2658 } 2659 } 2660 2661 void 2662 vmm_zone_vm_destroy(vmm_softc_t *sc) 2663 { 2664 bool hma_release = false; 2665 int err; 2666 2667 mutex_enter(&vmm_mtx); 2668 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2669 mutex_exit(&vmm_mtx); 2670 2671 VERIFY0(err); 2672 2673 if (hma_release) { 2674 vmm_hma_release(); 2675 } 2676 } 2677 2678 static int 2679 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2680 { 2681 vmm_softc_t *sc; 2682 bool hma_release = false; 2683 int err; 2684 2685 if (crgetuid(cr) != 0) { 2686 return (EPERM); 2687 } 2688 2689 mutex_enter(&vmm_mtx); 2690 sc = vmm_lookup(req->name); 2691 if (sc == NULL) { 2692 mutex_exit(&vmm_mtx); 2693 return (ENOENT); 2694 } 2695 /* 2696 * We don't check this in vmm_lookup() since that function is also used 2697 * for validation during create and currently vmm names must be unique. 2698 */ 2699 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2700 mutex_exit(&vmm_mtx); 2701 return (EPERM); 2702 } 2703 2704 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2705 mutex_exit(&vmm_mtx); 2706 2707 if (hma_release) { 2708 vmm_hma_release(); 2709 } 2710 2711 return (err); 2712 } 2713 2714 #define VCPU_NAME_BUFLEN 32 2715 2716 static int 2717 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2718 { 2719 zoneid_t zid = crgetzoneid(cr); 2720 int instance = minor; 2721 kstat_t *ksp; 2722 2723 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2724 2725 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2726 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2727 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2728 2729 if (ksp == NULL) { 2730 return (-1); 2731 } 2732 sc->vmm_kstat_vm = ksp; 2733 2734 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2735 char namebuf[VCPU_NAME_BUFLEN]; 2736 2737 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2738 2739 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2740 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2741 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2742 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2743 0, zid); 2744 if (ksp == NULL) { 2745 goto fail; 2746 } 2747 2748 sc->vmm_kstat_vcpu[i] = ksp; 2749 } 2750 2751 /* 2752 * If this instance is associated with a non-global zone, make its 2753 * kstats visible from the GZ. 
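	 * From the global zone the result can then be inspected with, for
	 * example, `kstat -m vmm -i <instance>`, where the instance number
	 * matches the vmm minor used above.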
2754 */ 2755 if (zid != GLOBAL_ZONEID) { 2756 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2757 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2758 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2759 } 2760 } 2761 2762 return (0); 2763 2764 fail: 2765 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2766 if (sc->vmm_kstat_vcpu[i] != NULL) { 2767 kstat_delete(sc->vmm_kstat_vcpu[i]); 2768 sc->vmm_kstat_vcpu[i] = NULL; 2769 } else { 2770 break; 2771 } 2772 } 2773 kstat_delete(sc->vmm_kstat_vm); 2774 sc->vmm_kstat_vm = NULL; 2775 return (-1); 2776 } 2777 2778 static void 2779 vmm_kstat_init(vmm_softc_t *sc) 2780 { 2781 kstat_t *ksp; 2782 2783 ASSERT3P(sc->vmm_vm, !=, NULL); 2784 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2785 2786 ksp = sc->vmm_kstat_vm; 2787 vmm_kstats_t *vk = ksp->ks_data; 2788 ksp->ks_private = sc->vmm_vm; 2789 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2790 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2791 2792 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2793 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2794 2795 ksp = sc->vmm_kstat_vcpu[i]; 2796 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2797 2798 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2799 vvk->vvk_vcpu.value.ui32 = i; 2800 kstat_named_init(&vvk->vvk_time_init, "time_init", 2801 KSTAT_DATA_UINT64); 2802 kstat_named_init(&vvk->vvk_time_run, "time_run", 2803 KSTAT_DATA_UINT64); 2804 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2805 KSTAT_DATA_UINT64); 2806 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2807 KSTAT_DATA_UINT64); 2808 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2809 KSTAT_DATA_UINT64); 2810 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2811 KSTAT_DATA_UINT64); 2812 ksp->ks_private = sc->vmm_vm; 2813 ksp->ks_update = vmm_kstat_update_vcpu; 2814 } 2815 2816 kstat_install(sc->vmm_kstat_vm); 2817 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2818 kstat_install(sc->vmm_kstat_vcpu[i]); 2819 } 2820 } 2821 2822 static void 2823 vmm_kstat_fini(vmm_softc_t *sc) 2824 { 2825 ASSERT(sc->vmm_kstat_vm != NULL); 2826 2827 kstat_delete(sc->vmm_kstat_vm); 2828 sc->vmm_kstat_vm = NULL; 2829 2830 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2831 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2832 2833 kstat_delete(sc->vmm_kstat_vcpu[i]); 2834 sc->vmm_kstat_vcpu[i] = NULL; 2835 } 2836 } 2837 2838 static int 2839 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2840 { 2841 minor_t minor; 2842 vmm_softc_t *sc; 2843 2844 /* 2845 * Forbid running bhyve in a 32-bit process until it has been tested and 2846 * verified to be safe. 2847 */ 2848 if (curproc->p_model != DATAMODEL_LP64) { 2849 return (EFBIG); 2850 } 2851 2852 minor = getminor(*devp); 2853 if (minor == VMM_CTL_MINOR) { 2854 /* 2855 * Master control device must be opened exclusively. 
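		 * In userspace terms this means passing O_EXCL on open, e.g.
		 * open("/dev/vmmctl", O_RDWR | O_EXCL).  The exact /dev path
		 * for the control node is an assumption here and depends on
		 * how the minor node is linked.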
2856 		 */
2857 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2858 			return (EINVAL);
2859 		}
2860 
2861 		return (0);
2862 	}
2863 
2864 	mutex_enter(&vmm_mtx);
2865 	sc = ddi_get_soft_state(vmm_statep, minor);
2866 	if (sc == NULL) {
2867 		mutex_exit(&vmm_mtx);
2868 		return (ENXIO);
2869 	}
2870 
2871 	sc->vmm_flags |= VMM_IS_OPEN;
2872 	mutex_exit(&vmm_mtx);
2873 
2874 	return (0);
2875 }
2876 
2877 static int
2878 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2879 {
2880 	const minor_t minor = getminor(dev);
2881 	vmm_softc_t *sc;
2882 	bool hma_release = false;
2883 
2884 	if (minor == VMM_CTL_MINOR) {
2885 		return (0);
2886 	}
2887 
2888 	mutex_enter(&vmm_mtx);
2889 	sc = ddi_get_soft_state(vmm_statep, minor);
2890 	if (sc == NULL) {
2891 		mutex_exit(&vmm_mtx);
2892 		return (ENXIO);
2893 	}
2894 
2895 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2896 	sc->vmm_flags &= ~VMM_IS_OPEN;
2897 
2898 	/*
2899 	 * If the instance was marked for auto-destruction, begin that now. Its
2900 	 * destruction may have been initiated already, so try to make progress
2901 	 * in that case, since closure of the device is one of its requirements.
2902 	 */
2903 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2904 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2905 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2906 	}
2907 	mutex_exit(&vmm_mtx);
2908 
2909 	if (hma_release) {
2910 		vmm_hma_release();
2911 	}
2912 
2913 	return (0);
2914 }
2915 
2916 static int
2917 vmm_is_supported(intptr_t arg)
2918 {
2919 	int r;
2920 	const char *msg;
2921 
2922 	if (vmm_is_intel()) {
2923 		r = vmx_x86_supported(&msg);
2924 	} else if (vmm_is_svm()) {
2925 		/*
2926 		 * HMA already ensured that the features necessary for SVM
2927 		 * operation were present and online during vmm_attach().
2928 		 */
2929 		r = 0;
2930 	} else {
2931 		r = ENXIO;
2932 		msg = "Unsupported CPU vendor";
2933 	}
2934 
2935 	if (r != 0 && arg != (intptr_t)NULL) {
2936 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2937 			return (EFAULT);
2938 	}
2939 	return (r);
2940 }
2941 
2942 static int
2943 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2944 {
2945 	void *argp = (void *)arg;
2946 
2947 	switch (cmd) {
2948 	case VMM_CREATE_VM: {
2949 		struct vm_create_req req;
2950 
2951 		if ((md & FWRITE) == 0) {
2952 			return (EPERM);
2953 		}
2954 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2955 			return (EFAULT);
2956 		}
2957 		return (vmmdev_do_vm_create(&req, cr));
2958 	}
2959 	case VMM_DESTROY_VM: {
2960 		struct vm_destroy_req req;
2961 
2962 		if ((md & FWRITE) == 0) {
2963 			return (EPERM);
2964 		}
2965 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2966 			return (EFAULT);
2967 		}
2968 		return (vmmdev_do_vm_destroy(&req, cr));
2969 	}
2970 	case VMM_VM_SUPPORTED:
2971 		return (vmm_is_supported(arg));
2972 	case VMM_CHECK_IOMMU:
2973 		if (!vmm_check_iommu()) {
2974 			return (ENXIO);
2975 		}
2976 		return (0);
2977 	case VMM_RESV_QUERY:
2978 	case VMM_RESV_SET_TARGET:
2979 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2980 	default:
2981 		break;
2982 	}
2983 	/* No other actions are legal on the ctl device */
2984 	return (ENOTTY);
2985 }
2986 
2987 static int
2988 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2989     int *rvalp)
2990 {
2991 	vmm_softc_t *sc;
2992 	minor_t minor;
2993 
2994 	/*
2995 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2996 	 * verified to be safe.
2997 */ 2998 if (curproc->p_model != DATAMODEL_LP64) { 2999 return (EFBIG); 3000 } 3001 3002 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 3003 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 3004 return (ENOTSUP); 3005 } 3006 3007 /* 3008 * Regardless of minor (vmmctl or instance), we respond to queries of 3009 * the interface version. 3010 */ 3011 if (cmd == VMM_INTERFACE_VERSION) { 3012 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 3013 return (0); 3014 } 3015 3016 minor = getminor(dev); 3017 3018 if (minor == VMM_CTL_MINOR) { 3019 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 3020 } 3021 3022 sc = ddi_get_soft_state(vmm_statep, minor); 3023 ASSERT(sc != NULL); 3024 3025 /* 3026 * Turn away any ioctls against an instance when it is being destroyed. 3027 * (Except for the ioctl inquiring about that destroy-in-progress.) 3028 */ 3029 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3030 if (cmd == VM_DESTROY_PENDING) { 3031 *rvalp = 1; 3032 return (0); 3033 } 3034 return (ENXIO); 3035 } 3036 3037 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3038 } 3039 3040 static int 3041 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3042 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3043 { 3044 vmm_softc_t *sc; 3045 const minor_t minor = getminor(dev); 3046 int err; 3047 3048 if (minor == VMM_CTL_MINOR) { 3049 return (ENODEV); 3050 } 3051 if (off < 0 || (off + len) <= 0) { 3052 return (EINVAL); 3053 } 3054 if ((prot & PROT_USER) == 0) { 3055 return (EACCES); 3056 } 3057 3058 sc = ddi_get_soft_state(vmm_statep, minor); 3059 ASSERT(sc); 3060 3061 if (sc->vmm_flags & VMM_DESTROY) 3062 return (ENXIO); 3063 3064 /* Grab read lock on the VM to prevent any changes to the memory map */ 3065 vmm_read_lock(sc); 3066 3067 if (off >= VM_DEVMEM_START) { 3068 int segid; 3069 off_t segoff; 3070 3071 /* Mapping a devmem "device" */ 3072 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3073 err = ENODEV; 3074 } else { 3075 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3076 addrp, prot, maxprot, flags); 3077 } 3078 } else { 3079 /* Mapping a part of the guest physical space */ 3080 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3081 maxprot, flags); 3082 } 3083 3084 vmm_read_unlock(sc); 3085 return (err); 3086 } 3087 3088 static sdev_plugin_validate_t 3089 vmm_sdev_validate(sdev_ctx_t ctx) 3090 { 3091 const char *name = sdev_ctx_name(ctx); 3092 vmm_softc_t *sc; 3093 sdev_plugin_validate_t ret; 3094 minor_t minor; 3095 3096 if (sdev_ctx_vtype(ctx) != VCHR) 3097 return (SDEV_VTOR_INVALID); 3098 3099 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3100 3101 mutex_enter(&vmm_mtx); 3102 if ((sc = vmm_lookup(name)) == NULL) 3103 ret = SDEV_VTOR_INVALID; 3104 else if (sc->vmm_minor != minor) 3105 ret = SDEV_VTOR_STALE; 3106 else 3107 ret = SDEV_VTOR_VALID; 3108 mutex_exit(&vmm_mtx); 3109 3110 return (ret); 3111 } 3112 3113 static int 3114 vmm_sdev_filldir(sdev_ctx_t ctx) 3115 { 3116 vmm_softc_t *sc; 3117 int ret; 3118 3119 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3120 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3121 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3122 return (EINVAL); 3123 } 3124 3125 mutex_enter(&vmm_mtx); 3126 ASSERT(vmmdev_dip != NULL); 3127 for (sc = list_head(&vmm_list); sc != NULL; 3128 sc = list_next(&vmm_list, sc)) { 3129 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3130 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3131 S_IFCHR | 0600, 3132 
makedevice(ddi_driver_major(vmmdev_dip), 3133 sc->vmm_minor)); 3134 } else { 3135 continue; 3136 } 3137 if (ret != 0 && ret != EEXIST) 3138 goto out; 3139 } 3140 3141 ret = 0; 3142 3143 out: 3144 mutex_exit(&vmm_mtx); 3145 return (ret); 3146 } 3147 3148 /* ARGSUSED */ 3149 static void 3150 vmm_sdev_inactive(sdev_ctx_t ctx) 3151 { 3152 } 3153 3154 static sdev_plugin_ops_t vmm_sdev_ops = { 3155 .spo_version = SDEV_PLUGIN_VERSION, 3156 .spo_flags = SDEV_PLUGIN_SUBDIR, 3157 .spo_validate = vmm_sdev_validate, 3158 .spo_filldir = vmm_sdev_filldir, 3159 .spo_inactive = vmm_sdev_inactive 3160 }; 3161 3162 /* ARGSUSED */ 3163 static int 3164 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3165 { 3166 int error; 3167 3168 switch (cmd) { 3169 case DDI_INFO_DEVT2DEVINFO: 3170 *result = (void *)vmmdev_dip; 3171 error = DDI_SUCCESS; 3172 break; 3173 case DDI_INFO_DEVT2INSTANCE: 3174 *result = (void *)0; 3175 error = DDI_SUCCESS; 3176 break; 3177 default: 3178 error = DDI_FAILURE; 3179 break; 3180 } 3181 return (error); 3182 } 3183 3184 static int 3185 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3186 { 3187 sdev_plugin_hdl_t sph; 3188 hma_reg_t *reg = NULL; 3189 boolean_t vmm_loaded = B_FALSE; 3190 3191 if (cmd != DDI_ATTACH) { 3192 return (DDI_FAILURE); 3193 } 3194 3195 mutex_enter(&vmmdev_mtx); 3196 /* Ensure we are not already attached. */ 3197 if (vmmdev_dip != NULL) { 3198 mutex_exit(&vmmdev_mtx); 3199 return (DDI_FAILURE); 3200 } 3201 3202 vmm_sol_glue_init(); 3203 3204 /* 3205 * Perform temporary HMA registration to determine if the system 3206 * is capable. 3207 */ 3208 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3209 goto fail; 3210 } else if (vmm_mod_load() != 0) { 3211 goto fail; 3212 } 3213 vmm_loaded = B_TRUE; 3214 hma_unregister(reg); 3215 reg = NULL; 3216 3217 /* Create control node. Other nodes will be created on demand. */ 3218 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3219 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3220 goto fail; 3221 } 3222 3223 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3224 if (sph == (sdev_plugin_hdl_t)NULL) { 3225 ddi_remove_minor_node(dip, NULL); 3226 goto fail; 3227 } 3228 3229 ddi_report_dev(dip); 3230 vmmdev_sdev_hdl = sph; 3231 vmmdev_dip = dip; 3232 mutex_exit(&vmmdev_mtx); 3233 return (DDI_SUCCESS); 3234 3235 fail: 3236 if (vmm_loaded) { 3237 VERIFY0(vmm_mod_unload()); 3238 } 3239 if (reg != NULL) { 3240 hma_unregister(reg); 3241 } 3242 vmm_sol_glue_cleanup(); 3243 mutex_exit(&vmmdev_mtx); 3244 return (DDI_FAILURE); 3245 } 3246 3247 static int 3248 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3249 { 3250 if (cmd != DDI_DETACH) { 3251 return (DDI_FAILURE); 3252 } 3253 3254 /* 3255 * Ensure that all resources have been cleaned up. 3256 * 3257 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3258 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3259 * devinfo locked as iommu_cleanup() tries to recursively lock each 3260 * devinfo, including our own, while holding vmmdev_mtx. 
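	 * Hence the mutex_tryenter() below: if the lock cannot be taken
	 * immediately, the detach simply fails rather than risking that
	 * lock-order inversion.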
3261 */ 3262 if (mutex_tryenter(&vmmdev_mtx) == 0) 3263 return (DDI_FAILURE); 3264 3265 mutex_enter(&vmm_mtx); 3266 if (!list_is_empty(&vmm_list)) { 3267 mutex_exit(&vmm_mtx); 3268 mutex_exit(&vmmdev_mtx); 3269 return (DDI_FAILURE); 3270 } 3271 mutex_exit(&vmm_mtx); 3272 3273 if (!vmmr_is_empty()) { 3274 mutex_exit(&vmmdev_mtx); 3275 return (DDI_FAILURE); 3276 } 3277 3278 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3279 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3280 mutex_exit(&vmmdev_mtx); 3281 return (DDI_FAILURE); 3282 } 3283 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3284 3285 /* Remove the control node. */ 3286 ddi_remove_minor_node(dip, "ctl"); 3287 vmmdev_dip = NULL; 3288 3289 VERIFY0(vmm_mod_unload()); 3290 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3291 vmm_sol_glue_cleanup(); 3292 3293 mutex_exit(&vmmdev_mtx); 3294 3295 return (DDI_SUCCESS); 3296 } 3297 3298 static struct cb_ops vmm_cb_ops = { 3299 vmm_open, 3300 vmm_close, 3301 nodev, /* strategy */ 3302 nodev, /* print */ 3303 nodev, /* dump */ 3304 nodev, /* read */ 3305 nodev, /* write */ 3306 vmm_ioctl, 3307 nodev, /* devmap */ 3308 nodev, /* mmap */ 3309 vmm_segmap, 3310 nochpoll, /* poll */ 3311 ddi_prop_op, 3312 NULL, 3313 D_NEW | D_MP | D_DEVMAP 3314 }; 3315 3316 static struct dev_ops vmm_ops = { 3317 DEVO_REV, 3318 0, 3319 vmm_info, 3320 nulldev, /* identify */ 3321 nulldev, /* probe */ 3322 vmm_attach, 3323 vmm_detach, 3324 nodev, /* reset */ 3325 &vmm_cb_ops, 3326 (struct bus_ops *)NULL 3327 }; 3328 3329 static struct modldrv modldrv = { 3330 &mod_driverops, 3331 "bhyve vmm", 3332 &vmm_ops 3333 }; 3334 3335 static struct modlinkage modlinkage = { 3336 MODREV_1, 3337 &modldrv, 3338 NULL 3339 }; 3340 3341 int 3342 _init(void) 3343 { 3344 int error; 3345 3346 sysinit(); 3347 3348 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3349 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3350 list_create(&vmm_list, sizeof (vmm_softc_t), 3351 offsetof(vmm_softc_t, vmm_node)); 3352 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3353 3354 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3355 if (error) { 3356 return (error); 3357 } 3358 3359 error = vmmr_init(); 3360 if (error) { 3361 ddi_soft_state_fini(&vmm_statep); 3362 return (error); 3363 } 3364 3365 vmm_zsd_init(); 3366 3367 error = mod_install(&modlinkage); 3368 if (error) { 3369 ddi_soft_state_fini(&vmm_statep); 3370 vmm_zsd_fini(); 3371 vmmr_fini(); 3372 } 3373 3374 return (error); 3375 } 3376 3377 int 3378 _fini(void) 3379 { 3380 int error; 3381 3382 error = mod_remove(&modlinkage); 3383 if (error) { 3384 return (error); 3385 } 3386 3387 vmm_zsd_fini(); 3388 vmmr_fini(); 3389 3390 ddi_soft_state_fini(&vmm_statep); 3391 3392 return (0); 3393 } 3394 3395 int 3396 _info(struct modinfo *modinfop) 3397 { 3398 return (mod_info(&modlinkage, modinfop)); 3399 } 3400
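
/*
 * For illustration only: a userspace consumer might exercise the control
 * device entry points above roughly as follows.  The "/dev/vmmctl" path and
 * the field layout of struct vm_create_req are assumptions here; the
 * authoritative definitions live in sys/vmm_dev.h.
 *
 *	int fd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	int version = ioctl(fd, VMM_INTERFACE_VERSION, 0);
 *
 *	struct vm_create_req req = { 0 };
 *	(void) strlcpy(req.name, "testvm", sizeof (req.name));
 *	if (ioctl(fd, VMM_CREATE_VM, &req) == 0) {
 *		... the new instance then appears under /dev/vmm/testvm ...
 *	}
 */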