/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

static kmutex_t vmmdev_mtx;
static dev_info_t *vmmdev_dip;
static hma_reg_t *vmmdev_hma_reg;
static uint_t vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t vmm_mtx;
static list_t vmm_list;
static id_space_t *vmm_minors;
static void *vmm_statep;

/* temporary safety switch */
int vmm_allow_state_writes;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT	"/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

struct vmm_lease {
	list_node_t	vml_node;
	struct vm	*vml_vm;
	vm_client_t	*vml_vmclient;
	boolean_t	vml_expired;
	boolean_t	vml_break_deferred;
	boolean_t	(*vml_expire_func)(void *);
	void		*vml_expire_arg;
	struct vmm_hold	*vml_hold;
};

/* Options for vmm_destroy_locked */
typedef enum vmm_destroy_opts {
	VDO_DEFAULT		= 0,
	/*
	 * Indicate that zone-specific data associated with this VM should not
	 * be cleaned up as part of the destroy.  Skipping ZSD clean-up is
	 * necessary when the VM is being destroyed as part of zone
	 * destruction, when said ZSD is already being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),
	/*
	 * Attempt to wait for VM destruction to complete.  This is opt-in,
	 * since there are many normal conditions which could lead to
	 * destruction being stalled pending other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= (1 << 1),
} vmm_destroy_opts_t;

static void vmm_hma_release(void);
static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  Direct access to the underlying vm memory
 * segments is exposed by placing them in a range of offsets beyond the normal
 * guest memory space.  Userspace can query the appropriate offset to mmap()
 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
 */
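/*
 * As a rough illustration of the scheme described above (a sketch, not part
 * of the driver): a userspace consumer holding an open instance fd could
 * resolve and map a devmem segment along these lines, where 'vmfd', 'segid',
 * and 'seg_len' are placeholders for values established when the segment was
 * allocated via VM_ALLOC_MEMSEG:
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *		...
 *	}
 *
 * The offsets handed back all fall at or above VM_DEVMEM_START, keeping them
 * disjoint from the guest-physical offsets used to map ordinary guest memory.
 */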
static vmm_devmem_entry_t *
vmmdev_devmem_find(vmm_softc_t *sc, int segid)
{
	vmm_devmem_entry_t *ent = NULL;
	list_t *dl = &sc->vmm_devmem_list;

	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
		if (ent->vde_segid == segid) {
			return (ent);
		}
	}
	return (NULL);
}

static int
vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
{
	int error;
	bool sysmem;

	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
	    NULL);
	if (error || mseg->len == 0)
		return (error);

	if (!sysmem) {
		vmm_devmem_entry_t *de;

		de = vmmdev_devmem_find(sc, mseg->segid);
		if (de != NULL) {
			(void) strlcpy(mseg->name, de->vde_name,
			    sizeof (mseg->name));
		}
	} else {
		bzero(mseg->name, sizeof (mseg->name));
	}

	return (error);
}

static int
vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
{
	off_t map_offset;
	vmm_devmem_entry_t *entry;

	if (list_is_empty(&sc->vmm_devmem_list)) {
		map_offset = VM_DEVMEM_START;
	} else {
		entry = list_tail(&sc->vmm_devmem_list);
		map_offset = entry->vde_off + entry->vde_len;
		if (map_offset < entry->vde_off) {
			/* Do not tolerate overflow */
			return (ERANGE);
		}
		/*
		 * XXXJOY: We could choose to search the list for duplicate
		 * names and toss an error.  Since we're using the offset
		 * method for now, it does not make much of a difference.
212 */ 213 } 214 215 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 216 entry->vde_segid = mseg->segid; 217 entry->vde_len = mseg->len; 218 entry->vde_off = map_offset; 219 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 220 list_insert_tail(&sc->vmm_devmem_list, entry); 221 222 return (0); 223 } 224 225 static boolean_t 226 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 227 off_t *map_offp) 228 { 229 list_t *dl = &sc->vmm_devmem_list; 230 vmm_devmem_entry_t *de = NULL; 231 const off_t map_end = off + len; 232 233 VERIFY(off >= VM_DEVMEM_START); 234 235 if (map_end < off) { 236 /* No match on overflow */ 237 return (B_FALSE); 238 } 239 240 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 241 const off_t item_end = de->vde_off + de->vde_len; 242 243 if (de->vde_off <= off && item_end >= map_end) { 244 *segidp = de->vde_segid; 245 *map_offp = off - de->vde_off; 246 return (B_TRUE); 247 } 248 } 249 return (B_FALSE); 250 } 251 252 /* 253 * When an instance is being destroyed, the devmem list of named memory objects 254 * can be torn down, as no new mappings are allowed. 255 */ 256 static void 257 vmmdev_devmem_purge(vmm_softc_t *sc) 258 { 259 vmm_devmem_entry_t *entry; 260 261 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 262 kmem_free(entry, sizeof (*entry)); 263 } 264 } 265 266 static int 267 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 268 { 269 int error; 270 bool sysmem = true; 271 272 if (VM_MEMSEG_NAME(mseg)) { 273 sysmem = false; 274 } 275 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 276 277 if (error == 0) { 278 /* 279 * Rather than create a whole fresh device from which userspace 280 * can mmap this segment, instead make it available at an 281 * offset above where the main guest memory resides. 282 */ 283 error = vmmdev_devmem_create(sc, mseg, mseg->name); 284 if (error != 0) { 285 vm_free_memseg(sc->vmm_vm, mseg->segid); 286 } 287 } 288 return (error); 289 } 290 291 /* 292 * Resource Locking and Exclusion 293 * 294 * Much of bhyve depends on key portions of VM state, such as the guest memory 295 * map, to remain unchanged while the guest is running. As ported from 296 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 297 * access to the instance vCPUs. Threads acting on a single vCPU, like those 298 * performing the work of actually running the guest in VMX/SVM, would lock 299 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 300 * state, all of the vCPUs would be first locked, ensuring that the 301 * operation(s) could complete without any other threads stumbling into 302 * intermediate states. 303 * 304 * This approach is largely effective for bhyve. Common operations, such as 305 * running the vCPUs, steer clear of lock contention. The model begins to 306 * break down for operations which do not occur in the context of a specific 307 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 308 * thread in the bhyve process. In order to properly protect those vCPU-less 309 * operations from encountering invalid states, additional locking is required. 310 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 311 * It does mean that class of operations will be serialized on locking the 312 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 313 * undue contention on the VM_MAXCPU-1 vCPU. 
314 * 315 * In order to address the shortcomings of this model, the concept of a 316 * read/write lock has been added to bhyve. Operations which change 317 * fundamental aspects of a VM (such as the memory map) must acquire the write 318 * lock, which also implies locking all of the vCPUs and waiting for all read 319 * lock holders to release. While it increases the cost and waiting time for 320 * those few operations, it allows most hot-path operations on the VM (which 321 * depend on its configuration remaining stable) to occur with minimal locking. 322 * 323 * Consumers of the Driver API (see below) are a special case when it comes to 324 * this locking, since they may hold a read lock via the drv_lease mechanism 325 * for an extended period of time. Rather than forcing those consumers to 326 * continuously poll for a write lock attempt, the lease system forces them to 327 * provide a release callback to trigger their clean-up (and potential later 328 * reacquisition) of the read lock. 329 */ 330 331 static void 332 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 333 { 334 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 335 336 /* 337 * Since this state transition is utilizing from_idle=true, it should 338 * not fail, but rather block until it can be successful. 339 */ 340 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 341 } 342 343 static void 344 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 345 { 346 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 347 348 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 349 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 350 } 351 352 static void 353 vmm_read_lock(vmm_softc_t *sc) 354 { 355 rw_enter(&sc->vmm_rwlock, RW_READER); 356 } 357 358 static void 359 vmm_read_unlock(vmm_softc_t *sc) 360 { 361 rw_exit(&sc->vmm_rwlock); 362 } 363 364 static void 365 vmm_write_lock(vmm_softc_t *sc) 366 { 367 int maxcpus; 368 369 /* First lock all the vCPUs */ 370 maxcpus = vm_get_maxcpus(sc->vmm_vm); 371 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 372 vcpu_lock_one(sc, vcpu); 373 } 374 375 /* 376 * Block vmm_drv leases from being acquired or held while the VM write 377 * lock is held. 378 */ 379 vmm_lease_block(sc); 380 381 rw_enter(&sc->vmm_rwlock, RW_WRITER); 382 /* 383 * For now, the 'maxcpus' value for an instance is fixed at the 384 * compile-time constant of VM_MAXCPU at creation. If this changes in 385 * the future, allowing for dynamic vCPU resource sizing, acquisition 386 * of the write lock will need to be wary of such changes. 387 */ 388 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 389 } 390 391 static void 392 vmm_write_unlock(vmm_softc_t *sc) 393 { 394 int maxcpus; 395 396 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 397 vmm_lease_unblock(sc); 398 399 /* 400 * The VM write lock _must_ be released from the same thread it was 401 * acquired in, unlike the read lock. 402 */ 403 VERIFY(rw_write_held(&sc->vmm_rwlock)); 404 rw_exit(&sc->vmm_rwlock); 405 406 /* Unlock all the vCPUs */ 407 maxcpus = vm_get_maxcpus(sc->vmm_vm); 408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 409 vcpu_unlock_one(sc, vcpu); 410 } 411 } 412 413 static int 414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 415 cred_t *credp, int *rvalp) 416 { 417 int error = 0, vcpu = -1; 418 void *datap = (void *)arg; 419 enum vm_lock_type { 420 LOCK_NONE = 0, 421 LOCK_VCPU, 422 LOCK_READ_HOLD, 423 LOCK_WRITE_HOLD 424 } lock_type = LOCK_NONE; 425 426 /* Acquire any exclusion resources needed for the operation. 
 */
	switch (cmd) {
	case VM_RUN:
	case VM_GET_REGISTER:
	case VM_SET_REGISTER:
	case VM_GET_SEGMENT_DESCRIPTOR:
	case VM_SET_SEGMENT_DESCRIPTOR:
	case VM_GET_REGISTER_SET:
	case VM_SET_REGISTER_SET:
	case VM_INJECT_EXCEPTION:
	case VM_GET_CAPABILITY:
	case VM_SET_CAPABILITY:
	case VM_PPTDEV_MSI:
	case VM_PPTDEV_MSIX:
	case VM_SET_X2APIC_STATE:
	case VM_GLA2GPA:
	case VM_GLA2GPA_NOFAULT:
	case VM_ACTIVATE_CPU:
	case VM_SET_INTINFO:
	case VM_GET_INTINFO:
	case VM_RESTART_INSTRUCTION:
	case VM_SET_KERNEMU_DEV:
	case VM_GET_KERNEMU_DEV:
	case VM_RESET_CPU:
	case VM_GET_RUN_STATE:
	case VM_SET_RUN_STATE:
	case VM_GET_FPU:
	case VM_SET_FPU:
	case VM_GET_CPUID:
	case VM_SET_CPUID:
	case VM_LEGACY_CPUID:
		/*
		 * Copy in the ID of the vCPU chosen for this operation.
		 * Since a nefarious caller could update their struct between
		 * this locking and when the rest of the ioctl data is copied
		 * in, it is _critical_ that this local 'vcpu' variable be used
		 * rather than the in-struct one when performing the ioctl.
		 */
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
			return (EINVAL);
		}
		vcpu_lock_one(sc, vcpu);
		lock_type = LOCK_VCPU;
		break;

	case VM_REINIT:
	case VM_BIND_PPTDEV:
	case VM_UNBIND_PPTDEV:
	case VM_MAP_PPTDEV_MMIO:
	case VM_UNMAP_PPTDEV_MMIO:
	case VM_ALLOC_MEMSEG:
	case VM_MMAP_MEMSEG:
	case VM_MUNMAP_MEMSEG:
	case VM_WRLOCK_CYCLE:
	case VM_PMTMR_LOCATE:
	case VM_PAUSE:
	case VM_RESUME:
		vmm_write_lock(sc);
		lock_type = LOCK_WRITE_HOLD;
		break;

	case VM_GET_MEMSEG:
	case VM_MMAP_GETNEXT:
	case VM_LAPIC_IRQ:
	case VM_INJECT_NMI:
	case VM_IOAPIC_ASSERT_IRQ:
	case VM_IOAPIC_DEASSERT_IRQ:
	case VM_IOAPIC_PULSE_IRQ:
	case VM_LAPIC_MSI:
	case VM_LAPIC_LOCAL_IRQ:
	case VM_GET_X2APIC_STATE:
	case VM_RTC_READ:
	case VM_RTC_WRITE:
	case VM_RTC_SETTIME:
	case VM_RTC_GETTIME:
	case VM_PPTDEV_DISABLE_MSIX:
	case VM_DEVMEM_GETOFFSET:
	case VM_TRACK_DIRTY_PAGES:
		vmm_read_lock(sc);
		lock_type = LOCK_READ_HOLD;
		break;

	case VM_DATA_READ:
	case VM_DATA_WRITE:
		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
		if (vcpu == -1) {
			/* Access data for VM-wide devices */
			vmm_write_lock(sc);
			lock_type = LOCK_WRITE_HOLD;
		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
			/* Access data associated with a specific vCPU */
			vcpu_lock_one(sc, vcpu);
			lock_type = LOCK_VCPU;
		} else {
			return (EINVAL);
		}
		break;

	case VM_GET_GPA_PMAP:
	case VM_IOAPIC_PINCOUNT:
	case VM_SUSPEND:
	case VM_DESC_FPU_AREA:
	case VM_SET_AUTODESTRUCT:
	case VM_DESTROY_SELF:
	case VM_DESTROY_PENDING:
	default:
		break;
	}

	/* Execute the primary logic for the ioctl. */
	switch (cmd) {
	case VM_RUN: {
		struct vm_entry entry;

		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
			error = EFAULT;
			break;
		}

		if (!(curthread->t_schedflag & TS_VCPU))
			smt_mark_as_vcpu();

		error = vm_run(sc->vmm_vm, vcpu, &entry);

		/*
		 * Unexpected states in vm_run() are expressed through positive
		 * errno-oriented return values.
VM states which expect further 558 * processing in userspace (necessary context via exitinfo) are 559 * expressed through negative return values. For the time being 560 * a return value of 0 is not expected from vm_run(). 561 */ 562 ASSERT(error != 0); 563 if (error < 0) { 564 const struct vm_exit *vme; 565 void *outp = entry.exit_data; 566 567 error = 0; 568 vme = vm_exitinfo(sc->vmm_vm, vcpu); 569 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 570 error = EFAULT; 571 } 572 } 573 break; 574 } 575 case VM_SUSPEND: { 576 struct vm_suspend vmsuspend; 577 578 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 579 error = EFAULT; 580 break; 581 } 582 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 583 break; 584 } 585 case VM_REINIT: { 586 struct vm_reinit reinit; 587 588 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 589 error = EFAULT; 590 break; 591 } 592 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 593 /* 594 * The VM instance should be free of driver-attached 595 * hooks during the reinitialization process. 596 */ 597 break; 598 } 599 error = vm_reinit(sc->vmm_vm, reinit.flags); 600 (void) vmm_drv_block_hook(sc, B_FALSE); 601 break; 602 } 603 case VM_STAT_DESC: { 604 struct vm_stat_desc statdesc; 605 606 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 607 error = EFAULT; 608 break; 609 } 610 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 611 sizeof (statdesc.desc)); 612 if (error == 0 && 613 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 614 error = EFAULT; 615 break; 616 } 617 break; 618 } 619 case VM_STATS_IOC: { 620 struct vm_stats vmstats; 621 622 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 623 error = EFAULT; 624 break; 625 } 626 hrt2tv(gethrtime(), &vmstats.tv); 627 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 628 nitems(vmstats.statbuf), 629 &vmstats.num_entries, vmstats.statbuf); 630 if (error == 0 && 631 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 632 error = EFAULT; 633 break; 634 } 635 break; 636 } 637 638 case VM_PPTDEV_MSI: { 639 struct vm_pptdev_msi pptmsi; 640 641 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 642 error = EFAULT; 643 break; 644 } 645 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 646 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 647 break; 648 } 649 case VM_PPTDEV_MSIX: { 650 struct vm_pptdev_msix pptmsix; 651 652 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 653 error = EFAULT; 654 break; 655 } 656 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 657 pptmsix.idx, pptmsix.addr, pptmsix.msg, 658 pptmsix.vector_control); 659 break; 660 } 661 case VM_PPTDEV_DISABLE_MSIX: { 662 struct vm_pptdev pptdev; 663 664 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 665 error = EFAULT; 666 break; 667 } 668 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 669 break; 670 } 671 case VM_MAP_PPTDEV_MMIO: { 672 struct vm_pptdev_mmio pptmmio; 673 674 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 675 error = EFAULT; 676 break; 677 } 678 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 679 pptmmio.len, pptmmio.hpa); 680 break; 681 } 682 case VM_UNMAP_PPTDEV_MMIO: { 683 struct vm_pptdev_mmio pptmmio; 684 685 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 686 error = EFAULT; 687 break; 688 } 689 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 690 pptmmio.len); 691 break; 692 } 693 case VM_BIND_PPTDEV: { 694 struct vm_pptdev pptdev; 695 696 if (ddi_copyin(datap, &pptdev, sizeof 
(pptdev), md)) { 697 error = EFAULT; 698 break; 699 } 700 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 701 break; 702 } 703 case VM_UNBIND_PPTDEV: { 704 struct vm_pptdev pptdev; 705 706 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 707 error = EFAULT; 708 break; 709 } 710 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 711 break; 712 } 713 case VM_GET_PPTDEV_LIMITS: { 714 struct vm_pptdev_limits pptlimits; 715 716 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 717 error = EFAULT; 718 break; 719 } 720 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 721 &pptlimits.msi_limit, &pptlimits.msix_limit); 722 if (error == 0 && 723 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 724 error = EFAULT; 725 break; 726 } 727 break; 728 } 729 case VM_INJECT_EXCEPTION: { 730 struct vm_exception vmexc; 731 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 732 error = EFAULT; 733 break; 734 } 735 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 736 vmexc.error_code_valid != 0, vmexc.error_code, 737 vmexc.restart_instruction != 0); 738 break; 739 } 740 case VM_INJECT_NMI: { 741 struct vm_nmi vmnmi; 742 743 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 748 break; 749 } 750 case VM_LAPIC_IRQ: { 751 struct vm_lapic_irq vmirq; 752 753 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 754 error = EFAULT; 755 break; 756 } 757 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 758 break; 759 } 760 case VM_LAPIC_LOCAL_IRQ: { 761 struct vm_lapic_irq vmirq; 762 763 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 764 error = EFAULT; 765 break; 766 } 767 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 768 vmirq.vector); 769 break; 770 } 771 case VM_LAPIC_MSI: { 772 struct vm_lapic_msi vmmsi; 773 774 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 775 error = EFAULT; 776 break; 777 } 778 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 779 break; 780 } 781 782 case VM_IOAPIC_ASSERT_IRQ: { 783 struct vm_ioapic_irq ioapic_irq; 784 785 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 786 error = EFAULT; 787 break; 788 } 789 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 790 break; 791 } 792 case VM_IOAPIC_DEASSERT_IRQ: { 793 struct vm_ioapic_irq ioapic_irq; 794 795 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 796 error = EFAULT; 797 break; 798 } 799 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 800 break; 801 } 802 case VM_IOAPIC_PULSE_IRQ: { 803 struct vm_ioapic_irq ioapic_irq; 804 805 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 806 error = EFAULT; 807 break; 808 } 809 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 810 break; 811 } 812 case VM_IOAPIC_PINCOUNT: { 813 int pincount; 814 815 pincount = vioapic_pincount(sc->vmm_vm); 816 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 817 error = EFAULT; 818 break; 819 } 820 break; 821 } 822 case VM_DESC_FPU_AREA: { 823 struct vm_fpu_desc desc; 824 void *buf = NULL; 825 826 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 827 error = EFAULT; 828 break; 829 } 830 if (desc.vfd_num_entries > 64) { 831 error = EINVAL; 832 break; 833 } 834 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 835 desc.vfd_num_entries; 836 if (buf_sz != 0) { 837 buf = kmem_zalloc(buf_sz, KM_SLEEP); 838 } 839 840 /* 841 * For now, we are depending on vm_fpu_desc_entry and 842 * hma_xsave_state_desc_t having the 
		 * same format.
		 */
		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
		    sizeof (hma_xsave_state_desc_t));

		size_t req_size;
		const uint_t max_entries = hma_fpu_describe_xsave_state(
		    (hma_xsave_state_desc_t *)buf,
		    desc.vfd_num_entries,
		    &req_size);

		desc.vfd_req_size = req_size;
		desc.vfd_num_entries = max_entries;
		if (buf_sz != 0) {
			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
				error = EFAULT;
			}
			kmem_free(buf, buf_sz);
		}

		if (error == 0) {
			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
				error = EFAULT;
			}
		}
		break;
	}
	case VM_SET_AUTODESTRUCT: {
		/*
		 * Since this has to do with controlling the lifetime of the
		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
		 * than the vcpu-centric or rwlock exclusion mechanisms.
		 */
		mutex_enter(&vmm_mtx);
		if (arg != 0) {
			sc->vmm_flags |= VMM_AUTODESTROY;
		} else {
			sc->vmm_flags &= ~VMM_AUTODESTROY;
		}
		mutex_exit(&vmm_mtx);
		break;
	}
	case VM_DESTROY_SELF: {
		bool hma_release = false;

		/*
		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
		 * itself, rather than having to perform a racy name lookup as
		 * part of the destroy process.
		 *
		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
		 * order to kick the vCPUs out of guest context as part of any
		 * destruction, we do not need to worry about it ourselves using
		 * the `lock_type` logic here.
		 */
		mutex_enter(&vmm_mtx);
		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
		mutex_exit(&vmm_mtx);
		if (hma_release) {
			vmm_hma_release();
		}
		break;
	}
	case VM_DESTROY_PENDING: {
		/*
		 * If we have made it this far, then destruction of the instance
		 * has not been initiated.
909 */ 910 *rvalp = 0; 911 break; 912 } 913 914 case VM_ISA_ASSERT_IRQ: { 915 struct vm_isa_irq isa_irq; 916 917 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 918 error = EFAULT; 919 break; 920 } 921 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 922 if (error == 0 && isa_irq.ioapic_irq != -1) { 923 error = vioapic_assert_irq(sc->vmm_vm, 924 isa_irq.ioapic_irq); 925 } 926 break; 927 } 928 case VM_ISA_DEASSERT_IRQ: { 929 struct vm_isa_irq isa_irq; 930 931 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 932 error = EFAULT; 933 break; 934 } 935 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 936 if (error == 0 && isa_irq.ioapic_irq != -1) { 937 error = vioapic_deassert_irq(sc->vmm_vm, 938 isa_irq.ioapic_irq); 939 } 940 break; 941 } 942 case VM_ISA_PULSE_IRQ: { 943 struct vm_isa_irq isa_irq; 944 945 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 946 error = EFAULT; 947 break; 948 } 949 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 950 if (error == 0 && isa_irq.ioapic_irq != -1) { 951 error = vioapic_pulse_irq(sc->vmm_vm, 952 isa_irq.ioapic_irq); 953 } 954 break; 955 } 956 case VM_ISA_SET_IRQ_TRIGGER: { 957 struct vm_isa_irq_trigger isa_irq_trigger; 958 959 if (ddi_copyin(datap, &isa_irq_trigger, 960 sizeof (isa_irq_trigger), md)) { 961 error = EFAULT; 962 break; 963 } 964 error = vatpic_set_irq_trigger(sc->vmm_vm, 965 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 966 break; 967 } 968 969 case VM_MMAP_GETNEXT: { 970 struct vm_memmap mm; 971 972 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 973 error = EFAULT; 974 break; 975 } 976 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 977 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 978 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 979 error = EFAULT; 980 break; 981 } 982 break; 983 } 984 case VM_MMAP_MEMSEG: { 985 struct vm_memmap mm; 986 987 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 988 error = EFAULT; 989 break; 990 } 991 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 992 mm.len, mm.prot, mm.flags); 993 break; 994 } 995 case VM_MUNMAP_MEMSEG: { 996 struct vm_munmap mu; 997 998 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 999 error = EFAULT; 1000 break; 1001 } 1002 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1003 break; 1004 } 1005 case VM_ALLOC_MEMSEG: { 1006 struct vm_memseg vmseg; 1007 1008 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1009 error = EFAULT; 1010 break; 1011 } 1012 error = vmmdev_alloc_memseg(sc, &vmseg); 1013 break; 1014 } 1015 case VM_GET_MEMSEG: { 1016 struct vm_memseg vmseg; 1017 1018 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1019 error = EFAULT; 1020 break; 1021 } 1022 error = vmmdev_get_memseg(sc, &vmseg); 1023 if (error == 0 && 1024 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 1025 error = EFAULT; 1026 break; 1027 } 1028 break; 1029 } 1030 case VM_GET_REGISTER: { 1031 struct vm_register vmreg; 1032 1033 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1034 error = EFAULT; 1035 break; 1036 } 1037 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 1038 &vmreg.regval); 1039 if (error == 0 && 1040 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 1041 error = EFAULT; 1042 break; 1043 } 1044 break; 1045 } 1046 case VM_SET_REGISTER: { 1047 struct vm_register vmreg; 1048 1049 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1050 error = EFAULT; 1051 break; 1052 } 1053 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 1054 vmreg.regval); 1055 break; 1056 } 1057 case 
VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
			 */
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
			    regvals[i]);
		}
		break;
	}
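	/*
	 * A rough userspace sketch of the batched register interface above
	 * (illustrative only; 'vmfd' and the choice of RIP/RSP are arbitrary,
	 * and the leading vCPU member is shown as '.cpuid' although the actual
	 * field name may differ).  The vCPU ID occupies the leading int of the
	 * struct, which is what the common copyin at the top of this function
	 * relies upon:
	 *
	 *	int regnums[2] = { VM_REG_GUEST_RIP, VM_REG_GUEST_RSP };
	 *	uint64_t regvals[2];
	 *	struct vm_register_set vrs = {
	 *		.cpuid = 0,
	 *		.count = 2,
	 *		.regnums = regnums,
	 *		.regvals = regvals,
	 *	};
	 *
	 *	if (ioctl(vmfd, VM_GET_REGISTER_SET, &vrs) == 0) {
	 *		... regvals[] now holds RIP and RSP for vCPU 0 ...
	 *	}
	 */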
	case VM_RESET_CPU: {
		struct vm_vcpu_reset vvr;

		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
			error = EFAULT;
			break;
		}
		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
			error = EINVAL;
			break;
		}

		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
		break;
	}
	case VM_GET_RUN_STATE: {
		struct vm_run_state vrs;

		bzero(&vrs, sizeof (vrs));
		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
		    &vrs.sipi_vector);
		if (error == 0) {
			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
				error = EFAULT;
				break;
			}
		}
		break;
	}
	case VM_SET_RUN_STATE: {
		struct vm_run_state vrs;

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
		    vrs.sipi_vector);
		break;
	}
	case VM_GET_FPU: {
		struct vm_fpu_state req;
		const size_t max_len = (PAGESIZE * 2);
		void *kbuf;

		if (ddi_copyin(datap, &req, sizeof (req), md)) {
			error = EFAULT;
			break;
		}
		if (req.len > max_len || req.len == 0) {
			error = EINVAL;
			break;
		}
		kbuf = kmem_zalloc(req.len, KM_SLEEP);
		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
		if (error == 0) {
			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
				error = EFAULT;
			}
		}
		kmem_free(kbuf, req.len);
		break;
	}
	case VM_SET_FPU: {
		struct vm_fpu_state req;
		const size_t max_len = (PAGESIZE * 2);
		void *kbuf;

		if (ddi_copyin(datap, &req, sizeof (req), md)) {
			error = EFAULT;
			break;
		}
		if (req.len > max_len || req.len == 0) {
			error = EINVAL;
			break;
		}
		kbuf = kmem_alloc(req.len, KM_SLEEP);
		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
			error = EFAULT;
		} else {
			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
		}
		kmem_free(kbuf, req.len);
		break;
	}
	case VM_GET_CPUID: {
		struct vm_vcpu_cpuid_config cfg;
		struct vcpu_cpuid_entry *entries = NULL;

		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
			error = EFAULT;
			break;
		}
		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
			error = EINVAL;
			break;
		}

		const size_t entries_size =
		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
		if (entries_size != 0) {
			entries = kmem_zalloc(entries_size, KM_SLEEP);
		}

		vcpu_cpuid_config_t vm_cfg = {
			.vcc_nent = cfg.vvcc_nent,
			.vcc_entries = entries,
		};
		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);

		/*
		 * Only attempt to copy out the resultant entries if we were
		 * able to query them from the instance.  The flags and number
		 * of entries are emitted regardless.
1272 */ 1273 cfg.vvcc_flags = vm_cfg.vcc_flags; 1274 cfg.vvcc_nent = vm_cfg.vcc_nent; 1275 if (entries != NULL) { 1276 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1277 entries_size, md) != 0) { 1278 error = EFAULT; 1279 } 1280 1281 kmem_free(entries, entries_size); 1282 } 1283 1284 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1285 error = EFAULT; 1286 } 1287 break; 1288 } 1289 case VM_SET_CPUID: { 1290 struct vm_vcpu_cpuid_config cfg; 1291 struct vcpu_cpuid_entry *entries = NULL; 1292 size_t entries_size = 0; 1293 1294 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1295 error = EFAULT; 1296 break; 1297 } 1298 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1299 error = EFBIG; 1300 break; 1301 } 1302 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1303 /* 1304 * If we are being instructed to use "legacy" handling, 1305 * then no entries should be provided, since the static 1306 * in-kernel masking will be used. 1307 */ 1308 if (cfg.vvcc_nent != 0) { 1309 error = EINVAL; 1310 break; 1311 } 1312 } else if (cfg.vvcc_nent != 0) { 1313 entries_size = 1314 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1315 entries = kmem_alloc(entries_size, KM_SLEEP); 1316 1317 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1318 md) != 0) { 1319 error = EFAULT; 1320 kmem_free(entries, entries_size); 1321 break; 1322 } 1323 } 1324 1325 vcpu_cpuid_config_t vm_cfg = { 1326 .vcc_flags = cfg.vvcc_flags, 1327 .vcc_nent = cfg.vvcc_nent, 1328 .vcc_entries = entries, 1329 }; 1330 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1331 1332 if (entries != NULL) { 1333 kmem_free(entries, entries_size); 1334 } 1335 break; 1336 } 1337 case VM_LEGACY_CPUID: { 1338 struct vm_legacy_cpuid vlc; 1339 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1340 error = EFAULT; 1341 break; 1342 } 1343 vlc.vlc_vcpuid = vcpu; 1344 1345 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1346 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1347 1348 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1349 error = EFAULT; 1350 break; 1351 } 1352 break; 1353 } 1354 1355 case VM_SET_KERNEMU_DEV: 1356 case VM_GET_KERNEMU_DEV: { 1357 struct vm_readwrite_kernemu_device kemu; 1358 size_t size = 0; 1359 1360 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1361 error = EFAULT; 1362 break; 1363 } 1364 1365 if (kemu.access_width > 3) { 1366 error = EINVAL; 1367 break; 1368 } 1369 size = (1 << kemu.access_width); 1370 ASSERT(size >= 1 && size <= 8); 1371 1372 if (cmd == VM_SET_KERNEMU_DEV) { 1373 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1374 kemu.gpa, kemu.value, size); 1375 } else { 1376 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1377 kemu.gpa, &kemu.value, size); 1378 } 1379 1380 if (error == 0) { 1381 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1382 error = EFAULT; 1383 break; 1384 } 1385 } 1386 break; 1387 } 1388 1389 case VM_GET_CAPABILITY: { 1390 struct vm_capability vmcap; 1391 1392 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1393 error = EFAULT; 1394 break; 1395 } 1396 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1397 &vmcap.capval); 1398 if (error == 0 && 1399 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1400 error = EFAULT; 1401 break; 1402 } 1403 break; 1404 } 1405 case VM_SET_CAPABILITY: { 1406 struct vm_capability vmcap; 1407 1408 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1409 error = EFAULT; 1410 break; 1411 } 1412 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1413 vmcap.capval); 1414 break; 1415 } 1416 case VM_SET_X2APIC_STATE: { 
1417 struct vm_x2apic x2apic; 1418 1419 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1420 error = EFAULT; 1421 break; 1422 } 1423 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1424 break; 1425 } 1426 case VM_GET_X2APIC_STATE: { 1427 struct vm_x2apic x2apic; 1428 1429 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1430 error = EFAULT; 1431 break; 1432 } 1433 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1434 &x2apic.state); 1435 if (error == 0 && 1436 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1437 error = EFAULT; 1438 break; 1439 } 1440 break; 1441 } 1442 case VM_GET_GPA_PMAP: { 1443 /* 1444 * Until there is a necessity to leak EPT/RVI PTE values to 1445 * userspace, this will remain unimplemented 1446 */ 1447 error = EINVAL; 1448 break; 1449 } 1450 case VM_GET_HPET_CAPABILITIES: { 1451 struct vm_hpet_cap hpetcap; 1452 1453 error = vhpet_getcap(&hpetcap); 1454 if (error == 0 && 1455 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1456 error = EFAULT; 1457 break; 1458 } 1459 break; 1460 } 1461 case VM_GLA2GPA: { 1462 struct vm_gla2gpa gg; 1463 1464 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1465 error = EFAULT; 1466 break; 1467 } 1468 gg.vcpuid = vcpu; 1469 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1470 gg.prot, &gg.gpa, &gg.fault); 1471 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1472 error = EFAULT; 1473 break; 1474 } 1475 break; 1476 } 1477 case VM_GLA2GPA_NOFAULT: { 1478 struct vm_gla2gpa gg; 1479 1480 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1481 error = EFAULT; 1482 break; 1483 } 1484 gg.vcpuid = vcpu; 1485 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1486 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1487 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1488 error = EFAULT; 1489 break; 1490 } 1491 break; 1492 } 1493 1494 case VM_ACTIVATE_CPU: 1495 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1496 break; 1497 1498 case VM_SUSPEND_CPU: 1499 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1500 error = EFAULT; 1501 } else { 1502 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1503 } 1504 break; 1505 1506 case VM_RESUME_CPU: 1507 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1508 error = EFAULT; 1509 } else { 1510 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1511 } 1512 break; 1513 1514 case VM_GET_CPUS: { 1515 struct vm_cpuset vm_cpuset; 1516 cpuset_t tempset; 1517 void *srcp = &tempset; 1518 int size; 1519 1520 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1521 error = EFAULT; 1522 break; 1523 } 1524 1525 /* Be more generous about sizing since our cpuset_t is large. */ 1526 size = vm_cpuset.cpusetsize; 1527 if (size <= 0 || size > sizeof (cpuset_t)) { 1528 error = ERANGE; 1529 } 1530 /* 1531 * If they want a ulong_t or less, make sure they receive the 1532 * low bits with all the useful information. 
1533 */ 1534 if (size <= sizeof (tempset.cpub[0])) { 1535 srcp = &tempset.cpub[0]; 1536 } 1537 1538 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1539 tempset = vm_active_cpus(sc->vmm_vm); 1540 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1541 tempset = vm_suspended_cpus(sc->vmm_vm); 1542 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1543 tempset = vm_debug_cpus(sc->vmm_vm); 1544 } else { 1545 error = EINVAL; 1546 } 1547 1548 ASSERT(size > 0 && size <= sizeof (tempset)); 1549 if (error == 0 && 1550 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1551 error = EFAULT; 1552 break; 1553 } 1554 break; 1555 } 1556 case VM_SET_INTINFO: { 1557 struct vm_intinfo vmii; 1558 1559 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1560 error = EFAULT; 1561 break; 1562 } 1563 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1564 break; 1565 } 1566 case VM_GET_INTINFO: { 1567 struct vm_intinfo vmii; 1568 1569 vmii.vcpuid = vcpu; 1570 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1571 &vmii.info2); 1572 if (error == 0 && 1573 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1574 error = EFAULT; 1575 break; 1576 } 1577 break; 1578 } 1579 case VM_RTC_WRITE: { 1580 struct vm_rtc_data rtcdata; 1581 1582 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1583 error = EFAULT; 1584 break; 1585 } 1586 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1587 rtcdata.value); 1588 break; 1589 } 1590 case VM_RTC_READ: { 1591 struct vm_rtc_data rtcdata; 1592 1593 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1594 error = EFAULT; 1595 break; 1596 } 1597 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1598 &rtcdata.value); 1599 if (error == 0 && 1600 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1601 error = EFAULT; 1602 break; 1603 } 1604 break; 1605 } 1606 case VM_RTC_SETTIME: { 1607 struct vm_rtc_time rtctime; 1608 1609 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1610 error = EFAULT; 1611 break; 1612 } 1613 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1614 break; 1615 } 1616 case VM_RTC_GETTIME: { 1617 struct vm_rtc_time rtctime; 1618 1619 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1620 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1621 error = EFAULT; 1622 break; 1623 } 1624 break; 1625 } 1626 1627 case VM_PMTMR_LOCATE: { 1628 uint16_t port = arg; 1629 error = vpmtmr_set_location(sc->vmm_vm, port); 1630 break; 1631 } 1632 1633 case VM_RESTART_INSTRUCTION: 1634 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1635 break; 1636 1637 case VM_SET_TOPOLOGY: { 1638 struct vm_cpu_topology topo; 1639 1640 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1641 error = EFAULT; 1642 break; 1643 } 1644 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1645 topo.threads, topo.maxcpus); 1646 break; 1647 } 1648 case VM_GET_TOPOLOGY: { 1649 struct vm_cpu_topology topo; 1650 1651 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1652 &topo.threads, &topo.maxcpus); 1653 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1654 error = EFAULT; 1655 break; 1656 } 1657 break; 1658 } 1659 case VM_DEVMEM_GETOFFSET: { 1660 struct vm_devmem_offset vdo; 1661 vmm_devmem_entry_t *de; 1662 1663 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1664 error = EFAULT; 1665 break; 1666 } 1667 1668 de = vmmdev_devmem_find(sc, vdo.segid); 1669 if (de != NULL) { 1670 vdo.offset = de->vde_off; 1671 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1672 error = EFAULT; 1673 } 1674 } else { 1675 error = ENOENT; 1676 } 1677 break; 1678 } 1679 case 
VM_TRACK_DIRTY_PAGES: { 1680 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1681 struct vmm_dirty_tracker tracker; 1682 uint8_t *bitmap; 1683 size_t len; 1684 1685 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1686 error = EFAULT; 1687 break; 1688 } 1689 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1690 error = EINVAL; 1691 break; 1692 } 1693 if (tracker.vdt_len == 0) { 1694 break; 1695 } 1696 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1697 error = EINVAL; 1698 break; 1699 } 1700 if (tracker.vdt_len > max_track_region_len) { 1701 error = EINVAL; 1702 break; 1703 } 1704 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1705 bitmap = kmem_zalloc(len, KM_SLEEP); 1706 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1707 tracker.vdt_len, bitmap); 1708 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1709 error = EFAULT; 1710 } 1711 kmem_free(bitmap, len); 1712 1713 break; 1714 } 1715 case VM_WRLOCK_CYCLE: { 1716 /* 1717 * Present a test mechanism to acquire/release the write lock 1718 * on the VM without any other effects. 1719 */ 1720 break; 1721 } 1722 case VM_DATA_READ: { 1723 struct vm_data_xfer vdx; 1724 1725 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1726 error = EFAULT; 1727 break; 1728 } 1729 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1730 error = EINVAL; 1731 break; 1732 } 1733 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1734 error = EFBIG; 1735 break; 1736 } 1737 1738 const size_t len = vdx.vdx_len; 1739 void *buf = NULL; 1740 if (len != 0) { 1741 buf = kmem_alloc(len, KM_SLEEP); 1742 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1743 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1744 kmem_free(buf, len); 1745 error = EFAULT; 1746 break; 1747 } else { 1748 bzero(buf, len); 1749 } 1750 } 1751 1752 vdx.vdx_result_len = 0; 1753 vmm_data_req_t req = { 1754 .vdr_class = vdx.vdx_class, 1755 .vdr_version = vdx.vdx_version, 1756 .vdr_flags = vdx.vdx_flags, 1757 .vdr_len = len, 1758 .vdr_data = buf, 1759 .vdr_result_len = &vdx.vdx_result_len, 1760 }; 1761 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1762 1763 if (error == 0 && buf != NULL) { 1764 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1765 error = EFAULT; 1766 } 1767 } 1768 1769 /* 1770 * Copy out the transfer request so that the value of 1771 * vdx_result_len can be made available, regardless of any 1772 * error(s) which may have occurred. 1773 */ 1774 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1775 error = (error != 0) ? 
error : EFAULT; 1776 } 1777 1778 if (buf != NULL) { 1779 kmem_free(buf, len); 1780 } 1781 break; 1782 } 1783 case VM_DATA_WRITE: { 1784 struct vm_data_xfer vdx; 1785 1786 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1787 error = EFAULT; 1788 break; 1789 } 1790 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1791 error = EINVAL; 1792 break; 1793 } 1794 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1795 error = EFBIG; 1796 break; 1797 } 1798 1799 const size_t len = vdx.vdx_len; 1800 void *buf = NULL; 1801 if (len != 0) { 1802 buf = kmem_alloc(len, KM_SLEEP); 1803 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1804 kmem_free(buf, len); 1805 error = EFAULT; 1806 break; 1807 } 1808 } 1809 1810 vdx.vdx_result_len = 0; 1811 vmm_data_req_t req = { 1812 .vdr_class = vdx.vdx_class, 1813 .vdr_version = vdx.vdx_version, 1814 .vdr_flags = vdx.vdx_flags, 1815 .vdr_len = len, 1816 .vdr_data = buf, 1817 .vdr_result_len = &vdx.vdx_result_len, 1818 }; 1819 if (vmm_allow_state_writes == 0) { 1820 /* XXX: Play it safe for now */ 1821 error = EPERM; 1822 } else { 1823 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1824 &req); 1825 } 1826 1827 if (error == 0 && buf != NULL && 1828 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1829 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1830 error = EFAULT; 1831 } 1832 } 1833 1834 /* 1835 * Copy out the transfer request so that the value of 1836 * vdx_result_len can be made available, regardless of any 1837 * error(s) which may have occurred. 1838 */ 1839 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1840 error = (error != 0) ? error : EFAULT; 1841 } 1842 1843 if (buf != NULL) { 1844 kmem_free(buf, len); 1845 } 1846 break; 1847 } 1848 1849 case VM_PAUSE: { 1850 error = vm_pause_instance(sc->vmm_vm); 1851 break; 1852 } 1853 case VM_RESUME: { 1854 error = vm_resume_instance(sc->vmm_vm); 1855 break; 1856 } 1857 1858 default: 1859 error = ENOTTY; 1860 break; 1861 } 1862 1863 /* Release exclusion resources */ 1864 switch (lock_type) { 1865 case LOCK_NONE: 1866 break; 1867 case LOCK_VCPU: 1868 vcpu_unlock_one(sc, vcpu); 1869 break; 1870 case LOCK_READ_HOLD: 1871 vmm_read_unlock(sc); 1872 break; 1873 case LOCK_WRITE_HOLD: 1874 vmm_write_unlock(sc); 1875 break; 1876 default: 1877 panic("unexpected lock type"); 1878 break; 1879 } 1880 1881 return (error); 1882 } 1883 1884 static vmm_softc_t * 1885 vmm_lookup(const char *name) 1886 { 1887 list_t *vml = &vmm_list; 1888 vmm_softc_t *sc; 1889 1890 ASSERT(MUTEX_HELD(&vmm_mtx)); 1891 1892 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1893 if (strcmp(sc->vmm_name, name) == 0) { 1894 break; 1895 } 1896 } 1897 1898 return (sc); 1899 } 1900 1901 /* 1902 * Acquire an HMA registration if not already held. 1903 */ 1904 static boolean_t 1905 vmm_hma_acquire(void) 1906 { 1907 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1908 1909 mutex_enter(&vmmdev_mtx); 1910 1911 if (vmmdev_hma_reg == NULL) { 1912 VERIFY3U(vmmdev_hma_ref, ==, 0); 1913 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1914 if (vmmdev_hma_reg == NULL) { 1915 cmn_err(CE_WARN, "%s HMA registration failed.", 1916 vmmdev_hvm_name); 1917 mutex_exit(&vmmdev_mtx); 1918 return (B_FALSE); 1919 } 1920 } 1921 1922 vmmdev_hma_ref++; 1923 1924 mutex_exit(&vmmdev_mtx); 1925 1926 return (B_TRUE); 1927 } 1928 1929 /* 1930 * Release the HMA registration if held and there are no remaining VMs. 
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
{
	vmm_softc_t *sc = NULL;
	minor_t minor;
	int error = ENOMEM;
	size_t len;
	const char *name = req->name;

	len = strnlen(name, VM_MAX_NAMELEN);
	if (len == 0) {
		return (EINVAL);
	}
	if (len >= VM_MAX_NAMELEN) {
		return (ENAMETOOLONG);
	}
	if (strchr(name, '/') != NULL) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone. */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
		goto fail;
	}

	error = vm_create(req->flags, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);
		vmm_kstat_init(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	vmm_kstat_fini(sc);
	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel.  For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
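 *
 * As a rough sketch of the hold/lease lifecycle (the callback name and its
 * argument here are illustrative placeholders, not part of the interface), a
 * consumer with an open file_t for a vmm instance proceeds along these lines:
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *		lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *		// ... use vmm_drv_page_hold()/vmm_drv_msi() under the lease,
 *		// re-acquiring it if my_expire_cb() reports expiration ...
 *		vmm_drv_lease_break(hold, lease);
 *		vmm_drv_rele(hold);
 *	}
 *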
2059 * 2060 * This includes: 2061 * - Translating guest-physical addresses to host-virtual pointers 2062 * - Injecting MSIs 2063 * - Hooking IO port addresses 2064 * 2065 * The vmm_drv interface exists to provide that functionality to its consumers. 2066 * (At this time, 'viona' is the only user) 2067 */ 2068 int 2069 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2070 { 2071 vnode_t *vp = fp->f_vnode; 2072 const dev_t dev = vp->v_rdev; 2073 vmm_softc_t *sc; 2074 vmm_hold_t *hold; 2075 int err = 0; 2076 2077 if (vp->v_type != VCHR) { 2078 return (ENXIO); 2079 } 2080 const major_t major = getmajor(dev); 2081 const minor_t minor = getminor(dev); 2082 2083 mutex_enter(&vmmdev_mtx); 2084 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2085 mutex_exit(&vmmdev_mtx); 2086 return (ENOENT); 2087 } 2088 mutex_enter(&vmm_mtx); 2089 mutex_exit(&vmmdev_mtx); 2090 2091 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2092 err = ENOENT; 2093 goto out; 2094 } 2095 /* XXXJOY: check cred permissions against instance */ 2096 2097 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2098 err = EBUSY; 2099 goto out; 2100 } 2101 2102 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2103 hold->vmh_sc = sc; 2104 hold->vmh_release_req = B_FALSE; 2105 2106 list_insert_tail(&sc->vmm_holds, hold); 2107 sc->vmm_flags |= VMM_HELD; 2108 *holdp = hold; 2109 2110 out: 2111 mutex_exit(&vmm_mtx); 2112 return (err); 2113 } 2114 2115 void 2116 vmm_drv_rele(vmm_hold_t *hold) 2117 { 2118 vmm_softc_t *sc; 2119 bool hma_release = false; 2120 2121 ASSERT(hold != NULL); 2122 ASSERT(hold->vmh_sc != NULL); 2123 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2124 2125 mutex_enter(&vmm_mtx); 2126 sc = hold->vmh_sc; 2127 list_remove(&sc->vmm_holds, hold); 2128 kmem_free(hold, sizeof (*hold)); 2129 2130 if (list_is_empty(&sc->vmm_holds)) { 2131 sc->vmm_flags &= ~VMM_HELD; 2132 2133 /* 2134 * Since outstanding holds would prevent instance destruction 2135 * from completing, attempt to finish it now if it was already 2136 * set in motion. 
2137 */ 2138 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2139 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2140 &hma_release)); 2141 } 2142 } 2143 mutex_exit(&vmm_mtx); 2144 2145 if (hma_release) { 2146 vmm_hma_release(); 2147 } 2148 } 2149 2150 boolean_t 2151 vmm_drv_release_reqd(vmm_hold_t *hold) 2152 { 2153 ASSERT(hold != NULL); 2154 2155 return (hold->vmh_release_req); 2156 } 2157 2158 vmm_lease_t * 2159 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2160 { 2161 vmm_softc_t *sc = hold->vmh_sc; 2162 vmm_lease_t *lease; 2163 2164 ASSERT3P(expiref, !=, NULL); 2165 2166 if (hold->vmh_release_req) { 2167 return (NULL); 2168 } 2169 2170 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2171 list_link_init(&lease->vml_node); 2172 lease->vml_expire_func = expiref; 2173 lease->vml_expire_arg = arg; 2174 lease->vml_expired = B_FALSE; 2175 lease->vml_break_deferred = B_FALSE; 2176 lease->vml_hold = hold; 2177 /* cache the VM pointer for one less pointer chase */ 2178 lease->vml_vm = sc->vmm_vm; 2179 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2180 2181 mutex_enter(&sc->vmm_lease_lock); 2182 while (sc->vmm_lease_blocker != 0) { 2183 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2184 } 2185 list_insert_tail(&sc->vmm_lease_list, lease); 2186 vmm_read_lock(sc); 2187 mutex_exit(&sc->vmm_lease_lock); 2188 2189 return (lease); 2190 } 2191 2192 static void 2193 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2194 { 2195 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2196 2197 list_remove(&sc->vmm_lease_list, lease); 2198 vmm_read_unlock(sc); 2199 vmc_destroy(lease->vml_vmclient); 2200 kmem_free(lease, sizeof (*lease)); 2201 } 2202 2203 static void 2204 vmm_lease_block(vmm_softc_t *sc) 2205 { 2206 mutex_enter(&sc->vmm_lease_lock); 2207 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2208 sc->vmm_lease_blocker++; 2209 if (sc->vmm_lease_blocker == 1) { 2210 list_t *list = &sc->vmm_lease_list; 2211 vmm_lease_t *lease = list_head(list); 2212 2213 while (lease != NULL) { 2214 void *arg = lease->vml_expire_arg; 2215 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2216 boolean_t sync_break = B_FALSE; 2217 2218 /* 2219 * Since the lease expiration notification may 2220 * need to take locks which would deadlock with 2221 * vmm_lease_lock, drop it across the call. 2222 * 2223 * We are the only one allowed to manipulate 2224 * vmm_lease_list right now, so it is safe to 2225 * continue iterating through it after 2226 * reacquiring the lock. 2227 */ 2228 lease->vml_expired = B_TRUE; 2229 mutex_exit(&sc->vmm_lease_lock); 2230 sync_break = expiref(arg); 2231 mutex_enter(&sc->vmm_lease_lock); 2232 2233 if (sync_break) { 2234 vmm_lease_t *next; 2235 2236 /* 2237 * These leases which are synchronously broken 2238 * result in vmm_read_unlock() calls from a 2239 * different thread than the corresponding 2240 * vmm_read_lock(). This is acceptable, given 2241 * that the rwlock underpinning the whole 2242 * mechanism tolerates the behavior. This 2243 * flexibility is _only_ afforded to VM read 2244 * lock (RW_READER) holders. 2245 */ 2246 next = list_next(list, lease); 2247 vmm_lease_break_locked(sc, lease); 2248 lease = next; 2249 } else { 2250 lease = list_next(list, lease); 2251 } 2252 } 2253 2254 /* Process leases which were not broken synchronously. */ 2255 while (!list_is_empty(list)) { 2256 /* 2257 * Although the nested loops are quadratic, the number 2258 * of leases is small. 
2259 */ 2260 lease = list_head(list); 2261 while (lease != NULL) { 2262 vmm_lease_t *next = list_next(list, lease); 2263 if (lease->vml_break_deferred) { 2264 vmm_lease_break_locked(sc, lease); 2265 } 2266 lease = next; 2267 } 2268 if (list_is_empty(list)) { 2269 break; 2270 } 2271 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2272 } 2273 /* Wake anyone else waiting for the lease list to be empty */ 2274 cv_broadcast(&sc->vmm_lease_cv); 2275 } else { 2276 list_t *list = &sc->vmm_lease_list; 2277 2278 /* 2279 * Some other thread beat us to the duty of lease cleanup. 2280 * Wait until that is complete. 2281 */ 2282 while (!list_is_empty(list)) { 2283 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2284 } 2285 } 2286 mutex_exit(&sc->vmm_lease_lock); 2287 } 2288 2289 static void 2290 vmm_lease_unblock(vmm_softc_t *sc) 2291 { 2292 mutex_enter(&sc->vmm_lease_lock); 2293 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2294 sc->vmm_lease_blocker--; 2295 if (sc->vmm_lease_blocker == 0) { 2296 cv_broadcast(&sc->vmm_lease_cv); 2297 } 2298 mutex_exit(&sc->vmm_lease_lock); 2299 } 2300 2301 void 2302 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2303 { 2304 vmm_softc_t *sc = hold->vmh_sc; 2305 2306 VERIFY3P(hold, ==, lease->vml_hold); 2307 VERIFY(!lease->vml_break_deferred); 2308 2309 mutex_enter(&sc->vmm_lease_lock); 2310 if (sc->vmm_lease_blocker == 0) { 2311 vmm_lease_break_locked(sc, lease); 2312 } else { 2313 /* 2314 * Defer the lease-breaking to whichever thread is currently 2315 * cleaning up all leases as part of a vmm_lease_block() call. 2316 */ 2317 lease->vml_break_deferred = B_TRUE; 2318 cv_broadcast(&sc->vmm_lease_cv); 2319 } 2320 mutex_exit(&sc->vmm_lease_lock); 2321 } 2322 2323 boolean_t 2324 vmm_drv_lease_expired(vmm_lease_t *lease) 2325 { 2326 return (lease->vml_expired); 2327 } 2328 2329 vmm_page_t * 2330 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2331 { 2332 ASSERT(lease != NULL); 2333 ASSERT0(gpa & PAGEOFFSET); 2334 2335 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2336 } 2337 2338 void 2339 vmm_drv_page_release(vmm_page_t *vmmp) 2340 { 2341 (void) vmp_release((vm_page_t *)vmmp); 2342 } 2343 2344 void 2345 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2346 { 2347 (void) vmp_release_chain((vm_page_t *)vmmp); 2348 } 2349 2350 const void * 2351 vmm_drv_page_readable(const vmm_page_t *vmmp) 2352 { 2353 return (vmp_get_readable((const vm_page_t *)vmmp)); 2354 } 2355 2356 void * 2357 vmm_drv_page_writable(const vmm_page_t *vmmp) 2358 { 2359 return (vmp_get_writable((const vm_page_t *)vmmp)); 2360 } 2361 2362 void 2363 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2364 { 2365 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2366 } 2367 2368 vmm_page_t * 2369 vmm_drv_page_next(const vmm_page_t *vmmp) 2370 { 2371 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2372 } 2373 2374 int 2375 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2376 { 2377 ASSERT(lease != NULL); 2378 2379 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2380 } 2381 2382 int 2383 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2384 void *arg, void **cookie) 2385 { 2386 vmm_softc_t *sc; 2387 int err; 2388 2389 ASSERT(hold != NULL); 2390 ASSERT(cookie != NULL); 2391 2392 sc = hold->vmh_sc; 2393 mutex_enter(&vmm_mtx); 2394 /* Confirm that hook installation is not blocked */ 2395 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2396 mutex_exit(&vmm_mtx); 2397 return (EBUSY); 2398 } 2399 /* 2400 * Optimistically record 
an installed hook which will prevent a block
2401 * from being asserted while the mutex is dropped.
2402 */
2403 hold->vmh_ioport_hook_cnt++;
2404 mutex_exit(&vmm_mtx);
2405 
2406 vmm_write_lock(sc);
2407 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2408 arg, cookie);
2409 vmm_write_unlock(sc);
2410 
2411 if (err != 0) {
2412 mutex_enter(&vmm_mtx);
2413 /* Walk back optimism about the hook installation */
2414 hold->vmh_ioport_hook_cnt--;
2415 mutex_exit(&vmm_mtx);
2416 }
2417 return (err);
2418 }
2419 
2420 void
2421 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2422 {
2423 vmm_softc_t *sc;
2424 
2425 ASSERT(hold != NULL);
2426 ASSERT(cookie != NULL);
2427 ASSERT(hold->vmh_ioport_hook_cnt != 0);
2428 
2429 sc = hold->vmh_sc;
2430 vmm_write_lock(sc);
2431 vm_ioport_unhook(sc->vmm_vm, cookie);
2432 vmm_write_unlock(sc);
2433 
2434 mutex_enter(&vmm_mtx);
2435 hold->vmh_ioport_hook_cnt--;
2436 mutex_exit(&vmm_mtx);
2437 }
2438 
2439 static void
2440 vmm_drv_purge(vmm_softc_t *sc)
2441 {
2442 ASSERT(MUTEX_HELD(&vmm_mtx));
2443 
2444 if ((sc->vmm_flags & VMM_HELD) != 0) {
2445 vmm_hold_t *hold;
2446 
2447 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2448 hold = list_next(&sc->vmm_holds, hold)) {
2449 hold->vmh_release_req = B_TRUE;
2450 }
2451 
2452 /*
2453 * Require that all leases on the instance be broken, now that
2454 * all associated holds have been marked as needing release.
2455 *
2456 * Dropping vmm_mtx is not strictly necessary, but if any of the
2457 * lessees are slow to respond, it would be nice to leave it
2458 * available for other parties.
2459 */
2460 mutex_exit(&vmm_mtx);
2461 vmm_lease_block(sc);
2462 vmm_lease_unblock(sc);
2463 mutex_enter(&vmm_mtx);
2464 }
2465 }
2466 
2467 static int
2468 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2469 {
2470 int err = 0;
2471 
2472 mutex_enter(&vmm_mtx);
2473 if (!enable_block) {
2474 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2475 
2476 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2477 goto done;
2478 }
2479 
2480 /* If any holds have hooks installed, the block is a failure */
2481 if (!list_is_empty(&sc->vmm_holds)) {
2482 vmm_hold_t *hold;
2483 
2484 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2485 hold = list_next(&sc->vmm_holds, hold)) {
2486 if (hold->vmh_ioport_hook_cnt != 0) {
2487 err = EBUSY;
2488 goto done;
2489 }
2490 }
2491 }
2492 sc->vmm_flags |= VMM_BLOCK_HOOK;
2493 
2494 done:
2495 mutex_exit(&vmm_mtx);
2496 return (err);
2497 }
2498 
2499 
2500 static void
2501 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2502 {
2503 ASSERT(MUTEX_HELD(&vmm_mtx));
2504 ASSERT0(sc->vmm_flags & VMM_DESTROY);
2505 
2506 sc->vmm_flags |= VMM_DESTROY;
2507 
2508 /*
2509 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2510 * of guest context, being unable to return now that the instance is
2511 * marked for destruction.
2512 */
2513 const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2514 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2515 vcpu_lock_one(sc, vcpu);
2516 vcpu_unlock_one(sc, vcpu);
2517 }
2518 
2519 vmmdev_devmem_purge(sc);
2520 if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2521 /*
2522 * The ZSD should be cleaned up now, unless destruction of the
2523 * instance was initiated by destruction of the containing zone,
2524 * in which case the ZSD has already been removed.
2525 */ 2526 vmm_zsd_rem_vm(sc); 2527 } 2528 zone_rele(sc->vmm_zone); 2529 2530 vmm_drv_purge(sc); 2531 } 2532 2533 static bool 2534 vmm_destroy_ready(vmm_softc_t *sc) 2535 { 2536 ASSERT(MUTEX_HELD(&vmm_mtx)); 2537 2538 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) { 2539 VERIFY(list_is_empty(&sc->vmm_holds)); 2540 return (true); 2541 } 2542 2543 return (false); 2544 } 2545 2546 static void 2547 vmm_destroy_finish(vmm_softc_t *sc) 2548 { 2549 ASSERT(MUTEX_HELD(&vmm_mtx)); 2550 ASSERT(vmm_destroy_ready(sc)); 2551 2552 list_remove(&vmm_list, sc); 2553 vmm_kstat_fini(sc); 2554 vm_destroy(sc->vmm_vm); 2555 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2556 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE); 2557 2558 const minor_t minor = sc->vmm_minor; 2559 ddi_soft_state_free(vmm_statep, minor); 2560 id_free(vmm_minors, minor); 2561 } 2562 2563 /* 2564 * Initiate or attempt to finish destruction of a VMM instance. 2565 * 2566 * This is called from several contexts: 2567 * - An explicit destroy ioctl is made 2568 * - A vmm_drv consumer releases its hold (being the last on the instance) 2569 * - The vmm device is closed, and auto-destruct is enabled 2570 */ 2571 static int 2572 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts, 2573 bool *hma_release) 2574 { 2575 ASSERT(MUTEX_HELD(&vmm_mtx)); 2576 2577 *hma_release = false; 2578 2579 /* 2580 * When instance destruction begins, it is so marked such that any 2581 * further requests to operate the instance will fail. 2582 */ 2583 if ((sc->vmm_flags & VMM_DESTROY) == 0) { 2584 vmm_destroy_begin(sc, opts); 2585 } 2586 2587 if (vmm_destroy_ready(sc)) { 2588 2589 /* 2590 * Notify anyone waiting for the destruction to finish. They 2591 * must be clear before we can safely tear down the softc. 2592 */ 2593 if (sc->vmm_destroy_waiters != 0) { 2594 cv_broadcast(&sc->vmm_cv); 2595 while (sc->vmm_destroy_waiters != 0) { 2596 cv_wait(&sc->vmm_cv, &vmm_mtx); 2597 } 2598 } 2599 2600 /* 2601 * Finish destruction of instance. After this point, the softc 2602 * is freed and cannot be accessed again. 2603 * 2604 * With destruction complete, the HMA hold can be released 2605 */ 2606 vmm_destroy_finish(sc); 2607 *hma_release = true; 2608 return (0); 2609 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) { 2610 int err = 0; 2611 2612 sc->vmm_destroy_waiters++; 2613 while (!vmm_destroy_ready(sc) && err == 0) { 2614 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2615 err = EINTR; 2616 } 2617 } 2618 sc->vmm_destroy_waiters--; 2619 2620 if (sc->vmm_destroy_waiters == 0) { 2621 /* 2622 * If we were the last waiter, it could be that VM 2623 * destruction is waiting on _us_ to proceed with the 2624 * final clean-up. 2625 */ 2626 cv_signal(&sc->vmm_cv); 2627 } 2628 return (err); 2629 } else { 2630 /* 2631 * Since the instance is not ready for destruction, and the 2632 * caller did not ask to wait, consider it a success for now. 
2633 */ 2634 return (0); 2635 } 2636 } 2637 2638 void 2639 vmm_zone_vm_destroy(vmm_softc_t *sc) 2640 { 2641 bool hma_release = false; 2642 int err; 2643 2644 mutex_enter(&vmm_mtx); 2645 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2646 mutex_exit(&vmm_mtx); 2647 2648 VERIFY0(err); 2649 2650 if (hma_release) { 2651 vmm_hma_release(); 2652 } 2653 } 2654 2655 static int 2656 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2657 { 2658 vmm_softc_t *sc; 2659 bool hma_release = false; 2660 int err; 2661 2662 if (crgetuid(cr) != 0) { 2663 return (EPERM); 2664 } 2665 2666 mutex_enter(&vmm_mtx); 2667 sc = vmm_lookup(req->name); 2668 if (sc == NULL) { 2669 mutex_exit(&vmm_mtx); 2670 return (ENOENT); 2671 } 2672 /* 2673 * We don't check this in vmm_lookup() since that function is also used 2674 * for validation during create and currently vmm names must be unique. 2675 */ 2676 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2677 mutex_exit(&vmm_mtx); 2678 return (EPERM); 2679 } 2680 2681 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2682 mutex_exit(&vmm_mtx); 2683 2684 if (hma_release) { 2685 vmm_hma_release(); 2686 } 2687 2688 return (err); 2689 } 2690 2691 #define VCPU_NAME_BUFLEN 32 2692 2693 static int 2694 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2695 { 2696 zoneid_t zid = crgetzoneid(cr); 2697 int instance = minor; 2698 kstat_t *ksp; 2699 2700 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2701 2702 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2703 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2704 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2705 2706 if (ksp == NULL) { 2707 return (-1); 2708 } 2709 sc->vmm_kstat_vm = ksp; 2710 2711 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2712 char namebuf[VCPU_NAME_BUFLEN]; 2713 2714 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2715 2716 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2717 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2718 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2719 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2720 0, zid); 2721 if (ksp == NULL) { 2722 goto fail; 2723 } 2724 2725 sc->vmm_kstat_vcpu[i] = ksp; 2726 } 2727 2728 /* 2729 * If this instance is associated with a non-global zone, make its 2730 * kstats visible from the GZ. 
2731 */ 2732 if (zid != GLOBAL_ZONEID) { 2733 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2734 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2735 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2736 } 2737 } 2738 2739 return (0); 2740 2741 fail: 2742 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2743 if (sc->vmm_kstat_vcpu[i] != NULL) { 2744 kstat_delete(sc->vmm_kstat_vcpu[i]); 2745 sc->vmm_kstat_vcpu[i] = NULL; 2746 } else { 2747 break; 2748 } 2749 } 2750 kstat_delete(sc->vmm_kstat_vm); 2751 sc->vmm_kstat_vm = NULL; 2752 return (-1); 2753 } 2754 2755 static void 2756 vmm_kstat_init(vmm_softc_t *sc) 2757 { 2758 kstat_t *ksp; 2759 2760 ASSERT3P(sc->vmm_vm, !=, NULL); 2761 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2762 2763 ksp = sc->vmm_kstat_vm; 2764 vmm_kstats_t *vk = ksp->ks_data; 2765 ksp->ks_private = sc->vmm_vm; 2766 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2767 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2768 2769 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2770 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2771 2772 ksp = sc->vmm_kstat_vcpu[i]; 2773 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2774 2775 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2776 vvk->vvk_vcpu.value.ui32 = i; 2777 kstat_named_init(&vvk->vvk_time_init, "time_init", 2778 KSTAT_DATA_UINT64); 2779 kstat_named_init(&vvk->vvk_time_run, "time_run", 2780 KSTAT_DATA_UINT64); 2781 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2782 KSTAT_DATA_UINT64); 2783 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2784 KSTAT_DATA_UINT64); 2785 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2786 KSTAT_DATA_UINT64); 2787 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2788 KSTAT_DATA_UINT64); 2789 ksp->ks_private = sc->vmm_vm; 2790 ksp->ks_update = vmm_kstat_update_vcpu; 2791 } 2792 2793 kstat_install(sc->vmm_kstat_vm); 2794 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2795 kstat_install(sc->vmm_kstat_vcpu[i]); 2796 } 2797 } 2798 2799 static void 2800 vmm_kstat_fini(vmm_softc_t *sc) 2801 { 2802 ASSERT(sc->vmm_kstat_vm != NULL); 2803 2804 kstat_delete(sc->vmm_kstat_vm); 2805 sc->vmm_kstat_vm = NULL; 2806 2807 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2808 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2809 2810 kstat_delete(sc->vmm_kstat_vcpu[i]); 2811 sc->vmm_kstat_vcpu[i] = NULL; 2812 } 2813 } 2814 2815 static int 2816 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2817 { 2818 minor_t minor; 2819 vmm_softc_t *sc; 2820 2821 /* 2822 * Forbid running bhyve in a 32-bit process until it has been tested and 2823 * verified to be safe. 2824 */ 2825 if (curproc->p_model != DATAMODEL_LP64) { 2826 return (EFBIG); 2827 } 2828 2829 minor = getminor(*devp); 2830 if (minor == VMM_CTL_MINOR) { 2831 /* 2832 * Master control device must be opened exclusively. 
2833 */
2834 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2835 return (EINVAL);
2836 }
2837 
2838 return (0);
2839 }
2840 
2841 mutex_enter(&vmm_mtx);
2842 sc = ddi_get_soft_state(vmm_statep, minor);
2843 if (sc == NULL) {
2844 mutex_exit(&vmm_mtx);
2845 return (ENXIO);
2846 }
2847 
2848 sc->vmm_flags |= VMM_IS_OPEN;
2849 mutex_exit(&vmm_mtx);
2850 
2851 return (0);
2852 }
2853 
2854 static int
2855 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2856 {
2857 const minor_t minor = getminor(dev);
2858 vmm_softc_t *sc;
2859 bool hma_release = false;
2860 
2861 if (minor == VMM_CTL_MINOR) {
2862 return (0);
2863 }
2864 
2865 mutex_enter(&vmm_mtx);
2866 sc = ddi_get_soft_state(vmm_statep, minor);
2867 if (sc == NULL) {
2868 mutex_exit(&vmm_mtx);
2869 return (ENXIO);
2870 }
2871 
2872 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2873 sc->vmm_flags &= ~VMM_IS_OPEN;
2874 
2875 /*
2876 * If the instance was marked for auto-destruction, begin that now.
2877 * Instance destruction may have been initiated already, so try to make
2878 * progress in that case, since closure of the device is one of its requirements.
2879 */
2880 if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2881 (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2882 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2883 }
2884 mutex_exit(&vmm_mtx);
2885 
2886 if (hma_release) {
2887 vmm_hma_release();
2888 }
2889 
2890 return (0);
2891 }
2892 
2893 static int
2894 vmm_is_supported(intptr_t arg)
2895 {
2896 int r;
2897 const char *msg;
2898 
2899 if (vmm_is_intel()) {
2900 r = vmx_x86_supported(&msg);
2901 } else if (vmm_is_svm()) {
2902 /*
2903 * HMA already ensured that the features necessary for SVM
2904 * operation were present and online during vmm_attach().
2905 */
2906 r = 0;
2907 } else {
2908 r = ENXIO;
2909 msg = "Unsupported CPU vendor";
2910 }
2911 
2912 if (r != 0 && arg != (intptr_t)NULL) {
2913 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2914 return (EFAULT);
2915 }
2916 return (r);
2917 }
2918 
2919 static int
2920 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2921 {
2922 void *argp = (void *)arg;
2923 
2924 switch (cmd) {
2925 case VMM_CREATE_VM: {
2926 struct vm_create_req req;
2927 
2928 if ((md & FWRITE) == 0) {
2929 return (EPERM);
2930 }
2931 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2932 return (EFAULT);
2933 }
2934 return (vmmdev_do_vm_create(&req, cr));
2935 }
2936 case VMM_DESTROY_VM: {
2937 struct vm_destroy_req req;
2938 
2939 if ((md & FWRITE) == 0) {
2940 return (EPERM);
2941 }
2942 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2943 return (EFAULT);
2944 }
2945 return (vmmdev_do_vm_destroy(&req, cr));
2946 }
2947 case VMM_VM_SUPPORTED:
2948 return (vmm_is_supported(arg));
2949 case VMM_CHECK_IOMMU:
2950 if (!vmm_check_iommu()) {
2951 return (ENXIO);
2952 }
2953 return (0);
2954 case VMM_RESV_QUERY:
2955 case VMM_RESV_ADD:
2956 case VMM_RESV_REMOVE:
2957 return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2958 default:
2959 break;
2960 }
2961 /* No other actions are legal on ctl device */
2962 return (ENOTTY);
2963 }
2964 
2965 static int
2966 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2967 int *rvalp)
2968 {
2969 vmm_softc_t *sc;
2970 minor_t minor;
2971 
2972 /*
2973 * Forbid running bhyve in a 32-bit process until it has been tested and
2974 * verified to be safe.
2975 */ 2976 if (curproc->p_model != DATAMODEL_LP64) { 2977 return (EFBIG); 2978 } 2979 2980 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 2981 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 2982 return (ENOTSUP); 2983 } 2984 2985 /* 2986 * Regardless of minor (vmmctl or instance), we respond to queries of 2987 * the interface version. 2988 */ 2989 if (cmd == VMM_INTERFACE_VERSION) { 2990 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 2991 return (0); 2992 } 2993 2994 minor = getminor(dev); 2995 2996 if (minor == VMM_CTL_MINOR) { 2997 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 2998 } 2999 3000 sc = ddi_get_soft_state(vmm_statep, minor); 3001 ASSERT(sc != NULL); 3002 3003 /* 3004 * Turn away any ioctls against an instance when it is being destroyed. 3005 * (Except for the ioctl inquiring about that destroy-in-progress.) 3006 */ 3007 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3008 if (cmd == VM_DESTROY_PENDING) { 3009 *rvalp = 1; 3010 return (0); 3011 } 3012 return (ENXIO); 3013 } 3014 3015 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3016 } 3017 3018 static int 3019 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3020 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3021 { 3022 vmm_softc_t *sc; 3023 const minor_t minor = getminor(dev); 3024 int err; 3025 3026 if (minor == VMM_CTL_MINOR) { 3027 return (ENODEV); 3028 } 3029 if (off < 0 || (off + len) <= 0) { 3030 return (EINVAL); 3031 } 3032 if ((prot & PROT_USER) == 0) { 3033 return (EACCES); 3034 } 3035 3036 sc = ddi_get_soft_state(vmm_statep, minor); 3037 ASSERT(sc); 3038 3039 if (sc->vmm_flags & VMM_DESTROY) 3040 return (ENXIO); 3041 3042 /* Grab read lock on the VM to prevent any changes to the memory map */ 3043 vmm_read_lock(sc); 3044 3045 if (off >= VM_DEVMEM_START) { 3046 int segid; 3047 off_t segoff; 3048 3049 /* Mapping a devmem "device" */ 3050 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3051 err = ENODEV; 3052 } else { 3053 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3054 addrp, prot, maxprot, flags); 3055 } 3056 } else { 3057 /* Mapping a part of the guest physical space */ 3058 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3059 maxprot, flags); 3060 } 3061 3062 vmm_read_unlock(sc); 3063 return (err); 3064 } 3065 3066 static sdev_plugin_validate_t 3067 vmm_sdev_validate(sdev_ctx_t ctx) 3068 { 3069 const char *name = sdev_ctx_name(ctx); 3070 vmm_softc_t *sc; 3071 sdev_plugin_validate_t ret; 3072 minor_t minor; 3073 3074 if (sdev_ctx_vtype(ctx) != VCHR) 3075 return (SDEV_VTOR_INVALID); 3076 3077 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3078 3079 mutex_enter(&vmm_mtx); 3080 if ((sc = vmm_lookup(name)) == NULL) 3081 ret = SDEV_VTOR_INVALID; 3082 else if (sc->vmm_minor != minor) 3083 ret = SDEV_VTOR_STALE; 3084 else 3085 ret = SDEV_VTOR_VALID; 3086 mutex_exit(&vmm_mtx); 3087 3088 return (ret); 3089 } 3090 3091 static int 3092 vmm_sdev_filldir(sdev_ctx_t ctx) 3093 { 3094 vmm_softc_t *sc; 3095 int ret; 3096 3097 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3098 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3099 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3100 return (EINVAL); 3101 } 3102 3103 mutex_enter(&vmm_mtx); 3104 ASSERT(vmmdev_dip != NULL); 3105 for (sc = list_head(&vmm_list); sc != NULL; 3106 sc = list_next(&vmm_list, sc)) { 3107 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3108 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3109 S_IFCHR | 0600, 3110 
makedevice(ddi_driver_major(vmmdev_dip), 3111 sc->vmm_minor)); 3112 } else { 3113 continue; 3114 } 3115 if (ret != 0 && ret != EEXIST) 3116 goto out; 3117 } 3118 3119 ret = 0; 3120 3121 out: 3122 mutex_exit(&vmm_mtx); 3123 return (ret); 3124 } 3125 3126 /* ARGSUSED */ 3127 static void 3128 vmm_sdev_inactive(sdev_ctx_t ctx) 3129 { 3130 } 3131 3132 static sdev_plugin_ops_t vmm_sdev_ops = { 3133 .spo_version = SDEV_PLUGIN_VERSION, 3134 .spo_flags = SDEV_PLUGIN_SUBDIR, 3135 .spo_validate = vmm_sdev_validate, 3136 .spo_filldir = vmm_sdev_filldir, 3137 .spo_inactive = vmm_sdev_inactive 3138 }; 3139 3140 /* ARGSUSED */ 3141 static int 3142 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3143 { 3144 int error; 3145 3146 switch (cmd) { 3147 case DDI_INFO_DEVT2DEVINFO: 3148 *result = (void *)vmmdev_dip; 3149 error = DDI_SUCCESS; 3150 break; 3151 case DDI_INFO_DEVT2INSTANCE: 3152 *result = (void *)0; 3153 error = DDI_SUCCESS; 3154 break; 3155 default: 3156 error = DDI_FAILURE; 3157 break; 3158 } 3159 return (error); 3160 } 3161 3162 static int 3163 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3164 { 3165 sdev_plugin_hdl_t sph; 3166 hma_reg_t *reg = NULL; 3167 boolean_t vmm_loaded = B_FALSE; 3168 3169 if (cmd != DDI_ATTACH) { 3170 return (DDI_FAILURE); 3171 } 3172 3173 mutex_enter(&vmmdev_mtx); 3174 /* Ensure we are not already attached. */ 3175 if (vmmdev_dip != NULL) { 3176 mutex_exit(&vmmdev_mtx); 3177 return (DDI_FAILURE); 3178 } 3179 3180 vmm_sol_glue_init(); 3181 3182 /* 3183 * Perform temporary HMA registration to determine if the system 3184 * is capable. 3185 */ 3186 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3187 goto fail; 3188 } else if (vmm_mod_load() != 0) { 3189 goto fail; 3190 } 3191 vmm_loaded = B_TRUE; 3192 hma_unregister(reg); 3193 reg = NULL; 3194 3195 /* Create control node. Other nodes will be created on demand. */ 3196 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3197 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3198 goto fail; 3199 } 3200 3201 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3202 if (sph == (sdev_plugin_hdl_t)NULL) { 3203 ddi_remove_minor_node(dip, NULL); 3204 goto fail; 3205 } 3206 3207 ddi_report_dev(dip); 3208 vmmdev_sdev_hdl = sph; 3209 vmmdev_dip = dip; 3210 mutex_exit(&vmmdev_mtx); 3211 return (DDI_SUCCESS); 3212 3213 fail: 3214 if (vmm_loaded) { 3215 VERIFY0(vmm_mod_unload()); 3216 } 3217 if (reg != NULL) { 3218 hma_unregister(reg); 3219 } 3220 vmm_sol_glue_cleanup(); 3221 mutex_exit(&vmmdev_mtx); 3222 return (DDI_FAILURE); 3223 } 3224 3225 static int 3226 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3227 { 3228 if (cmd != DDI_DETACH) { 3229 return (DDI_FAILURE); 3230 } 3231 3232 /* 3233 * Ensure that all resources have been cleaned up. 3234 * 3235 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3236 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3237 * devinfo locked as iommu_cleanup() tries to recursively lock each 3238 * devinfo, including our own, while holding vmmdev_mtx. 
3239 */ 3240 if (mutex_tryenter(&vmmdev_mtx) == 0) 3241 return (DDI_FAILURE); 3242 3243 mutex_enter(&vmm_mtx); 3244 if (!list_is_empty(&vmm_list)) { 3245 mutex_exit(&vmm_mtx); 3246 mutex_exit(&vmmdev_mtx); 3247 return (DDI_FAILURE); 3248 } 3249 mutex_exit(&vmm_mtx); 3250 3251 if (!vmmr_is_empty()) { 3252 mutex_exit(&vmmdev_mtx); 3253 return (DDI_FAILURE); 3254 } 3255 3256 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3257 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3258 mutex_exit(&vmmdev_mtx); 3259 return (DDI_FAILURE); 3260 } 3261 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3262 3263 /* Remove the control node. */ 3264 ddi_remove_minor_node(dip, "ctl"); 3265 vmmdev_dip = NULL; 3266 3267 VERIFY0(vmm_mod_unload()); 3268 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3269 vmm_sol_glue_cleanup(); 3270 3271 mutex_exit(&vmmdev_mtx); 3272 3273 return (DDI_SUCCESS); 3274 } 3275 3276 static struct cb_ops vmm_cb_ops = { 3277 vmm_open, 3278 vmm_close, 3279 nodev, /* strategy */ 3280 nodev, /* print */ 3281 nodev, /* dump */ 3282 nodev, /* read */ 3283 nodev, /* write */ 3284 vmm_ioctl, 3285 nodev, /* devmap */ 3286 nodev, /* mmap */ 3287 vmm_segmap, 3288 nochpoll, /* poll */ 3289 ddi_prop_op, 3290 NULL, 3291 D_NEW | D_MP | D_DEVMAP 3292 }; 3293 3294 static struct dev_ops vmm_ops = { 3295 DEVO_REV, 3296 0, 3297 vmm_info, 3298 nulldev, /* identify */ 3299 nulldev, /* probe */ 3300 vmm_attach, 3301 vmm_detach, 3302 nodev, /* reset */ 3303 &vmm_cb_ops, 3304 (struct bus_ops *)NULL 3305 }; 3306 3307 static struct modldrv modldrv = { 3308 &mod_driverops, 3309 "bhyve vmm", 3310 &vmm_ops 3311 }; 3312 3313 static struct modlinkage modlinkage = { 3314 MODREV_1, 3315 &modldrv, 3316 NULL 3317 }; 3318 3319 int 3320 _init(void) 3321 { 3322 int error; 3323 3324 sysinit(); 3325 3326 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3327 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3328 list_create(&vmm_list, sizeof (vmm_softc_t), 3329 offsetof(vmm_softc_t, vmm_node)); 3330 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3331 3332 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3333 if (error) { 3334 return (error); 3335 } 3336 3337 vmm_zsd_init(); 3338 vmmr_init(); 3339 3340 error = mod_install(&modlinkage); 3341 if (error) { 3342 ddi_soft_state_fini(&vmm_statep); 3343 vmm_zsd_fini(); 3344 vmmr_fini(); 3345 } 3346 3347 return (error); 3348 } 3349 3350 int 3351 _fini(void) 3352 { 3353 int error; 3354 3355 error = mod_remove(&modlinkage); 3356 if (error) { 3357 return (error); 3358 } 3359 3360 vmm_zsd_fini(); 3361 vmmr_fini(); 3362 3363 ddi_soft_state_fini(&vmm_statep); 3364 3365 return (0); 3366 } 3367 3368 int 3369 _info(struct modinfo *modinfop) 3370 { 3371 return (mod_info(&modlinkage, modinfop)); 3372 } 3373
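
/*
 * Illustrative sketch (not compiled in): roughly how a vmm_drv consumer in
 * the style of 'viona' might use the hold/lease interface implemented above.
 * The example_* names below are hypothetical and exist only to show the
 * expected ordering of calls; only the vmm_drv_* entry points are real.
 */
#if 0
static boolean_t
example_lease_expired_cb(void *arg)
{
    /*
     * A real consumer would park its worker threads here so they stop
     * touching guest memory.  Returning B_FALSE defers the actual break
     * to a later vmm_drv_lease_break() call by the consumer.
     */
    return (B_FALSE);
}

static int
example_attach_to_vm(file_t *fp, cred_t *cr, uintptr_t gpa,
    uint64_t msi_addr, uint64_t msi_msg)
{
    vmm_hold_t *hold;
    vmm_lease_t *lease;
    vmm_page_t *page;
    int err;

    /* Take a hold on the instance backing the caller-supplied vmm fd */
    if ((err = vmm_drv_hold(fp, cr, &hold)) != 0)
        return (err);

    /* Sign a lease so guest memory access and MSI injection are safe */
    lease = vmm_drv_lease_sign(hold, example_lease_expired_cb, NULL);
    if (lease == NULL) {
        vmm_drv_rele(hold);
        return (EBUSY);
    }

    /* Hold a page-aligned guest page and zero its writable mapping */
    page = vmm_drv_page_hold(lease, gpa & ~(uintptr_t)PAGEOFFSET,
        PROT_READ | PROT_WRITE);
    if (page != NULL) {
        bzero(vmm_drv_page_writable(page), PAGESIZE);
        vmm_drv_page_release(page);
    }

    /* Notify the guest via MSI */
    (void) vmm_drv_msi(lease, msi_addr, msi_msg);

    /* Tear down in reverse order of acquisition */
    vmm_drv_lease_break(hold, lease);
    vmm_drv_rele(hold);
    return (0);
}
#endif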