1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2023 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. 
Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

/* Driver-wide state (vmmdev_*); per the locking notes, guarded by vmmdev_mtx */
static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

/* Instance list and related data (vmm_*); guarded by vmm_mtx */
static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

/*
 * temporary safety switch
 *
 * Unlike the other globals here, this one is not static.
 * NOTE(review): presumably so that it can be tuned from outside this file, and
 * presumably gating writes of VM state -- confirm at its point of use.
 */
int		vmm_allow_state_writes;

/*
 * NOTE(review): presumably the name under which this driver registers with
 * HMA -- confirm at the registration site.
 */
static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT "/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/*
 * Holds and hooks from drivers external to vmm
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only; confirm them against the code which manipulates holds.
 */
struct vmm_hold {
	list_node_t	vmh_node;	/* list linkage */
	vmm_softc_t	*vmh_sc;	/* instance this hold refers to */
	boolean_t	vmh_release_req; /* presumably: release was requested */
	uint_t		vmh_ioport_hook_cnt; /* presumably: # of ioport hooks */
};

/*
 * State for a single lease.  The "Resource Locking and Exclusion" notes in this
 * file describe leases as the means by which Driver API consumers hold the VM
 * read lock for extended periods, and state that such consumers must provide a
 * release callback.
 *
 * NOTE(review): vml_expire_func/vml_expire_arg are presumably that callback and
 * its argument, and vml_hold the hold under which the lease was taken --
 * confirm against the lease code.
 */
struct vmm_lease {
	list_node_t		vml_node;
	struct vm		*vml_vm;
	vm_client_t		*vml_vmclient;
	boolean_t		vml_expired;
	boolean_t		vml_break_deferred;
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	struct vmm_hold		*vml_hold;
};

/*
 * Options for vmm_destroy_locked
 *
 * Each non-default option is assigned its own bit.
 */
typedef enum vmm_destroy_opts {
	VDO_DEFAULT		= 0,
	/*
	 * Indicate that zone-specific-data associated with this VM not be
	 * cleaned up as part of the destroy.  Skipping ZSD clean-up is
	 * necessary when VM is being destroyed as part of zone destruction,
	 * when said ZSD is already being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),
	/*
	 * Attempt to wait for VM destruction to complete.  This is opt-in,
	 * since there are many normal conditions which could lead to
	 * destruction being stalled pending other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= (1 << 1),
} vmm_destroy_opts_t;

/* Prototypes for file-local functions referenced ahead of their definitions */
static void vmm_hma_release(void);
static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  Direct access to the underlying vm memory
 * segments is exposed by placing them in a range of offsets beyond the normal
 * guest memory space.  Userspace can query the appropriate offset to mmap()
 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
151 */ 152 153 static vmm_devmem_entry_t * 154 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 155 { 156 vmm_devmem_entry_t *ent = NULL; 157 list_t *dl = &sc->vmm_devmem_list; 158 159 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 160 if (ent->vde_segid == segid) { 161 return (ent); 162 } 163 } 164 return (NULL); 165 } 166 167 static int 168 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 169 { 170 int error; 171 bool sysmem; 172 173 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 174 NULL); 175 if (error || mseg->len == 0) 176 return (error); 177 178 if (!sysmem) { 179 vmm_devmem_entry_t *de; 180 181 de = vmmdev_devmem_find(sc, mseg->segid); 182 if (de != NULL) { 183 (void) strlcpy(mseg->name, de->vde_name, 184 sizeof (mseg->name)); 185 } 186 } else { 187 bzero(mseg->name, sizeof (mseg->name)); 188 } 189 190 return (error); 191 } 192 193 static int 194 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 195 { 196 off_t map_offset; 197 vmm_devmem_entry_t *entry; 198 199 if (list_is_empty(&sc->vmm_devmem_list)) { 200 map_offset = VM_DEVMEM_START; 201 } else { 202 entry = list_tail(&sc->vmm_devmem_list); 203 map_offset = entry->vde_off + entry->vde_len; 204 if (map_offset < entry->vde_off) { 205 /* Do not tolerate overflow */ 206 return (ERANGE); 207 } 208 /* 209 * XXXJOY: We could choose to search the list for duplicate 210 * names and toss an error. Since we're using the offset 211 * method for now, it does not make much of a difference. 
212 */ 213 } 214 215 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 216 entry->vde_segid = mseg->segid; 217 entry->vde_len = mseg->len; 218 entry->vde_off = map_offset; 219 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 220 list_insert_tail(&sc->vmm_devmem_list, entry); 221 222 return (0); 223 } 224 225 static boolean_t 226 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 227 off_t *map_offp) 228 { 229 list_t *dl = &sc->vmm_devmem_list; 230 vmm_devmem_entry_t *de = NULL; 231 const off_t map_end = off + len; 232 233 VERIFY(off >= VM_DEVMEM_START); 234 235 if (map_end < off) { 236 /* No match on overflow */ 237 return (B_FALSE); 238 } 239 240 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 241 const off_t item_end = de->vde_off + de->vde_len; 242 243 if (de->vde_off <= off && item_end >= map_end) { 244 *segidp = de->vde_segid; 245 *map_offp = off - de->vde_off; 246 return (B_TRUE); 247 } 248 } 249 return (B_FALSE); 250 } 251 252 /* 253 * When an instance is being destroyed, the devmem list of named memory objects 254 * can be torn down, as no new mappings are allowed. 255 */ 256 static void 257 vmmdev_devmem_purge(vmm_softc_t *sc) 258 { 259 vmm_devmem_entry_t *entry; 260 261 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 262 kmem_free(entry, sizeof (*entry)); 263 } 264 } 265 266 static int 267 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 268 { 269 int error; 270 bool sysmem = true; 271 272 if (VM_MEMSEG_NAME(mseg)) { 273 sysmem = false; 274 } 275 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 276 277 if (error == 0) { 278 /* 279 * Rather than create a whole fresh device from which userspace 280 * can mmap this segment, instead make it available at an 281 * offset above where the main guest memory resides. 
282 */ 283 error = vmmdev_devmem_create(sc, mseg, mseg->name); 284 if (error != 0) { 285 vm_free_memseg(sc->vmm_vm, mseg->segid); 286 } 287 } 288 return (error); 289 } 290 291 /* 292 * Resource Locking and Exclusion 293 * 294 * Much of bhyve depends on key portions of VM state, such as the guest memory 295 * map, to remain unchanged while the guest is running. As ported from 296 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 297 * access to the instance vCPUs. Threads acting on a single vCPU, like those 298 * performing the work of actually running the guest in VMX/SVM, would lock 299 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 300 * state, all of the vCPUs would be first locked, ensuring that the 301 * operation(s) could complete without any other threads stumbling into 302 * intermediate states. 303 * 304 * This approach is largely effective for bhyve. Common operations, such as 305 * running the vCPUs, steer clear of lock contention. The model begins to 306 * break down for operations which do not occur in the context of a specific 307 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 308 * thread in the bhyve process. In order to properly protect those vCPU-less 309 * operations from encountering invalid states, additional locking is required. 310 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 311 * It does mean that class of operations will be serialized on locking the 312 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 313 * undue contention on the VM_MAXCPU-1 vCPU. 314 * 315 * In order to address the shortcomings of this model, the concept of a 316 * read/write lock has been added to bhyve. Operations which change 317 * fundamental aspects of a VM (such as the memory map) must acquire the write 318 * lock, which also implies locking all of the vCPUs and waiting for all read 319 * lock holders to release. 
While it increases the cost and waiting time for 320 * those few operations, it allows most hot-path operations on the VM (which 321 * depend on its configuration remaining stable) to occur with minimal locking. 322 * 323 * Consumers of the Driver API (see below) are a special case when it comes to 324 * this locking, since they may hold a read lock via the drv_lease mechanism 325 * for an extended period of time. Rather than forcing those consumers to 326 * continuously poll for a write lock attempt, the lease system forces them to 327 * provide a release callback to trigger their clean-up (and potential later 328 * reacquisition) of the read lock. 329 */ 330 331 static void 332 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 333 { 334 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 335 336 /* 337 * Since this state transition is utilizing from_idle=true, it should 338 * not fail, but rather block until it can be successful. 339 */ 340 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 341 } 342 343 static void 344 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 345 { 346 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 347 348 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 349 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 350 } 351 352 static void 353 vmm_read_lock(vmm_softc_t *sc) 354 { 355 rw_enter(&sc->vmm_rwlock, RW_READER); 356 } 357 358 static void 359 vmm_read_unlock(vmm_softc_t *sc) 360 { 361 rw_exit(&sc->vmm_rwlock); 362 } 363 364 static void 365 vmm_write_lock(vmm_softc_t *sc) 366 { 367 int maxcpus; 368 369 /* First lock all the vCPUs */ 370 maxcpus = vm_get_maxcpus(sc->vmm_vm); 371 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 372 vcpu_lock_one(sc, vcpu); 373 } 374 375 /* 376 * Block vmm_drv leases from being acquired or held while the VM write 377 * lock is held. 
378 */ 379 vmm_lease_block(sc); 380 381 rw_enter(&sc->vmm_rwlock, RW_WRITER); 382 /* 383 * For now, the 'maxcpus' value for an instance is fixed at the 384 * compile-time constant of VM_MAXCPU at creation. If this changes in 385 * the future, allowing for dynamic vCPU resource sizing, acquisition 386 * of the write lock will need to be wary of such changes. 387 */ 388 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 389 } 390 391 static void 392 vmm_write_unlock(vmm_softc_t *sc) 393 { 394 int maxcpus; 395 396 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 397 vmm_lease_unblock(sc); 398 399 /* 400 * The VM write lock _must_ be released from the same thread it was 401 * acquired in, unlike the read lock. 402 */ 403 VERIFY(rw_write_held(&sc->vmm_rwlock)); 404 rw_exit(&sc->vmm_rwlock); 405 406 /* Unlock all the vCPUs */ 407 maxcpus = vm_get_maxcpus(sc->vmm_vm); 408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 409 vcpu_unlock_one(sc, vcpu); 410 } 411 } 412 413 static int 414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 415 cred_t *credp, int *rvalp) 416 { 417 int error = 0, vcpu = -1; 418 void *datap = (void *)arg; 419 enum vm_lock_type { 420 LOCK_NONE = 0, 421 LOCK_VCPU, 422 LOCK_READ_HOLD, 423 LOCK_WRITE_HOLD 424 } lock_type = LOCK_NONE; 425 426 /* Acquire any exclusion resources needed for the operation. 
*/ 427 switch (cmd) { 428 case VM_RUN: 429 case VM_GET_REGISTER: 430 case VM_SET_REGISTER: 431 case VM_GET_SEGMENT_DESCRIPTOR: 432 case VM_SET_SEGMENT_DESCRIPTOR: 433 case VM_GET_REGISTER_SET: 434 case VM_SET_REGISTER_SET: 435 case VM_INJECT_EXCEPTION: 436 case VM_GET_CAPABILITY: 437 case VM_SET_CAPABILITY: 438 case VM_PPTDEV_MSI: 439 case VM_PPTDEV_MSIX: 440 case VM_SET_X2APIC_STATE: 441 case VM_GLA2GPA: 442 case VM_GLA2GPA_NOFAULT: 443 case VM_ACTIVATE_CPU: 444 case VM_SET_INTINFO: 445 case VM_GET_INTINFO: 446 case VM_RESTART_INSTRUCTION: 447 case VM_SET_KERNEMU_DEV: 448 case VM_GET_KERNEMU_DEV: 449 case VM_RESET_CPU: 450 case VM_GET_RUN_STATE: 451 case VM_SET_RUN_STATE: 452 case VM_GET_FPU: 453 case VM_SET_FPU: 454 case VM_GET_CPUID: 455 case VM_SET_CPUID: 456 case VM_LEGACY_CPUID: 457 /* 458 * Copy in the ID of the vCPU chosen for this operation. 459 * Since a nefarious caller could update their struct between 460 * this locking and when the rest of the ioctl data is copied 461 * in, it is _critical_ that this local 'vcpu' variable be used 462 * rather than the in-struct one when performing the ioctl. 
463 */ 464 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 465 return (EFAULT); 466 } 467 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 468 return (EINVAL); 469 } 470 vcpu_lock_one(sc, vcpu); 471 lock_type = LOCK_VCPU; 472 break; 473 474 case VM_REINIT: 475 case VM_BIND_PPTDEV: 476 case VM_UNBIND_PPTDEV: 477 case VM_MAP_PPTDEV_MMIO: 478 case VM_UNMAP_PPTDEV_MMIO: 479 case VM_ALLOC_MEMSEG: 480 case VM_MMAP_MEMSEG: 481 case VM_MUNMAP_MEMSEG: 482 case VM_WRLOCK_CYCLE: 483 case VM_PMTMR_LOCATE: 484 case VM_PAUSE: 485 case VM_RESUME: 486 vmm_write_lock(sc); 487 lock_type = LOCK_WRITE_HOLD; 488 break; 489 490 case VM_GET_MEMSEG: 491 case VM_MMAP_GETNEXT: 492 case VM_LAPIC_IRQ: 493 case VM_INJECT_NMI: 494 case VM_IOAPIC_ASSERT_IRQ: 495 case VM_IOAPIC_DEASSERT_IRQ: 496 case VM_IOAPIC_PULSE_IRQ: 497 case VM_LAPIC_MSI: 498 case VM_LAPIC_LOCAL_IRQ: 499 case VM_GET_X2APIC_STATE: 500 case VM_RTC_READ: 501 case VM_RTC_WRITE: 502 case VM_RTC_SETTIME: 503 case VM_RTC_GETTIME: 504 case VM_PPTDEV_DISABLE_MSIX: 505 case VM_DEVMEM_GETOFFSET: 506 case VM_TRACK_DIRTY_PAGES: 507 vmm_read_lock(sc); 508 lock_type = LOCK_READ_HOLD; 509 break; 510 511 case VM_DATA_READ: 512 case VM_DATA_WRITE: 513 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 514 return (EFAULT); 515 } 516 if (vcpu == -1) { 517 /* Access data for VM-wide devices */ 518 vmm_write_lock(sc); 519 lock_type = LOCK_WRITE_HOLD; 520 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 521 /* Access data associated with a specific vCPU */ 522 vcpu_lock_one(sc, vcpu); 523 lock_type = LOCK_VCPU; 524 } else { 525 return (EINVAL); 526 } 527 break; 528 529 case VM_GET_GPA_PMAP: 530 case VM_IOAPIC_PINCOUNT: 531 case VM_SUSPEND: 532 case VM_DESC_FPU_AREA: 533 case VM_SET_AUTODESTRUCT: 534 case VM_DESTROY_SELF: 535 case VM_DESTROY_PENDING: 536 default: 537 break; 538 } 539 540 /* Execute the primary logic for the ioctl. 
*/ 541 switch (cmd) { 542 case VM_RUN: { 543 struct vm_entry entry; 544 545 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 546 error = EFAULT; 547 break; 548 } 549 550 if (!(curthread->t_schedflag & TS_VCPU)) 551 smt_mark_as_vcpu(); 552 553 error = vm_run(sc->vmm_vm, vcpu, &entry); 554 555 /* 556 * Unexpected states in vm_run() are expressed through positive 557 * errno-oriented return values. VM states which expect further 558 * processing in userspace (necessary context via exitinfo) are 559 * expressed through negative return values. For the time being 560 * a return value of 0 is not expected from vm_run(). 561 */ 562 ASSERT(error != 0); 563 if (error < 0) { 564 const struct vm_exit *vme; 565 void *outp = entry.exit_data; 566 567 error = 0; 568 vme = vm_exitinfo(sc->vmm_vm, vcpu); 569 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 570 error = EFAULT; 571 } 572 } 573 break; 574 } 575 case VM_SUSPEND: { 576 struct vm_suspend vmsuspend; 577 578 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 579 error = EFAULT; 580 break; 581 } 582 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 583 break; 584 } 585 case VM_REINIT: { 586 struct vm_reinit reinit; 587 588 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 589 error = EFAULT; 590 break; 591 } 592 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 593 /* 594 * The VM instance should be free of driver-attached 595 * hooks during the reinitialization process. 
596 */ 597 break; 598 } 599 error = vm_reinit(sc->vmm_vm, reinit.flags); 600 (void) vmm_drv_block_hook(sc, B_FALSE); 601 break; 602 } 603 case VM_STAT_DESC: { 604 struct vm_stat_desc statdesc; 605 606 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 607 error = EFAULT; 608 break; 609 } 610 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 611 sizeof (statdesc.desc)); 612 if (error == 0 && 613 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 614 error = EFAULT; 615 break; 616 } 617 break; 618 } 619 case VM_STATS_IOC: { 620 struct vm_stats vmstats; 621 622 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 623 error = EFAULT; 624 break; 625 } 626 hrt2tv(gethrtime(), &vmstats.tv); 627 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 628 nitems(vmstats.statbuf), 629 &vmstats.num_entries, vmstats.statbuf); 630 if (error == 0 && 631 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 632 error = EFAULT; 633 break; 634 } 635 break; 636 } 637 638 case VM_PPTDEV_MSI: { 639 struct vm_pptdev_msi pptmsi; 640 641 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 642 error = EFAULT; 643 break; 644 } 645 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 646 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 647 break; 648 } 649 case VM_PPTDEV_MSIX: { 650 struct vm_pptdev_msix pptmsix; 651 652 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 653 error = EFAULT; 654 break; 655 } 656 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 657 pptmsix.idx, pptmsix.addr, pptmsix.msg, 658 pptmsix.vector_control); 659 break; 660 } 661 case VM_PPTDEV_DISABLE_MSIX: { 662 struct vm_pptdev pptdev; 663 664 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 665 error = EFAULT; 666 break; 667 } 668 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 669 break; 670 } 671 case VM_MAP_PPTDEV_MMIO: { 672 struct vm_pptdev_mmio pptmmio; 673 674 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 675 error = EFAULT; 676 
break; 677 } 678 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 679 pptmmio.len, pptmmio.hpa); 680 break; 681 } 682 case VM_UNMAP_PPTDEV_MMIO: { 683 struct vm_pptdev_mmio pptmmio; 684 685 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 686 error = EFAULT; 687 break; 688 } 689 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 690 pptmmio.len); 691 break; 692 } 693 case VM_BIND_PPTDEV: { 694 struct vm_pptdev pptdev; 695 696 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 697 error = EFAULT; 698 break; 699 } 700 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 701 break; 702 } 703 case VM_UNBIND_PPTDEV: { 704 struct vm_pptdev pptdev; 705 706 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 707 error = EFAULT; 708 break; 709 } 710 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 711 break; 712 } 713 case VM_GET_PPTDEV_LIMITS: { 714 struct vm_pptdev_limits pptlimits; 715 716 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 717 error = EFAULT; 718 break; 719 } 720 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 721 &pptlimits.msi_limit, &pptlimits.msix_limit); 722 if (error == 0 && 723 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 724 error = EFAULT; 725 break; 726 } 727 break; 728 } 729 case VM_INJECT_EXCEPTION: { 730 struct vm_exception vmexc; 731 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 732 error = EFAULT; 733 break; 734 } 735 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 736 vmexc.error_code_valid != 0, vmexc.error_code, 737 vmexc.restart_instruction != 0); 738 break; 739 } 740 case VM_INJECT_NMI: { 741 struct vm_nmi vmnmi; 742 743 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 748 break; 749 } 750 case VM_LAPIC_IRQ: { 751 struct vm_lapic_irq vmirq; 752 753 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 754 error = EFAULT; 755 break; 756 } 757 error = 
lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 758 break; 759 } 760 case VM_LAPIC_LOCAL_IRQ: { 761 struct vm_lapic_irq vmirq; 762 763 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 764 error = EFAULT; 765 break; 766 } 767 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 768 vmirq.vector); 769 break; 770 } 771 case VM_LAPIC_MSI: { 772 struct vm_lapic_msi vmmsi; 773 774 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 775 error = EFAULT; 776 break; 777 } 778 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 779 break; 780 } 781 782 case VM_IOAPIC_ASSERT_IRQ: { 783 struct vm_ioapic_irq ioapic_irq; 784 785 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 786 error = EFAULT; 787 break; 788 } 789 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 790 break; 791 } 792 case VM_IOAPIC_DEASSERT_IRQ: { 793 struct vm_ioapic_irq ioapic_irq; 794 795 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 796 error = EFAULT; 797 break; 798 } 799 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 800 break; 801 } 802 case VM_IOAPIC_PULSE_IRQ: { 803 struct vm_ioapic_irq ioapic_irq; 804 805 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 806 error = EFAULT; 807 break; 808 } 809 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 810 break; 811 } 812 case VM_IOAPIC_PINCOUNT: { 813 int pincount; 814 815 pincount = vioapic_pincount(sc->vmm_vm); 816 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 817 error = EFAULT; 818 break; 819 } 820 break; 821 } 822 case VM_DESC_FPU_AREA: { 823 struct vm_fpu_desc desc; 824 void *buf = NULL; 825 826 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 827 error = EFAULT; 828 break; 829 } 830 if (desc.vfd_num_entries > 64) { 831 error = EINVAL; 832 break; 833 } 834 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 835 desc.vfd_num_entries; 836 if (buf_sz != 0) { 837 buf = kmem_zalloc(buf_sz, KM_SLEEP); 838 } 839 840 /* 841 * For now, we are depending on 
vm_fpu_desc_entry and 842 * hma_xsave_state_desc_t having the same format. 843 */ 844 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 845 sizeof (hma_xsave_state_desc_t)); 846 847 size_t req_size; 848 const uint_t max_entries = hma_fpu_describe_xsave_state( 849 (hma_xsave_state_desc_t *)buf, 850 desc.vfd_num_entries, 851 &req_size); 852 853 desc.vfd_req_size = req_size; 854 desc.vfd_num_entries = max_entries; 855 if (buf_sz != 0) { 856 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 857 error = EFAULT; 858 } 859 kmem_free(buf, buf_sz); 860 } 861 862 if (error == 0) { 863 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 864 error = EFAULT; 865 } 866 } 867 break; 868 } 869 case VM_SET_AUTODESTRUCT: { 870 /* 871 * Since this has to do with controlling the lifetime of the 872 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 873 * than the vcpu-centric or rwlock exclusion mechanisms. 874 */ 875 mutex_enter(&vmm_mtx); 876 if (arg != 0) { 877 sc->vmm_flags |= VMM_AUTODESTROY; 878 } else { 879 sc->vmm_flags &= ~VMM_AUTODESTROY; 880 } 881 mutex_exit(&vmm_mtx); 882 break; 883 } 884 case VM_DESTROY_SELF: { 885 bool hma_release = false; 886 887 /* 888 * Just like VMM_DESTROY_VM, but on the instance file descriptor 889 * itself, rather than having to perform a racy name lookup as 890 * part of the destroy process. 891 * 892 * Since vmm_destroy_locked() performs vCPU lock acquisition in 893 * order to kick the vCPUs out of guest context as part of any 894 * destruction, we do not need to worry about it ourself using 895 * the `lock_type` logic here. 896 */ 897 mutex_enter(&vmm_mtx); 898 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 899 mutex_exit(&vmm_mtx); 900 if (hma_release) { 901 vmm_hma_release(); 902 } 903 break; 904 } 905 case VM_DESTROY_PENDING: { 906 /* 907 * If we have made it this far, then destruction of the instance 908 * has not been initiated. 
909 */ 910 *rvalp = 0; 911 break; 912 } 913 914 case VM_ISA_ASSERT_IRQ: { 915 struct vm_isa_irq isa_irq; 916 917 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 918 error = EFAULT; 919 break; 920 } 921 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 922 if (error == 0 && isa_irq.ioapic_irq != -1) { 923 error = vioapic_assert_irq(sc->vmm_vm, 924 isa_irq.ioapic_irq); 925 } 926 break; 927 } 928 case VM_ISA_DEASSERT_IRQ: { 929 struct vm_isa_irq isa_irq; 930 931 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 932 error = EFAULT; 933 break; 934 } 935 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 936 if (error == 0 && isa_irq.ioapic_irq != -1) { 937 error = vioapic_deassert_irq(sc->vmm_vm, 938 isa_irq.ioapic_irq); 939 } 940 break; 941 } 942 case VM_ISA_PULSE_IRQ: { 943 struct vm_isa_irq isa_irq; 944 945 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 946 error = EFAULT; 947 break; 948 } 949 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 950 if (error == 0 && isa_irq.ioapic_irq != -1) { 951 error = vioapic_pulse_irq(sc->vmm_vm, 952 isa_irq.ioapic_irq); 953 } 954 break; 955 } 956 case VM_ISA_SET_IRQ_TRIGGER: { 957 struct vm_isa_irq_trigger isa_irq_trigger; 958 959 if (ddi_copyin(datap, &isa_irq_trigger, 960 sizeof (isa_irq_trigger), md)) { 961 error = EFAULT; 962 break; 963 } 964 error = vatpic_set_irq_trigger(sc->vmm_vm, 965 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 966 break; 967 } 968 969 case VM_MMAP_GETNEXT: { 970 struct vm_memmap mm; 971 972 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 973 error = EFAULT; 974 break; 975 } 976 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 977 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 978 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 979 error = EFAULT; 980 break; 981 } 982 break; 983 } 984 case VM_MMAP_MEMSEG: { 985 struct vm_memmap mm; 986 987 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 988 error = EFAULT; 989 break; 990 } 991 error 
= vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 992 mm.len, mm.prot, mm.flags); 993 break; 994 } 995 case VM_MUNMAP_MEMSEG: { 996 struct vm_munmap mu; 997 998 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 999 error = EFAULT; 1000 break; 1001 } 1002 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1003 break; 1004 } 1005 case VM_ALLOC_MEMSEG: { 1006 struct vm_memseg vmseg; 1007 1008 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1009 error = EFAULT; 1010 break; 1011 } 1012 error = vmmdev_alloc_memseg(sc, &vmseg); 1013 break; 1014 } 1015 case VM_GET_MEMSEG: { 1016 struct vm_memseg vmseg; 1017 1018 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1019 error = EFAULT; 1020 break; 1021 } 1022 error = vmmdev_get_memseg(sc, &vmseg); 1023 if (error == 0 && 1024 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 1025 error = EFAULT; 1026 break; 1027 } 1028 break; 1029 } 1030 case VM_GET_REGISTER: { 1031 struct vm_register vmreg; 1032 1033 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1034 error = EFAULT; 1035 break; 1036 } 1037 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 1038 &vmreg.regval); 1039 if (error == 0 && 1040 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 1041 error = EFAULT; 1042 break; 1043 } 1044 break; 1045 } 1046 case VM_SET_REGISTER: { 1047 struct vm_register vmreg; 1048 1049 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1050 error = EFAULT; 1051 break; 1052 } 1053 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 1054 vmreg.regval); 1055 break; 1056 } 1057 case VM_SET_SEGMENT_DESCRIPTOR: { 1058 struct vm_seg_desc vmsegd; 1059 1060 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1061 error = EFAULT; 1062 break; 1063 } 1064 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1065 &vmsegd.desc); 1066 break; 1067 } 1068 case VM_GET_SEGMENT_DESCRIPTOR: { 1069 struct vm_seg_desc vmsegd; 1070 1071 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 1072 error = EFAULT; 1073 break; 1074 } 
1075 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1076 &vmsegd.desc); 1077 if (error == 0 && 1078 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 1079 error = EFAULT; 1080 break; 1081 } 1082 break; 1083 } 1084 case VM_GET_REGISTER_SET: { 1085 struct vm_register_set vrs; 1086 int regnums[VM_REG_LAST]; 1087 uint64_t regvals[VM_REG_LAST]; 1088 1089 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1090 error = EFAULT; 1091 break; 1092 } 1093 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1094 error = EINVAL; 1095 break; 1096 } 1097 if (ddi_copyin(vrs.regnums, regnums, 1098 sizeof (int) * vrs.count, md)) { 1099 error = EFAULT; 1100 break; 1101 } 1102 1103 error = 0; 1104 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1105 if (regnums[i] < 0) { 1106 error = EINVAL; 1107 break; 1108 } 1109 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1110 ®vals[i]); 1111 } 1112 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1113 sizeof (uint64_t) * vrs.count, md)) { 1114 error = EFAULT; 1115 } 1116 break; 1117 } 1118 case VM_SET_REGISTER_SET: { 1119 struct vm_register_set vrs; 1120 int regnums[VM_REG_LAST]; 1121 uint64_t regvals[VM_REG_LAST]; 1122 1123 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1124 error = EFAULT; 1125 break; 1126 } 1127 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1128 error = EINVAL; 1129 break; 1130 } 1131 if (ddi_copyin(vrs.regnums, regnums, 1132 sizeof (int) * vrs.count, md)) { 1133 error = EFAULT; 1134 break; 1135 } 1136 if (ddi_copyin(vrs.regvals, regvals, 1137 sizeof (uint64_t) * vrs.count, md)) { 1138 error = EFAULT; 1139 break; 1140 } 1141 1142 error = 0; 1143 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1144 /* 1145 * Setting registers in a set is not atomic, since a 1146 * failure in the middle of the set will cause a 1147 * bail-out and inconsistent register state. Callers 1148 * should be wary of this. 
1149 */ 1150 if (regnums[i] < 0) { 1151 error = EINVAL; 1152 break; 1153 } 1154 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1155 regvals[i]); 1156 } 1157 break; 1158 } 1159 case VM_RESET_CPU: { 1160 struct vm_vcpu_reset vvr; 1161 1162 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1163 error = EFAULT; 1164 break; 1165 } 1166 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1167 error = EINVAL; 1168 } 1169 1170 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1171 break; 1172 } 1173 case VM_GET_RUN_STATE: { 1174 struct vm_run_state vrs; 1175 1176 bzero(&vrs, sizeof (vrs)); 1177 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1178 &vrs.sipi_vector); 1179 if (error == 0) { 1180 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1181 error = EFAULT; 1182 break; 1183 } 1184 } 1185 break; 1186 } 1187 case VM_SET_RUN_STATE: { 1188 struct vm_run_state vrs; 1189 1190 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1191 error = EFAULT; 1192 break; 1193 } 1194 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1195 vrs.sipi_vector); 1196 break; 1197 } 1198 case VM_GET_FPU: { 1199 struct vm_fpu_state req; 1200 const size_t max_len = (PAGESIZE * 2); 1201 void *kbuf; 1202 1203 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1204 error = EFAULT; 1205 break; 1206 } 1207 if (req.len > max_len || req.len == 0) { 1208 error = EINVAL; 1209 break; 1210 } 1211 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1212 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1213 if (error == 0) { 1214 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1215 error = EFAULT; 1216 } 1217 } 1218 kmem_free(kbuf, req.len); 1219 break; 1220 } 1221 case VM_SET_FPU: { 1222 struct vm_fpu_state req; 1223 const size_t max_len = (PAGESIZE * 2); 1224 void *kbuf; 1225 1226 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1227 error = EFAULT; 1228 break; 1229 } 1230 if (req.len > max_len || req.len == 0) { 1231 error = EINVAL; 1232 break; 1233 } 1234 kbuf = kmem_alloc(req.len, 
KM_SLEEP); 1235 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1236 error = EFAULT; 1237 } else { 1238 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1239 } 1240 kmem_free(kbuf, req.len); 1241 break; 1242 } 1243 case VM_GET_CPUID: { 1244 struct vm_vcpu_cpuid_config cfg; 1245 struct vcpu_cpuid_entry *entries = NULL; 1246 1247 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1248 error = EFAULT; 1249 break; 1250 } 1251 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1252 error = EINVAL; 1253 break; 1254 } 1255 1256 const size_t entries_size = 1257 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1258 if (entries_size != 0) { 1259 entries = kmem_zalloc(entries_size, KM_SLEEP); 1260 } 1261 1262 vcpu_cpuid_config_t vm_cfg = { 1263 .vcc_nent = cfg.vvcc_nent, 1264 .vcc_entries = entries, 1265 }; 1266 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1267 1268 /* 1269 * Only attempt to copy out the resultant entries if we were 1270 * able to query them from the instance. The flags and number 1271 * of entries are emitted regardless. 1272 */ 1273 cfg.vvcc_flags = vm_cfg.vcc_flags; 1274 cfg.vvcc_nent = vm_cfg.vcc_nent; 1275 if (entries != NULL) { 1276 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1277 entries_size, md) != 0) { 1278 error = EFAULT; 1279 } 1280 1281 kmem_free(entries, entries_size); 1282 } 1283 1284 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1285 error = EFAULT; 1286 } 1287 break; 1288 } 1289 case VM_SET_CPUID: { 1290 struct vm_vcpu_cpuid_config cfg; 1291 struct vcpu_cpuid_entry *entries = NULL; 1292 size_t entries_size = 0; 1293 1294 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1295 error = EFAULT; 1296 break; 1297 } 1298 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1299 error = EFBIG; 1300 break; 1301 } 1302 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1303 /* 1304 * If we are being instructed to use "legacy" handling, 1305 * then no entries should be provided, since the static 1306 * in-kernel masking will be used. 
1307 */ 1308 if (cfg.vvcc_nent != 0) { 1309 error = EINVAL; 1310 break; 1311 } 1312 } else if (cfg.vvcc_nent != 0) { 1313 entries_size = 1314 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1315 entries = kmem_alloc(entries_size, KM_SLEEP); 1316 1317 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1318 md) != 0) { 1319 error = EFAULT; 1320 kmem_free(entries, entries_size); 1321 break; 1322 } 1323 } 1324 1325 vcpu_cpuid_config_t vm_cfg = { 1326 .vcc_flags = cfg.vvcc_flags, 1327 .vcc_nent = cfg.vvcc_nent, 1328 .vcc_entries = entries, 1329 }; 1330 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1331 1332 if (entries != NULL) { 1333 kmem_free(entries, entries_size); 1334 } 1335 break; 1336 } 1337 case VM_LEGACY_CPUID: { 1338 struct vm_legacy_cpuid vlc; 1339 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1340 error = EFAULT; 1341 break; 1342 } 1343 vlc.vlc_vcpuid = vcpu; 1344 1345 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1346 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1347 1348 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1349 error = EFAULT; 1350 break; 1351 } 1352 break; 1353 } 1354 1355 case VM_SET_KERNEMU_DEV: 1356 case VM_GET_KERNEMU_DEV: { 1357 struct vm_readwrite_kernemu_device kemu; 1358 size_t size = 0; 1359 1360 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1361 error = EFAULT; 1362 break; 1363 } 1364 1365 if (kemu.access_width > 3) { 1366 error = EINVAL; 1367 break; 1368 } 1369 size = (1 << kemu.access_width); 1370 ASSERT(size >= 1 && size <= 8); 1371 1372 if (cmd == VM_SET_KERNEMU_DEV) { 1373 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1374 kemu.gpa, kemu.value, size); 1375 } else { 1376 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1377 kemu.gpa, &kemu.value, size); 1378 } 1379 1380 if (error == 0) { 1381 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1382 error = EFAULT; 1383 break; 1384 } 1385 } 1386 break; 1387 } 1388 1389 case VM_GET_CAPABILITY: { 1390 struct vm_capability vmcap; 1391 1392 if 
(ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1393 error = EFAULT; 1394 break; 1395 } 1396 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1397 &vmcap.capval); 1398 if (error == 0 && 1399 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1400 error = EFAULT; 1401 break; 1402 } 1403 break; 1404 } 1405 case VM_SET_CAPABILITY: { 1406 struct vm_capability vmcap; 1407 1408 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1409 error = EFAULT; 1410 break; 1411 } 1412 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1413 vmcap.capval); 1414 break; 1415 } 1416 case VM_SET_X2APIC_STATE: { 1417 struct vm_x2apic x2apic; 1418 1419 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1420 error = EFAULT; 1421 break; 1422 } 1423 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1424 break; 1425 } 1426 case VM_GET_X2APIC_STATE: { 1427 struct vm_x2apic x2apic; 1428 1429 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1430 error = EFAULT; 1431 break; 1432 } 1433 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1434 &x2apic.state); 1435 if (error == 0 && 1436 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1437 error = EFAULT; 1438 break; 1439 } 1440 break; 1441 } 1442 case VM_GET_GPA_PMAP: { 1443 /* 1444 * Until there is a necessity to leak EPT/RVI PTE values to 1445 * userspace, this will remain unimplemented 1446 */ 1447 error = EINVAL; 1448 break; 1449 } 1450 case VM_GET_HPET_CAPABILITIES: { 1451 struct vm_hpet_cap hpetcap; 1452 1453 error = vhpet_getcap(&hpetcap); 1454 if (error == 0 && 1455 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1456 error = EFAULT; 1457 break; 1458 } 1459 break; 1460 } 1461 case VM_GLA2GPA: { 1462 struct vm_gla2gpa gg; 1463 1464 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1465 error = EFAULT; 1466 break; 1467 } 1468 gg.vcpuid = vcpu; 1469 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1470 gg.prot, &gg.gpa, &gg.fault); 1471 if (error == 0 && ddi_copyout(&gg, datap, sizeof 
(gg), md)) { 1472 error = EFAULT; 1473 break; 1474 } 1475 break; 1476 } 1477 case VM_GLA2GPA_NOFAULT: { 1478 struct vm_gla2gpa gg; 1479 1480 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1481 error = EFAULT; 1482 break; 1483 } 1484 gg.vcpuid = vcpu; 1485 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1486 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1487 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1488 error = EFAULT; 1489 break; 1490 } 1491 break; 1492 } 1493 1494 case VM_ACTIVATE_CPU: 1495 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1496 break; 1497 1498 case VM_SUSPEND_CPU: 1499 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1500 error = EFAULT; 1501 } else { 1502 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1503 } 1504 break; 1505 1506 case VM_RESUME_CPU: 1507 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1508 error = EFAULT; 1509 } else { 1510 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1511 } 1512 break; 1513 1514 case VM_GET_CPUS: { 1515 struct vm_cpuset vm_cpuset; 1516 cpuset_t tempset; 1517 void *srcp = &tempset; 1518 int size; 1519 1520 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1521 error = EFAULT; 1522 break; 1523 } 1524 1525 /* Be more generous about sizing since our cpuset_t is large. */ 1526 size = vm_cpuset.cpusetsize; 1527 if (size <= 0 || size > sizeof (cpuset_t)) { 1528 error = ERANGE; 1529 } 1530 /* 1531 * If they want a ulong_t or less, make sure they receive the 1532 * low bits with all the useful information. 
1533 */ 1534 if (size <= sizeof (tempset.cpub[0])) { 1535 srcp = &tempset.cpub[0]; 1536 } 1537 1538 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1539 tempset = vm_active_cpus(sc->vmm_vm); 1540 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1541 tempset = vm_suspended_cpus(sc->vmm_vm); 1542 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1543 tempset = vm_debug_cpus(sc->vmm_vm); 1544 } else { 1545 error = EINVAL; 1546 } 1547 1548 ASSERT(size > 0 && size <= sizeof (tempset)); 1549 if (error == 0 && 1550 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1551 error = EFAULT; 1552 break; 1553 } 1554 break; 1555 } 1556 case VM_SET_INTINFO: { 1557 struct vm_intinfo vmii; 1558 1559 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1560 error = EFAULT; 1561 break; 1562 } 1563 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1564 break; 1565 } 1566 case VM_GET_INTINFO: { 1567 struct vm_intinfo vmii; 1568 1569 vmii.vcpuid = vcpu; 1570 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1571 &vmii.info2); 1572 if (error == 0 && 1573 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1574 error = EFAULT; 1575 break; 1576 } 1577 break; 1578 } 1579 case VM_RTC_WRITE: { 1580 struct vm_rtc_data rtcdata; 1581 1582 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1583 error = EFAULT; 1584 break; 1585 } 1586 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1587 rtcdata.value); 1588 break; 1589 } 1590 case VM_RTC_READ: { 1591 struct vm_rtc_data rtcdata; 1592 1593 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1594 error = EFAULT; 1595 break; 1596 } 1597 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1598 &rtcdata.value); 1599 if (error == 0 && 1600 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1601 error = EFAULT; 1602 break; 1603 } 1604 break; 1605 } 1606 case VM_RTC_SETTIME: { 1607 struct vm_rtc_time rtctime; 1608 1609 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1610 error = EFAULT; 1611 break; 1612 } 1613 error = 
vrtc_set_time(sc->vmm_vm, rtctime.secs); 1614 break; 1615 } 1616 case VM_RTC_GETTIME: { 1617 struct vm_rtc_time rtctime; 1618 1619 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1620 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1621 error = EFAULT; 1622 break; 1623 } 1624 break; 1625 } 1626 1627 case VM_PMTMR_LOCATE: { 1628 uint16_t port = arg; 1629 error = vpmtmr_set_location(sc->vmm_vm, port); 1630 break; 1631 } 1632 1633 case VM_RESTART_INSTRUCTION: 1634 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1635 break; 1636 1637 case VM_SET_TOPOLOGY: { 1638 struct vm_cpu_topology topo; 1639 1640 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1641 error = EFAULT; 1642 break; 1643 } 1644 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1645 topo.threads, topo.maxcpus); 1646 break; 1647 } 1648 case VM_GET_TOPOLOGY: { 1649 struct vm_cpu_topology topo; 1650 1651 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1652 &topo.threads, &topo.maxcpus); 1653 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1654 error = EFAULT; 1655 break; 1656 } 1657 break; 1658 } 1659 case VM_DEVMEM_GETOFFSET: { 1660 struct vm_devmem_offset vdo; 1661 vmm_devmem_entry_t *de; 1662 1663 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1664 error = EFAULT; 1665 break; 1666 } 1667 1668 de = vmmdev_devmem_find(sc, vdo.segid); 1669 if (de != NULL) { 1670 vdo.offset = de->vde_off; 1671 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1672 error = EFAULT; 1673 } 1674 } else { 1675 error = ENOENT; 1676 } 1677 break; 1678 } 1679 case VM_TRACK_DIRTY_PAGES: { 1680 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1681 struct vmm_dirty_tracker tracker; 1682 uint8_t *bitmap; 1683 size_t len; 1684 1685 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1686 error = EFAULT; 1687 break; 1688 } 1689 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1690 error = EINVAL; 1691 break; 1692 } 1693 if (tracker.vdt_len == 0) { 1694 break; 
1695 } 1696 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1697 error = EINVAL; 1698 break; 1699 } 1700 if (tracker.vdt_len > max_track_region_len) { 1701 error = EINVAL; 1702 break; 1703 } 1704 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1705 bitmap = kmem_zalloc(len, KM_SLEEP); 1706 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1707 tracker.vdt_len, bitmap); 1708 if (error == 0 && 1709 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1710 error = EFAULT; 1711 } 1712 kmem_free(bitmap, len); 1713 1714 break; 1715 } 1716 case VM_WRLOCK_CYCLE: { 1717 /* 1718 * Present a test mechanism to acquire/release the write lock 1719 * on the VM without any other effects. 1720 */ 1721 break; 1722 } 1723 case VM_DATA_READ: { 1724 struct vm_data_xfer vdx; 1725 1726 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1727 error = EFAULT; 1728 break; 1729 } 1730 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1731 error = EINVAL; 1732 break; 1733 } 1734 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1735 error = EFBIG; 1736 break; 1737 } 1738 1739 const size_t len = vdx.vdx_len; 1740 void *buf = NULL; 1741 if (len != 0) { 1742 buf = kmem_alloc(len, KM_SLEEP); 1743 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1744 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1745 kmem_free(buf, len); 1746 error = EFAULT; 1747 break; 1748 } else { 1749 bzero(buf, len); 1750 } 1751 } 1752 1753 vdx.vdx_result_len = 0; 1754 vmm_data_req_t req = { 1755 .vdr_class = vdx.vdx_class, 1756 .vdr_version = vdx.vdx_version, 1757 .vdr_flags = vdx.vdx_flags, 1758 .vdr_len = len, 1759 .vdr_data = buf, 1760 .vdr_result_len = &vdx.vdx_result_len, 1761 }; 1762 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1763 1764 if (error == 0 && buf != NULL) { 1765 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1766 error = EFAULT; 1767 } 1768 } 1769 1770 /* 1771 * Copy out the transfer request so that the value of 1772 * vdx_result_len can be made available, regardless of any 1773 * 
error(s) which may have occurred. 1774 */ 1775 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1776 error = (error != 0) ? error : EFAULT; 1777 } 1778 1779 if (buf != NULL) { 1780 kmem_free(buf, len); 1781 } 1782 break; 1783 } 1784 case VM_DATA_WRITE: { 1785 struct vm_data_xfer vdx; 1786 1787 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1788 error = EFAULT; 1789 break; 1790 } 1791 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1792 error = EINVAL; 1793 break; 1794 } 1795 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1796 error = EFBIG; 1797 break; 1798 } 1799 1800 const size_t len = vdx.vdx_len; 1801 void *buf = NULL; 1802 if (len != 0) { 1803 buf = kmem_alloc(len, KM_SLEEP); 1804 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1805 kmem_free(buf, len); 1806 error = EFAULT; 1807 break; 1808 } 1809 } 1810 1811 vdx.vdx_result_len = 0; 1812 vmm_data_req_t req = { 1813 .vdr_class = vdx.vdx_class, 1814 .vdr_version = vdx.vdx_version, 1815 .vdr_flags = vdx.vdx_flags, 1816 .vdr_len = len, 1817 .vdr_data = buf, 1818 .vdr_result_len = &vdx.vdx_result_len, 1819 }; 1820 if (vmm_allow_state_writes == 0) { 1821 /* XXX: Play it safe for now */ 1822 error = EPERM; 1823 } else { 1824 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1825 &req); 1826 } 1827 1828 if (error == 0 && buf != NULL && 1829 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1830 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1831 error = EFAULT; 1832 } 1833 } 1834 1835 /* 1836 * Copy out the transfer request so that the value of 1837 * vdx_result_len can be made available, regardless of any 1838 * error(s) which may have occurred. 1839 */ 1840 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1841 error = (error != 0) ? 
error : EFAULT; 1842 } 1843 1844 if (buf != NULL) { 1845 kmem_free(buf, len); 1846 } 1847 break; 1848 } 1849 1850 case VM_PAUSE: { 1851 error = vm_pause_instance(sc->vmm_vm); 1852 break; 1853 } 1854 case VM_RESUME: { 1855 error = vm_resume_instance(sc->vmm_vm); 1856 break; 1857 } 1858 1859 default: 1860 error = ENOTTY; 1861 break; 1862 } 1863 1864 /* Release exclusion resources */ 1865 switch (lock_type) { 1866 case LOCK_NONE: 1867 break; 1868 case LOCK_VCPU: 1869 vcpu_unlock_one(sc, vcpu); 1870 break; 1871 case LOCK_READ_HOLD: 1872 vmm_read_unlock(sc); 1873 break; 1874 case LOCK_WRITE_HOLD: 1875 vmm_write_unlock(sc); 1876 break; 1877 default: 1878 panic("unexpected lock type"); 1879 break; 1880 } 1881 1882 return (error); 1883 } 1884 1885 static vmm_softc_t * 1886 vmm_lookup(const char *name) 1887 { 1888 list_t *vml = &vmm_list; 1889 vmm_softc_t *sc; 1890 1891 ASSERT(MUTEX_HELD(&vmm_mtx)); 1892 1893 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1894 if (strcmp(sc->vmm_name, name) == 0) { 1895 break; 1896 } 1897 } 1898 1899 return (sc); 1900 } 1901 1902 /* 1903 * Acquire an HMA registration if not already held. 1904 */ 1905 static boolean_t 1906 vmm_hma_acquire(void) 1907 { 1908 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1909 1910 mutex_enter(&vmmdev_mtx); 1911 1912 if (vmmdev_hma_reg == NULL) { 1913 VERIFY3U(vmmdev_hma_ref, ==, 0); 1914 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1915 if (vmmdev_hma_reg == NULL) { 1916 cmn_err(CE_WARN, "%s HMA registration failed.", 1917 vmmdev_hvm_name); 1918 mutex_exit(&vmmdev_mtx); 1919 return (B_FALSE); 1920 } 1921 } 1922 1923 vmmdev_hma_ref++; 1924 1925 mutex_exit(&vmmdev_mtx); 1926 1927 return (B_TRUE); 1928 } 1929 1930 /* 1931 * Release the HMA registration if held and there are no remaining VMs. 
1932 */ 1933 static void 1934 vmm_hma_release(void) 1935 { 1936 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1937 1938 mutex_enter(&vmmdev_mtx); 1939 1940 VERIFY3U(vmmdev_hma_ref, !=, 0); 1941 1942 vmmdev_hma_ref--; 1943 1944 if (vmmdev_hma_ref == 0) { 1945 VERIFY(vmmdev_hma_reg != NULL); 1946 hma_unregister(vmmdev_hma_reg); 1947 vmmdev_hma_reg = NULL; 1948 } 1949 mutex_exit(&vmmdev_mtx); 1950 } 1951 1952 static int 1953 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1954 { 1955 vmm_softc_t *sc = NULL; 1956 minor_t minor; 1957 int error = ENOMEM; 1958 size_t len; 1959 const char *name = req->name; 1960 1961 len = strnlen(name, VM_MAX_NAMELEN); 1962 if (len == 0) { 1963 return (EINVAL); 1964 } 1965 if (len >= VM_MAX_NAMELEN) { 1966 return (ENAMETOOLONG); 1967 } 1968 if (strchr(name, '/') != NULL) { 1969 return (EINVAL); 1970 } 1971 1972 if (!vmm_hma_acquire()) 1973 return (ENXIO); 1974 1975 mutex_enter(&vmm_mtx); 1976 1977 /* Look for duplicate names */ 1978 if (vmm_lookup(name) != NULL) { 1979 mutex_exit(&vmm_mtx); 1980 vmm_hma_release(); 1981 return (EEXIST); 1982 } 1983 1984 /* Allow only one instance per non-global zone. 
*/ 1985 if (!INGLOBALZONE(curproc)) { 1986 for (sc = list_head(&vmm_list); sc != NULL; 1987 sc = list_next(&vmm_list, sc)) { 1988 if (sc->vmm_zone == curzone) { 1989 mutex_exit(&vmm_mtx); 1990 vmm_hma_release(); 1991 return (EINVAL); 1992 } 1993 } 1994 } 1995 1996 minor = id_alloc(vmm_minors); 1997 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1998 goto fail; 1999 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2000 ddi_soft_state_free(vmm_statep, minor); 2001 goto fail; 2002 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 2003 DDI_PSEUDO, 0) != DDI_SUCCESS) { 2004 goto fail; 2005 } 2006 2007 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 2008 goto fail; 2009 } 2010 2011 error = vm_create(req->flags, &sc->vmm_vm); 2012 if (error == 0) { 2013 /* Complete VM intialization and report success. */ 2014 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 2015 sc->vmm_minor = minor; 2016 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 2017 offsetof(vmm_devmem_entry_t, vde_node)); 2018 2019 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 2020 offsetof(vmm_hold_t, vmh_node)); 2021 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 2022 2023 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 2024 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 2025 offsetof(vmm_lease_t, vml_node)); 2026 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 2027 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 2028 2029 sc->vmm_zone = crgetzone(cr); 2030 zone_hold(sc->vmm_zone); 2031 vmm_zsd_add_vm(sc); 2032 vmm_kstat_init(sc); 2033 2034 list_insert_tail(&vmm_list, sc); 2035 mutex_exit(&vmm_mtx); 2036 return (0); 2037 } 2038 2039 vmm_kstat_fini(sc); 2040 ddi_remove_minor_node(vmmdev_dip, name); 2041 fail: 2042 id_free(vmm_minors, minor); 2043 if (sc != NULL) { 2044 ddi_soft_state_free(vmm_statep, minor); 2045 } 2046 mutex_exit(&vmm_mtx); 2047 vmm_hma_release(); 2048 2049 return (error); 2050 } 2051 2052 /* 
2053 * Bhyve 'Driver' Interface 2054 * 2055 * While many devices are emulated in the bhyve userspace process, there are 2056 * others with performance constraints which require that they run mostly or 2057 * entirely in-kernel. For those not integrated directly into bhyve, an API is 2058 * needed so they can query/manipulate the portions of VM state needed to 2059 * fulfill their purpose. 2060 * 2061 * This includes: 2062 * - Translating guest-physical addresses to host-virtual pointers 2063 * - Injecting MSIs 2064 * - Hooking IO port addresses 2065 * 2066 * The vmm_drv interface exists to provide that functionality to its consumers. 2067 * (At this time, 'viona' is the only user) 2068 */ 2069 int 2070 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2071 { 2072 vnode_t *vp = fp->f_vnode; 2073 const dev_t dev = vp->v_rdev; 2074 vmm_softc_t *sc; 2075 vmm_hold_t *hold; 2076 int err = 0; 2077 2078 if (vp->v_type != VCHR) { 2079 return (ENXIO); 2080 } 2081 const major_t major = getmajor(dev); 2082 const minor_t minor = getminor(dev); 2083 2084 mutex_enter(&vmmdev_mtx); 2085 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2086 mutex_exit(&vmmdev_mtx); 2087 return (ENOENT); 2088 } 2089 mutex_enter(&vmm_mtx); 2090 mutex_exit(&vmmdev_mtx); 2091 2092 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2093 err = ENOENT; 2094 goto out; 2095 } 2096 /* XXXJOY: check cred permissions against instance */ 2097 2098 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2099 err = EBUSY; 2100 goto out; 2101 } 2102 2103 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2104 hold->vmh_sc = sc; 2105 hold->vmh_release_req = B_FALSE; 2106 2107 list_insert_tail(&sc->vmm_holds, hold); 2108 sc->vmm_flags |= VMM_HELD; 2109 *holdp = hold; 2110 2111 out: 2112 mutex_exit(&vmm_mtx); 2113 return (err); 2114 } 2115 2116 void 2117 vmm_drv_rele(vmm_hold_t *hold) 2118 { 2119 vmm_softc_t *sc; 2120 bool hma_release = false; 2121 2122 ASSERT(hold != NULL); 2123 ASSERT(hold->vmh_sc != 
NULL); 2124 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2125 2126 mutex_enter(&vmm_mtx); 2127 sc = hold->vmh_sc; 2128 list_remove(&sc->vmm_holds, hold); 2129 kmem_free(hold, sizeof (*hold)); 2130 2131 if (list_is_empty(&sc->vmm_holds)) { 2132 sc->vmm_flags &= ~VMM_HELD; 2133 2134 /* 2135 * Since outstanding holds would prevent instance destruction 2136 * from completing, attempt to finish it now if it was already 2137 * set in motion. 2138 */ 2139 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2140 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2141 &hma_release)); 2142 } 2143 } 2144 mutex_exit(&vmm_mtx); 2145 2146 if (hma_release) { 2147 vmm_hma_release(); 2148 } 2149 } 2150 2151 boolean_t 2152 vmm_drv_release_reqd(vmm_hold_t *hold) 2153 { 2154 ASSERT(hold != NULL); 2155 2156 return (hold->vmh_release_req); 2157 } 2158 2159 vmm_lease_t * 2160 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2161 { 2162 vmm_softc_t *sc = hold->vmh_sc; 2163 vmm_lease_t *lease; 2164 2165 ASSERT3P(expiref, !=, NULL); 2166 2167 if (hold->vmh_release_req) { 2168 return (NULL); 2169 } 2170 2171 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2172 list_link_init(&lease->vml_node); 2173 lease->vml_expire_func = expiref; 2174 lease->vml_expire_arg = arg; 2175 lease->vml_expired = B_FALSE; 2176 lease->vml_break_deferred = B_FALSE; 2177 lease->vml_hold = hold; 2178 /* cache the VM pointer for one less pointer chase */ 2179 lease->vml_vm = sc->vmm_vm; 2180 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2181 2182 mutex_enter(&sc->vmm_lease_lock); 2183 while (sc->vmm_lease_blocker != 0) { 2184 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2185 } 2186 list_insert_tail(&sc->vmm_lease_list, lease); 2187 vmm_read_lock(sc); 2188 mutex_exit(&sc->vmm_lease_lock); 2189 2190 return (lease); 2191 } 2192 2193 static void 2194 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2195 { 2196 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2197 2198 
/*
 * Block the signing of new leases on the instance and wait until its lease
 * list has been emptied.
 *
 * The blocker count (vmm_lease_blocker) is incremented under vmm_lease_lock.
 * The caller which takes it from 0 to 1 does the clean-up work: it marks each
 * lease expired and invokes its expire callback, breaks those leases for which
 * the callback returned true, and then waits for the remaining ones to be
 * flagged vml_break_deferred so it can break them as well.  Any other caller
 * simply waits for the list to drain.
 *
 * The count stays raised on return; vmm_lease_unblock() lowers it again.
 */
static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	/* Guard against the increment below wrapping the counter. */
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			/*
			 * Snapshot the callback and its argument while the
			 * lock is still held.
			 */
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				/* Fetch the successor before the lease is freed */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				vmm_lease_t *next = list_next(list, lease);
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			/*
			 * Sleep until vmm_lease_cv is signalled, then rescan
			 * for leases which have since been flagged
			 * vml_break_deferred.
			 */
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}
2317 */ 2318 lease->vml_break_deferred = B_TRUE; 2319 cv_broadcast(&sc->vmm_lease_cv); 2320 } 2321 mutex_exit(&sc->vmm_lease_lock); 2322 } 2323 2324 boolean_t 2325 vmm_drv_lease_expired(vmm_lease_t *lease) 2326 { 2327 return (lease->vml_expired); 2328 } 2329 2330 vmm_page_t * 2331 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2332 { 2333 ASSERT(lease != NULL); 2334 ASSERT0(gpa & PAGEOFFSET); 2335 2336 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2337 } 2338 2339 2340 /* Ensure that flags mirrored by vmm_drv interface properly match up */ 2341 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY); 2342 2343 vmm_page_t * 2344 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags) 2345 { 2346 ASSERT(lease != NULL); 2347 ASSERT0(gpa & PAGEOFFSET); 2348 2349 vmm_page_t *page = 2350 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags); 2351 return (page); 2352 } 2353 2354 void 2355 vmm_drv_page_release(vmm_page_t *vmmp) 2356 { 2357 (void) vmp_release((vm_page_t *)vmmp); 2358 } 2359 2360 void 2361 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2362 { 2363 (void) vmp_release_chain((vm_page_t *)vmmp); 2364 } 2365 2366 const void * 2367 vmm_drv_page_readable(const vmm_page_t *vmmp) 2368 { 2369 return (vmp_get_readable((const vm_page_t *)vmmp)); 2370 } 2371 2372 void * 2373 vmm_drv_page_writable(const vmm_page_t *vmmp) 2374 { 2375 return (vmp_get_writable((const vm_page_t *)vmmp)); 2376 } 2377 2378 void 2379 vmm_drv_page_mark_dirty(vmm_page_t *vmmp) 2380 { 2381 return (vmp_mark_dirty((vm_page_t *)vmmp)); 2382 } 2383 2384 void 2385 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2386 { 2387 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2388 } 2389 2390 vmm_page_t * 2391 vmm_drv_page_next(const vmm_page_t *vmmp) 2392 { 2393 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2394 } 2395 2396 int 2397 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2398 { 2399 ASSERT(lease != NULL); 
2400 2401 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2402 } 2403 2404 int 2405 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2406 void *arg, void **cookie) 2407 { 2408 vmm_softc_t *sc; 2409 int err; 2410 2411 ASSERT(hold != NULL); 2412 ASSERT(cookie != NULL); 2413 2414 sc = hold->vmh_sc; 2415 mutex_enter(&vmm_mtx); 2416 /* Confirm that hook installation is not blocked */ 2417 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2418 mutex_exit(&vmm_mtx); 2419 return (EBUSY); 2420 } 2421 /* 2422 * Optimistically record an installed hook which will prevent a block 2423 * from being asserted while the mutex is dropped. 2424 */ 2425 hold->vmh_ioport_hook_cnt++; 2426 mutex_exit(&vmm_mtx); 2427 2428 vmm_write_lock(sc); 2429 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2430 arg, cookie); 2431 vmm_write_unlock(sc); 2432 2433 if (err != 0) { 2434 mutex_enter(&vmm_mtx); 2435 /* Walk back optimism about the hook installation */ 2436 hold->vmh_ioport_hook_cnt--; 2437 mutex_exit(&vmm_mtx); 2438 } 2439 return (err); 2440 } 2441 2442 void 2443 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2444 { 2445 vmm_softc_t *sc; 2446 2447 ASSERT(hold != NULL); 2448 ASSERT(cookie != NULL); 2449 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2450 2451 sc = hold->vmh_sc; 2452 vmm_write_lock(sc); 2453 vm_ioport_unhook(sc->vmm_vm, cookie); 2454 vmm_write_unlock(sc); 2455 2456 mutex_enter(&vmm_mtx); 2457 hold->vmh_ioport_hook_cnt--; 2458 mutex_exit(&vmm_mtx); 2459 } 2460 2461 static void 2462 vmm_drv_purge(vmm_softc_t *sc) 2463 { 2464 ASSERT(MUTEX_HELD(&vmm_mtx)); 2465 2466 if ((sc->vmm_flags & VMM_HELD) != 0) { 2467 vmm_hold_t *hold; 2468 2469 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2470 hold = list_next(&sc->vmm_holds, hold)) { 2471 hold->vmh_release_req = B_TRUE; 2472 } 2473 2474 /* 2475 * Require that all leases on the instance be broken, now that 2476 * all associated holds have been marked as needing release. 
2477 * 2478 * Dropping vmm_mtx is not strictly necessary, but if any of the 2479 * lessees are slow to respond, it would be nice to leave it 2480 * available for other parties. 2481 */ 2482 mutex_exit(&vmm_mtx); 2483 vmm_lease_block(sc); 2484 vmm_lease_unblock(sc); 2485 mutex_enter(&vmm_mtx); 2486 } 2487 } 2488 2489 static int 2490 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2491 { 2492 int err = 0; 2493 2494 mutex_enter(&vmm_mtx); 2495 if (!enable_block) { 2496 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2497 2498 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2499 goto done; 2500 } 2501 2502 /* If any holds have hooks installed, the block is a failure */ 2503 if (!list_is_empty(&sc->vmm_holds)) { 2504 vmm_hold_t *hold; 2505 2506 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2507 hold = list_next(&sc->vmm_holds, hold)) { 2508 if (hold->vmh_ioport_hook_cnt != 0) { 2509 err = EBUSY; 2510 goto done; 2511 } 2512 } 2513 } 2514 sc->vmm_flags |= VMM_BLOCK_HOOK; 2515 2516 done: 2517 mutex_exit(&vmm_mtx); 2518 return (err); 2519 } 2520 2521 2522 static void 2523 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts) 2524 { 2525 ASSERT(MUTEX_HELD(&vmm_mtx)); 2526 ASSERT0(sc->vmm_flags & VMM_DESTROY); 2527 2528 sc->vmm_flags |= VMM_DESTROY; 2529 2530 /* 2531 * Lock and unlock all of the vCPUs to ensure that they are kicked out 2532 * of guest context, being unable to return now that the instance is 2533 * marked for destruction. 2534 */ 2535 const int maxcpus = vm_get_maxcpus(sc->vmm_vm); 2536 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 2537 vcpu_lock_one(sc, vcpu); 2538 vcpu_unlock_one(sc, vcpu); 2539 } 2540 2541 vmmdev_devmem_purge(sc); 2542 if ((opts & VDO_NO_CLEAN_ZSD) == 0) { 2543 /* 2544 * The ZSD should be cleaned up now, unless destruction of the 2545 * instance was initated by destruction of the containing zone, 2546 * in which case the ZSD has already been removed. 
2547 */ 2548 vmm_zsd_rem_vm(sc); 2549 } 2550 zone_rele(sc->vmm_zone); 2551 2552 vmm_drv_purge(sc); 2553 } 2554 2555 static bool 2556 vmm_destroy_ready(vmm_softc_t *sc) 2557 { 2558 ASSERT(MUTEX_HELD(&vmm_mtx)); 2559 2560 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) { 2561 VERIFY(list_is_empty(&sc->vmm_holds)); 2562 return (true); 2563 } 2564 2565 return (false); 2566 } 2567 2568 static void 2569 vmm_destroy_finish(vmm_softc_t *sc) 2570 { 2571 ASSERT(MUTEX_HELD(&vmm_mtx)); 2572 ASSERT(vmm_destroy_ready(sc)); 2573 2574 list_remove(&vmm_list, sc); 2575 vmm_kstat_fini(sc); 2576 vm_destroy(sc->vmm_vm); 2577 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2578 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE); 2579 2580 const minor_t minor = sc->vmm_minor; 2581 ddi_soft_state_free(vmm_statep, minor); 2582 id_free(vmm_minors, minor); 2583 } 2584 2585 /* 2586 * Initiate or attempt to finish destruction of a VMM instance. 2587 * 2588 * This is called from several contexts: 2589 * - An explicit destroy ioctl is made 2590 * - A vmm_drv consumer releases its hold (being the last on the instance) 2591 * - The vmm device is closed, and auto-destruct is enabled 2592 */ 2593 static int 2594 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts, 2595 bool *hma_release) 2596 { 2597 ASSERT(MUTEX_HELD(&vmm_mtx)); 2598 2599 *hma_release = false; 2600 2601 /* 2602 * When instance destruction begins, it is so marked such that any 2603 * further requests to operate the instance will fail. 2604 */ 2605 if ((sc->vmm_flags & VMM_DESTROY) == 0) { 2606 vmm_destroy_begin(sc, opts); 2607 } 2608 2609 if (vmm_destroy_ready(sc)) { 2610 2611 /* 2612 * Notify anyone waiting for the destruction to finish. They 2613 * must be clear before we can safely tear down the softc. 
2614 */ 2615 if (sc->vmm_destroy_waiters != 0) { 2616 cv_broadcast(&sc->vmm_cv); 2617 while (sc->vmm_destroy_waiters != 0) { 2618 cv_wait(&sc->vmm_cv, &vmm_mtx); 2619 } 2620 } 2621 2622 /* 2623 * Finish destruction of instance. After this point, the softc 2624 * is freed and cannot be accessed again. 2625 * 2626 * With destruction complete, the HMA hold can be released 2627 */ 2628 vmm_destroy_finish(sc); 2629 *hma_release = true; 2630 return (0); 2631 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) { 2632 int err = 0; 2633 2634 sc->vmm_destroy_waiters++; 2635 while (!vmm_destroy_ready(sc) && err == 0) { 2636 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2637 err = EINTR; 2638 } 2639 } 2640 sc->vmm_destroy_waiters--; 2641 2642 if (sc->vmm_destroy_waiters == 0) { 2643 /* 2644 * If we were the last waiter, it could be that VM 2645 * destruction is waiting on _us_ to proceed with the 2646 * final clean-up. 2647 */ 2648 cv_signal(&sc->vmm_cv); 2649 } 2650 return (err); 2651 } else { 2652 /* 2653 * Since the instance is not ready for destruction, and the 2654 * caller did not ask to wait, consider it a success for now. 
2655 */ 2656 return (0); 2657 } 2658 } 2659 2660 void 2661 vmm_zone_vm_destroy(vmm_softc_t *sc) 2662 { 2663 bool hma_release = false; 2664 int err; 2665 2666 mutex_enter(&vmm_mtx); 2667 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2668 mutex_exit(&vmm_mtx); 2669 2670 VERIFY0(err); 2671 2672 if (hma_release) { 2673 vmm_hma_release(); 2674 } 2675 } 2676 2677 static int 2678 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2679 { 2680 vmm_softc_t *sc; 2681 bool hma_release = false; 2682 int err; 2683 2684 if (crgetuid(cr) != 0) { 2685 return (EPERM); 2686 } 2687 2688 mutex_enter(&vmm_mtx); 2689 sc = vmm_lookup(req->name); 2690 if (sc == NULL) { 2691 mutex_exit(&vmm_mtx); 2692 return (ENOENT); 2693 } 2694 /* 2695 * We don't check this in vmm_lookup() since that function is also used 2696 * for validation during create and currently vmm names must be unique. 2697 */ 2698 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2699 mutex_exit(&vmm_mtx); 2700 return (EPERM); 2701 } 2702 2703 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2704 mutex_exit(&vmm_mtx); 2705 2706 if (hma_release) { 2707 vmm_hma_release(); 2708 } 2709 2710 return (err); 2711 } 2712 2713 #define VCPU_NAME_BUFLEN 32 2714 2715 static int 2716 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2717 { 2718 zoneid_t zid = crgetzoneid(cr); 2719 int instance = minor; 2720 kstat_t *ksp; 2721 2722 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2723 2724 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2725 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2726 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2727 2728 if (ksp == NULL) { 2729 return (-1); 2730 } 2731 sc->vmm_kstat_vm = ksp; 2732 2733 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2734 char namebuf[VCPU_NAME_BUFLEN]; 2735 2736 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2737 2738 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2739 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, 
namebuf, 2740 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2741 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2742 0, zid); 2743 if (ksp == NULL) { 2744 goto fail; 2745 } 2746 2747 sc->vmm_kstat_vcpu[i] = ksp; 2748 } 2749 2750 /* 2751 * If this instance is associated with a non-global zone, make its 2752 * kstats visible from the GZ. 2753 */ 2754 if (zid != GLOBAL_ZONEID) { 2755 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2756 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2757 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2758 } 2759 } 2760 2761 return (0); 2762 2763 fail: 2764 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2765 if (sc->vmm_kstat_vcpu[i] != NULL) { 2766 kstat_delete(sc->vmm_kstat_vcpu[i]); 2767 sc->vmm_kstat_vcpu[i] = NULL; 2768 } else { 2769 break; 2770 } 2771 } 2772 kstat_delete(sc->vmm_kstat_vm); 2773 sc->vmm_kstat_vm = NULL; 2774 return (-1); 2775 } 2776 2777 static void 2778 vmm_kstat_init(vmm_softc_t *sc) 2779 { 2780 kstat_t *ksp; 2781 2782 ASSERT3P(sc->vmm_vm, !=, NULL); 2783 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2784 2785 ksp = sc->vmm_kstat_vm; 2786 vmm_kstats_t *vk = ksp->ks_data; 2787 ksp->ks_private = sc->vmm_vm; 2788 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2789 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2790 2791 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2792 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2793 2794 ksp = sc->vmm_kstat_vcpu[i]; 2795 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2796 2797 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2798 vvk->vvk_vcpu.value.ui32 = i; 2799 kstat_named_init(&vvk->vvk_time_init, "time_init", 2800 KSTAT_DATA_UINT64); 2801 kstat_named_init(&vvk->vvk_time_run, "time_run", 2802 KSTAT_DATA_UINT64); 2803 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2804 KSTAT_DATA_UINT64); 2805 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2806 KSTAT_DATA_UINT64); 2807 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2808 KSTAT_DATA_UINT64); 2809 
kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2810 KSTAT_DATA_UINT64); 2811 ksp->ks_private = sc->vmm_vm; 2812 ksp->ks_update = vmm_kstat_update_vcpu; 2813 } 2814 2815 kstat_install(sc->vmm_kstat_vm); 2816 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2817 kstat_install(sc->vmm_kstat_vcpu[i]); 2818 } 2819 } 2820 2821 static void 2822 vmm_kstat_fini(vmm_softc_t *sc) 2823 { 2824 ASSERT(sc->vmm_kstat_vm != NULL); 2825 2826 kstat_delete(sc->vmm_kstat_vm); 2827 sc->vmm_kstat_vm = NULL; 2828 2829 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2830 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2831 2832 kstat_delete(sc->vmm_kstat_vcpu[i]); 2833 sc->vmm_kstat_vcpu[i] = NULL; 2834 } 2835 } 2836 2837 static int 2838 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2839 { 2840 minor_t minor; 2841 vmm_softc_t *sc; 2842 2843 /* 2844 * Forbid running bhyve in a 32-bit process until it has been tested and 2845 * verified to be safe. 2846 */ 2847 if (curproc->p_model != DATAMODEL_LP64) { 2848 return (EFBIG); 2849 } 2850 2851 minor = getminor(*devp); 2852 if (minor == VMM_CTL_MINOR) { 2853 /* 2854 * Master control device must be opened exclusively. 
2855 */ 2856 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2857 return (EINVAL); 2858 } 2859 2860 return (0); 2861 } 2862 2863 mutex_enter(&vmm_mtx); 2864 sc = ddi_get_soft_state(vmm_statep, minor); 2865 if (sc == NULL) { 2866 mutex_exit(&vmm_mtx); 2867 return (ENXIO); 2868 } 2869 2870 sc->vmm_flags |= VMM_IS_OPEN; 2871 mutex_exit(&vmm_mtx); 2872 2873 return (0); 2874 } 2875 2876 static int 2877 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2878 { 2879 const minor_t minor = getminor(dev); 2880 vmm_softc_t *sc; 2881 bool hma_release = false; 2882 2883 if (minor == VMM_CTL_MINOR) { 2884 return (0); 2885 } 2886 2887 mutex_enter(&vmm_mtx); 2888 sc = ddi_get_soft_state(vmm_statep, minor); 2889 if (sc == NULL) { 2890 mutex_exit(&vmm_mtx); 2891 return (ENXIO); 2892 } 2893 2894 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0); 2895 sc->vmm_flags &= ~VMM_IS_OPEN; 2896 2897 /* 2898 * If instance was marked for auto-destruction begin that now. Instance 2899 * destruction may have been initated already, so try to make progress 2900 * in that case, since closure of the device is one of its requirements. 2901 */ 2902 if ((sc->vmm_flags & VMM_DESTROY) != 0 || 2903 (sc->vmm_flags & VMM_AUTODESTROY) != 0) { 2904 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 2905 } 2906 mutex_exit(&vmm_mtx); 2907 2908 if (hma_release) { 2909 vmm_hma_release(); 2910 } 2911 2912 return (0); 2913 } 2914 2915 static int 2916 vmm_is_supported(intptr_t arg) 2917 { 2918 int r; 2919 const char *msg; 2920 2921 if (vmm_is_intel()) { 2922 r = vmx_x86_supported(&msg); 2923 } else if (vmm_is_svm()) { 2924 /* 2925 * HMA already ensured that the features necessary for SVM 2926 * operation were present and online during vmm_attach(). 
2927 */ 2928 r = 0; 2929 } else { 2930 r = ENXIO; 2931 msg = "Unsupported CPU vendor"; 2932 } 2933 2934 if (r != 0 && arg != (intptr_t)NULL) { 2935 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2936 return (EFAULT); 2937 } 2938 return (r); 2939 } 2940 2941 static int 2942 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2943 { 2944 void *argp = (void *)arg; 2945 2946 switch (cmd) { 2947 case VMM_CREATE_VM: { 2948 struct vm_create_req req; 2949 2950 if ((md & FWRITE) == 0) { 2951 return (EPERM); 2952 } 2953 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2954 return (EFAULT); 2955 } 2956 return (vmmdev_do_vm_create(&req, cr)); 2957 } 2958 case VMM_DESTROY_VM: { 2959 struct vm_destroy_req req; 2960 2961 if ((md & FWRITE) == 0) { 2962 return (EPERM); 2963 } 2964 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2965 return (EFAULT); 2966 } 2967 return (vmmdev_do_vm_destroy(&req, cr)); 2968 } 2969 case VMM_VM_SUPPORTED: 2970 return (vmm_is_supported(arg)); 2971 case VMM_CHECK_IOMMU: 2972 if (!vmm_check_iommu()) { 2973 return (ENXIO); 2974 } 2975 return (0); 2976 case VMM_RESV_QUERY: 2977 case VMM_RESV_SET_TARGET: 2978 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2979 default: 2980 break; 2981 } 2982 /* No other actions are legal on ctl device */ 2983 return (ENOTTY); 2984 } 2985 2986 static int 2987 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 2988 int *rvalp) 2989 { 2990 vmm_softc_t *sc; 2991 minor_t minor; 2992 2993 /* 2994 * Forbid running bhyve in a 32-bit process until it has been tested and 2995 * verified to be safe. 2996 */ 2997 if (curproc->p_model != DATAMODEL_LP64) { 2998 return (EFBIG); 2999 } 3000 3001 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 3002 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 3003 return (ENOTSUP); 3004 } 3005 3006 /* 3007 * Regardless of minor (vmmctl or instance), we respond to queries of 3008 * the interface version. 
3009 */ 3010 if (cmd == VMM_INTERFACE_VERSION) { 3011 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 3012 return (0); 3013 } 3014 3015 minor = getminor(dev); 3016 3017 if (minor == VMM_CTL_MINOR) { 3018 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 3019 } 3020 3021 sc = ddi_get_soft_state(vmm_statep, minor); 3022 ASSERT(sc != NULL); 3023 3024 /* 3025 * Turn away any ioctls against an instance when it is being destroyed. 3026 * (Except for the ioctl inquiring about that destroy-in-progress.) 3027 */ 3028 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3029 if (cmd == VM_DESTROY_PENDING) { 3030 *rvalp = 1; 3031 return (0); 3032 } 3033 return (ENXIO); 3034 } 3035 3036 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3037 } 3038 3039 static int 3040 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3041 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3042 { 3043 vmm_softc_t *sc; 3044 const minor_t minor = getminor(dev); 3045 int err; 3046 3047 if (minor == VMM_CTL_MINOR) { 3048 return (ENODEV); 3049 } 3050 if (off < 0 || (off + len) <= 0) { 3051 return (EINVAL); 3052 } 3053 if ((prot & PROT_USER) == 0) { 3054 return (EACCES); 3055 } 3056 3057 sc = ddi_get_soft_state(vmm_statep, minor); 3058 ASSERT(sc); 3059 3060 if (sc->vmm_flags & VMM_DESTROY) 3061 return (ENXIO); 3062 3063 /* Grab read lock on the VM to prevent any changes to the memory map */ 3064 vmm_read_lock(sc); 3065 3066 if (off >= VM_DEVMEM_START) { 3067 int segid; 3068 off_t segoff; 3069 3070 /* Mapping a devmem "device" */ 3071 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3072 err = ENODEV; 3073 } else { 3074 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3075 addrp, prot, maxprot, flags); 3076 } 3077 } else { 3078 /* Mapping a part of the guest physical space */ 3079 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3080 maxprot, flags); 3081 } 3082 3083 vmm_read_unlock(sc); 3084 return (err); 3085 } 3086 3087 
static sdev_plugin_validate_t 3088 vmm_sdev_validate(sdev_ctx_t ctx) 3089 { 3090 const char *name = sdev_ctx_name(ctx); 3091 vmm_softc_t *sc; 3092 sdev_plugin_validate_t ret; 3093 minor_t minor; 3094 3095 if (sdev_ctx_vtype(ctx) != VCHR) 3096 return (SDEV_VTOR_INVALID); 3097 3098 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3099 3100 mutex_enter(&vmm_mtx); 3101 if ((sc = vmm_lookup(name)) == NULL) 3102 ret = SDEV_VTOR_INVALID; 3103 else if (sc->vmm_minor != minor) 3104 ret = SDEV_VTOR_STALE; 3105 else 3106 ret = SDEV_VTOR_VALID; 3107 mutex_exit(&vmm_mtx); 3108 3109 return (ret); 3110 } 3111 3112 static int 3113 vmm_sdev_filldir(sdev_ctx_t ctx) 3114 { 3115 vmm_softc_t *sc; 3116 int ret; 3117 3118 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3119 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3120 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3121 return (EINVAL); 3122 } 3123 3124 mutex_enter(&vmm_mtx); 3125 ASSERT(vmmdev_dip != NULL); 3126 for (sc = list_head(&vmm_list); sc != NULL; 3127 sc = list_next(&vmm_list, sc)) { 3128 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3129 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3130 S_IFCHR | 0600, 3131 makedevice(ddi_driver_major(vmmdev_dip), 3132 sc->vmm_minor)); 3133 } else { 3134 continue; 3135 } 3136 if (ret != 0 && ret != EEXIST) 3137 goto out; 3138 } 3139 3140 ret = 0; 3141 3142 out: 3143 mutex_exit(&vmm_mtx); 3144 return (ret); 3145 } 3146 3147 /* ARGSUSED */ 3148 static void 3149 vmm_sdev_inactive(sdev_ctx_t ctx) 3150 { 3151 } 3152 3153 static sdev_plugin_ops_t vmm_sdev_ops = { 3154 .spo_version = SDEV_PLUGIN_VERSION, 3155 .spo_flags = SDEV_PLUGIN_SUBDIR, 3156 .spo_validate = vmm_sdev_validate, 3157 .spo_filldir = vmm_sdev_filldir, 3158 .spo_inactive = vmm_sdev_inactive 3159 }; 3160 3161 /* ARGSUSED */ 3162 static int 3163 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3164 { 3165 int error; 3166 3167 switch (cmd) { 3168 case DDI_INFO_DEVT2DEVINFO: 3169 *result = (void 
*)vmmdev_dip; 3170 error = DDI_SUCCESS; 3171 break; 3172 case DDI_INFO_DEVT2INSTANCE: 3173 *result = (void *)0; 3174 error = DDI_SUCCESS; 3175 break; 3176 default: 3177 error = DDI_FAILURE; 3178 break; 3179 } 3180 return (error); 3181 } 3182 3183 static int 3184 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3185 { 3186 sdev_plugin_hdl_t sph; 3187 hma_reg_t *reg = NULL; 3188 boolean_t vmm_loaded = B_FALSE; 3189 3190 if (cmd != DDI_ATTACH) { 3191 return (DDI_FAILURE); 3192 } 3193 3194 mutex_enter(&vmmdev_mtx); 3195 /* Ensure we are not already attached. */ 3196 if (vmmdev_dip != NULL) { 3197 mutex_exit(&vmmdev_mtx); 3198 return (DDI_FAILURE); 3199 } 3200 3201 vmm_sol_glue_init(); 3202 3203 /* 3204 * Perform temporary HMA registration to determine if the system 3205 * is capable. 3206 */ 3207 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3208 goto fail; 3209 } else if (vmm_mod_load() != 0) { 3210 goto fail; 3211 } 3212 vmm_loaded = B_TRUE; 3213 hma_unregister(reg); 3214 reg = NULL; 3215 3216 /* Create control node. Other nodes will be created on demand. */ 3217 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3218 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3219 goto fail; 3220 } 3221 3222 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3223 if (sph == (sdev_plugin_hdl_t)NULL) { 3224 ddi_remove_minor_node(dip, NULL); 3225 goto fail; 3226 } 3227 3228 ddi_report_dev(dip); 3229 vmmdev_sdev_hdl = sph; 3230 vmmdev_dip = dip; 3231 mutex_exit(&vmmdev_mtx); 3232 return (DDI_SUCCESS); 3233 3234 fail: 3235 if (vmm_loaded) { 3236 VERIFY0(vmm_mod_unload()); 3237 } 3238 if (reg != NULL) { 3239 hma_unregister(reg); 3240 } 3241 vmm_sol_glue_cleanup(); 3242 mutex_exit(&vmmdev_mtx); 3243 return (DDI_FAILURE); 3244 } 3245 3246 static int 3247 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3248 { 3249 if (cmd != DDI_DETACH) { 3250 return (DDI_FAILURE); 3251 } 3252 3253 /* 3254 * Ensure that all resources have been cleaned up. 
3255 * 3256 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3257 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3258 * devinfo locked as iommu_cleanup() tries to recursively lock each 3259 * devinfo, including our own, while holding vmmdev_mtx. 3260 */ 3261 if (mutex_tryenter(&vmmdev_mtx) == 0) 3262 return (DDI_FAILURE); 3263 3264 mutex_enter(&vmm_mtx); 3265 if (!list_is_empty(&vmm_list)) { 3266 mutex_exit(&vmm_mtx); 3267 mutex_exit(&vmmdev_mtx); 3268 return (DDI_FAILURE); 3269 } 3270 mutex_exit(&vmm_mtx); 3271 3272 if (!vmmr_is_empty()) { 3273 mutex_exit(&vmmdev_mtx); 3274 return (DDI_FAILURE); 3275 } 3276 3277 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3278 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3279 mutex_exit(&vmmdev_mtx); 3280 return (DDI_FAILURE); 3281 } 3282 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3283 3284 /* Remove the control node. */ 3285 ddi_remove_minor_node(dip, "ctl"); 3286 vmmdev_dip = NULL; 3287 3288 VERIFY0(vmm_mod_unload()); 3289 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3290 vmm_sol_glue_cleanup(); 3291 3292 mutex_exit(&vmmdev_mtx); 3293 3294 return (DDI_SUCCESS); 3295 } 3296 3297 static struct cb_ops vmm_cb_ops = { 3298 vmm_open, 3299 vmm_close, 3300 nodev, /* strategy */ 3301 nodev, /* print */ 3302 nodev, /* dump */ 3303 nodev, /* read */ 3304 nodev, /* write */ 3305 vmm_ioctl, 3306 nodev, /* devmap */ 3307 nodev, /* mmap */ 3308 vmm_segmap, 3309 nochpoll, /* poll */ 3310 ddi_prop_op, 3311 NULL, 3312 D_NEW | D_MP | D_DEVMAP 3313 }; 3314 3315 static struct dev_ops vmm_ops = { 3316 DEVO_REV, 3317 0, 3318 vmm_info, 3319 nulldev, /* identify */ 3320 nulldev, /* probe */ 3321 vmm_attach, 3322 vmm_detach, 3323 nodev, /* reset */ 3324 &vmm_cb_ops, 3325 (struct bus_ops *)NULL 3326 }; 3327 3328 static struct modldrv modldrv = { 3329 &mod_driverops, 3330 "bhyve vmm", 3331 &vmm_ops 3332 }; 3333 3334 static struct modlinkage modlinkage = { 3335 MODREV_1, 3336 &modldrv, 3337 NULL 3338 }; 3339 
3340 int 3341 _init(void) 3342 { 3343 int error; 3344 3345 sysinit(); 3346 3347 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3348 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3349 list_create(&vmm_list, sizeof (vmm_softc_t), 3350 offsetof(vmm_softc_t, vmm_node)); 3351 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3352 3353 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3354 if (error) { 3355 return (error); 3356 } 3357 3358 error = vmmr_init(); 3359 if (error) { 3360 ddi_soft_state_fini(&vmm_statep); 3361 return (error); 3362 } 3363 3364 vmm_zsd_init(); 3365 3366 error = mod_install(&modlinkage); 3367 if (error) { 3368 ddi_soft_state_fini(&vmm_statep); 3369 vmm_zsd_fini(); 3370 vmmr_fini(); 3371 } 3372 3373 return (error); 3374 } 3375 3376 int 3377 _fini(void) 3378 { 3379 int error; 3380 3381 error = mod_remove(&modlinkage); 3382 if (error) { 3383 return (error); 3384 } 3385 3386 vmm_zsd_fini(); 3387 vmmr_fini(); 3388 3389 ddi_soft_state_fini(&vmm_statep); 3390 3391 return (0); 3392 } 3393 3394 int 3395 _info(struct modinfo *modinfop) 3396 { 3397 return (mod_info(&modlinkage, modinfop)); 3398 } 3399