1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2022 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire 68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to 69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. 70 */ 71 72 static kmutex_t vmmdev_mtx; 73 static dev_info_t *vmmdev_dip; 74 static hma_reg_t *vmmdev_hma_reg; 75 static uint_t vmmdev_hma_ref; 76 static sdev_plugin_hdl_t vmmdev_sdev_hdl; 77 78 static kmutex_t vmm_mtx; 79 static list_t vmm_list; 80 static id_space_t *vmm_minors; 81 static void *vmm_statep; 82 83 /* temporary safety switch */ 84 int vmm_allow_state_writes; 85 86 static const char *vmmdev_hvm_name = "bhyve"; 87 88 /* For sdev plugin (/dev) */ 89 #define VMM_SDEV_ROOT "/dev/vmm" 90 91 /* From uts/intel/io/vmm/intel/vmx.c */ 92 extern int vmx_x86_supported(const char **); 93 94 /* Holds and hooks from drivers external to vmm */ 95 struct vmm_hold { 96 list_node_t vmh_node; 97 vmm_softc_t *vmh_sc; 98 boolean_t vmh_release_req; 99 uint_t vmh_ioport_hook_cnt; 100 }; 101 102 struct vmm_lease { 103 list_node_t vml_node; 104 struct vm *vml_vm; 105 vm_client_t *vml_vmclient; 106 boolean_t vml_expired; 107 boolean_t vml_break_deferred; 108 boolean_t (*vml_expire_func)(void *); 109 void *vml_expire_arg; 110 struct vmm_hold *vml_hold; 111 }; 112 113 /* Options for vmm_destroy_locked */ 114 typedef enum vmm_destroy_opts { 115 VDO_DEFAULT = 0, 116 /* 117 * Indicate that zone-specific-data associated with this VM not be 118 * cleaned up as part of the destroy. 
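 * (As a sketch of a hypothetical call site, the zone-teardown path would be
 * the caller passing this flag, e.g.
 *	vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
 * whereas the call sites visible in this file use VDO_DEFAULT.)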
Skipping ZSD clean-up is 119 * necessary when VM is being destroyed as part of zone destruction, 120 * when said ZSD is already being cleaned up. 121 */ 122 VDO_NO_CLEAN_ZSD = (1 << 0), 123 /* 124 * Attempt to wait for VM destruction to complete. This is opt-in, 125 * since there are many normal conditions which could lead to 126 * destruction being stalled pending other clean-up. 127 */ 128 VDO_ATTEMPT_WAIT = (1 << 1), 129 } vmm_destroy_opts_t; 130 131 static void vmm_hma_release(void); 132 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *); 133 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 134 static void vmm_lease_block(vmm_softc_t *); 135 static void vmm_lease_unblock(vmm_softc_t *); 136 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 137 static void vmm_kstat_init(vmm_softc_t *); 138 static void vmm_kstat_fini(vmm_softc_t *); 139 140 /* 141 * The 'devmem' hack: 142 * 143 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 144 * in the vm which appear with their own name related to the vm under /dev. 145 * Since this would be a hassle from an sdev perspective and would require a 146 * new cdev interface (or complicate the existing one), we choose to implement 147 * this in a different manner. Direct access to the underlying vm memory 148 * segments is exposed by placing them in a range of offsets beyond the normal 149 * guest memory space. Userspace can query the appropriate offset to mmap() 150 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 151 */ 152 153 static vmm_devmem_entry_t * 154 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 155 { 156 vmm_devmem_entry_t *ent = NULL; 157 list_t *dl = &sc->vmm_devmem_list; 158 159 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 160 if (ent->vde_segid == segid) { 161 return (ent); 162 } 163 } 164 return (NULL); 165 } 166 167 static int 168 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 169 { 170 int error; 171 bool sysmem; 172 173 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 174 NULL); 175 if (error || mseg->len == 0) 176 return (error); 177 178 if (!sysmem) { 179 vmm_devmem_entry_t *de; 180 181 de = vmmdev_devmem_find(sc, mseg->segid); 182 if (de != NULL) { 183 (void) strlcpy(mseg->name, de->vde_name, 184 sizeof (mseg->name)); 185 } 186 } else { 187 bzero(mseg->name, sizeof (mseg->name)); 188 } 189 190 return (error); 191 } 192 193 static int 194 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 195 { 196 off_t map_offset; 197 vmm_devmem_entry_t *entry; 198 199 if (list_is_empty(&sc->vmm_devmem_list)) { 200 map_offset = VM_DEVMEM_START; 201 } else { 202 entry = list_tail(&sc->vmm_devmem_list); 203 map_offset = entry->vde_off + entry->vde_len; 204 if (map_offset < entry->vde_off) { 205 /* Do not tolerate overflow */ 206 return (ERANGE); 207 } 208 /* 209 * XXXJOY: We could choose to search the list for duplicate 210 * names and toss an error. Since we're using the offset 211 * method for now, it does not make much of a difference. 
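 *
 * For completeness, the consumer side of these entries: userspace looks up
 * the offset for a segment-id and then mmap()s the instance device at that
 * offset, as described in the 'devmem' comment above.  A rough sketch, with
 * error handling omitted and vmfd/segid/seglen standing in for hypothetical
 * caller-held state:
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
 *	void *base = mmap(NULL, seglen, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, vmfd, vdo.offset);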
212 */ 213 } 214 215 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 216 entry->vde_segid = mseg->segid; 217 entry->vde_len = mseg->len; 218 entry->vde_off = map_offset; 219 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 220 list_insert_tail(&sc->vmm_devmem_list, entry); 221 222 return (0); 223 } 224 225 static boolean_t 226 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 227 off_t *map_offp) 228 { 229 list_t *dl = &sc->vmm_devmem_list; 230 vmm_devmem_entry_t *de = NULL; 231 const off_t map_end = off + len; 232 233 VERIFY(off >= VM_DEVMEM_START); 234 235 if (map_end < off) { 236 /* No match on overflow */ 237 return (B_FALSE); 238 } 239 240 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 241 const off_t item_end = de->vde_off + de->vde_len; 242 243 if (de->vde_off <= off && item_end >= map_end) { 244 *segidp = de->vde_segid; 245 *map_offp = off - de->vde_off; 246 return (B_TRUE); 247 } 248 } 249 return (B_FALSE); 250 } 251 252 /* 253 * When an instance is being destroyed, the devmem list of named memory objects 254 * can be torn down, as no new mappings are allowed. 255 */ 256 static void 257 vmmdev_devmem_purge(vmm_softc_t *sc) 258 { 259 vmm_devmem_entry_t *entry; 260 261 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 262 kmem_free(entry, sizeof (*entry)); 263 } 264 } 265 266 static int 267 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 268 { 269 int error; 270 bool sysmem = true; 271 272 if (VM_MEMSEG_NAME(mseg)) { 273 sysmem = false; 274 } 275 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 276 277 if (error == 0) { 278 /* 279 * Rather than create a whole fresh device from which userspace 280 * can mmap this segment, instead make it available at an 281 * offset above where the main guest memory resides. 282 */ 283 error = vmmdev_devmem_create(sc, mseg, mseg->name); 284 if (error != 0) { 285 vm_free_memseg(sc->vmm_vm, mseg->segid); 286 } 287 } 288 return (error); 289 } 290 291 /* 292 * Resource Locking and Exclusion 293 * 294 * Much of bhyve depends on key portions of VM state, such as the guest memory 295 * map, to remain unchanged while the guest is running. As ported from 296 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 297 * access to the instance vCPUs. Threads acting on a single vCPU, like those 298 * performing the work of actually running the guest in VMX/SVM, would lock 299 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 300 * state, all of the vCPUs would be first locked, ensuring that the 301 * operation(s) could complete without any other threads stumbling into 302 * intermediate states. 303 * 304 * This approach is largely effective for bhyve. Common operations, such as 305 * running the vCPUs, steer clear of lock contention. The model begins to 306 * break down for operations which do not occur in the context of a specific 307 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 308 * thread in the bhyve process. In order to properly protect those vCPU-less 309 * operations from encountering invalid states, additional locking is required. 310 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 311 * It does mean that class of operations will be serialized on locking the 312 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 313 * undue contention on the VM_MAXCPU-1 vCPU. 
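 *
 * (As a sketch of that historical workaround, a vCPU-less operation would
 * effectively bracket itself with
 *	vcpu_lock_one(sc, VM_MAXCPU - 1);
 *	... perform the VM-wide work ...
 *	vcpu_unlock_one(sc, VM_MAXCPU - 1);
 * and thus serialize with any other such operation.)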
314 * 315 * In order to address the shortcomings of this model, the concept of a 316 * read/write lock has been added to bhyve. Operations which change 317 * fundamental aspects of a VM (such as the memory map) must acquire the write 318 * lock, which also implies locking all of the vCPUs and waiting for all read 319 * lock holders to release. While it increases the cost and waiting time for 320 * those few operations, it allows most hot-path operations on the VM (which 321 * depend on its configuration remaining stable) to occur with minimal locking. 322 * 323 * Consumers of the Driver API (see below) are a special case when it comes to 324 * this locking, since they may hold a read lock via the drv_lease mechanism 325 * for an extended period of time. Rather than forcing those consumers to 326 * continuously poll for a write lock attempt, the lease system forces them to 327 * provide a release callback to trigger their clean-up (and potential later 328 * reacquisition) of the read lock. 329 */ 330 331 static void 332 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 333 { 334 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 335 336 /* 337 * Since this state transition is utilizing from_idle=true, it should 338 * not fail, but rather block until it can be successful. 339 */ 340 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 341 } 342 343 static void 344 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 345 { 346 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 347 348 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 349 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 350 } 351 352 static void 353 vmm_read_lock(vmm_softc_t *sc) 354 { 355 rw_enter(&sc->vmm_rwlock, RW_READER); 356 } 357 358 static void 359 vmm_read_unlock(vmm_softc_t *sc) 360 { 361 rw_exit(&sc->vmm_rwlock); 362 } 363 364 static void 365 vmm_write_lock(vmm_softc_t *sc) 366 { 367 int maxcpus; 368 369 /* First lock all the vCPUs */ 370 maxcpus = vm_get_maxcpus(sc->vmm_vm); 371 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 372 vcpu_lock_one(sc, vcpu); 373 } 374 375 /* 376 * Block vmm_drv leases from being acquired or held while the VM write 377 * lock is held. 378 */ 379 vmm_lease_block(sc); 380 381 rw_enter(&sc->vmm_rwlock, RW_WRITER); 382 /* 383 * For now, the 'maxcpus' value for an instance is fixed at the 384 * compile-time constant of VM_MAXCPU at creation. If this changes in 385 * the future, allowing for dynamic vCPU resource sizing, acquisition 386 * of the write lock will need to be wary of such changes. 387 */ 388 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 389 } 390 391 static void 392 vmm_write_unlock(vmm_softc_t *sc) 393 { 394 int maxcpus; 395 396 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 397 vmm_lease_unblock(sc); 398 399 /* 400 * The VM write lock _must_ be released from the same thread it was 401 * acquired in, unlike the read lock. 402 */ 403 VERIFY(rw_write_held(&sc->vmm_rwlock)); 404 rw_exit(&sc->vmm_rwlock); 405 406 /* Unlock all the vCPUs */ 407 maxcpus = vm_get_maxcpus(sc->vmm_vm); 408 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 409 vcpu_unlock_one(sc, vcpu); 410 } 411 } 412 413 static int 414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 415 cred_t *credp, int *rvalp) 416 { 417 int error = 0, vcpu = -1; 418 void *datap = (void *)arg; 419 enum vm_lock_type { 420 LOCK_NONE = 0, 421 LOCK_VCPU, 422 LOCK_READ_HOLD, 423 LOCK_WRITE_HOLD 424 } lock_type = LOCK_NONE; 425 426 /* Acquire any exclusion resources needed for the operation. 
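 * Depending on the command this is one of: a single-vCPU lock (LOCK_VCPU),
 * the VM read lock (LOCK_READ_HOLD), the VM write lock (LOCK_WRITE_HOLD), or
 * no locking at all (LOCK_NONE).  The choice is recorded in lock_type so the
 * matching resource can be released once the command has been handled.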
*/ 427 switch (cmd) { 428 case VM_RUN: 429 case VM_GET_REGISTER: 430 case VM_SET_REGISTER: 431 case VM_GET_SEGMENT_DESCRIPTOR: 432 case VM_SET_SEGMENT_DESCRIPTOR: 433 case VM_GET_REGISTER_SET: 434 case VM_SET_REGISTER_SET: 435 case VM_INJECT_EXCEPTION: 436 case VM_GET_CAPABILITY: 437 case VM_SET_CAPABILITY: 438 case VM_PPTDEV_MSI: 439 case VM_PPTDEV_MSIX: 440 case VM_SET_X2APIC_STATE: 441 case VM_GLA2GPA: 442 case VM_GLA2GPA_NOFAULT: 443 case VM_ACTIVATE_CPU: 444 case VM_SET_INTINFO: 445 case VM_GET_INTINFO: 446 case VM_RESTART_INSTRUCTION: 447 case VM_SET_KERNEMU_DEV: 448 case VM_GET_KERNEMU_DEV: 449 case VM_RESET_CPU: 450 case VM_GET_RUN_STATE: 451 case VM_SET_RUN_STATE: 452 case VM_GET_FPU: 453 case VM_SET_FPU: 454 case VM_GET_CPUID: 455 case VM_SET_CPUID: 456 case VM_LEGACY_CPUID: 457 /* 458 * Copy in the ID of the vCPU chosen for this operation. 459 * Since a nefarious caller could update their struct between 460 * this locking and when the rest of the ioctl data is copied 461 * in, it is _critical_ that this local 'vcpu' variable be used 462 * rather than the in-struct one when performing the ioctl. 463 */ 464 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 465 return (EFAULT); 466 } 467 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 468 return (EINVAL); 469 } 470 vcpu_lock_one(sc, vcpu); 471 lock_type = LOCK_VCPU; 472 break; 473 474 case VM_REINIT: 475 case VM_BIND_PPTDEV: 476 case VM_UNBIND_PPTDEV: 477 case VM_MAP_PPTDEV_MMIO: 478 case VM_UNMAP_PPTDEV_MMIO: 479 case VM_ALLOC_MEMSEG: 480 case VM_MMAP_MEMSEG: 481 case VM_MUNMAP_MEMSEG: 482 case VM_WRLOCK_CYCLE: 483 case VM_PMTMR_LOCATE: 484 vmm_write_lock(sc); 485 lock_type = LOCK_WRITE_HOLD; 486 break; 487 488 case VM_GET_MEMSEG: 489 case VM_MMAP_GETNEXT: 490 case VM_LAPIC_IRQ: 491 case VM_INJECT_NMI: 492 case VM_IOAPIC_ASSERT_IRQ: 493 case VM_IOAPIC_DEASSERT_IRQ: 494 case VM_IOAPIC_PULSE_IRQ: 495 case VM_LAPIC_MSI: 496 case VM_LAPIC_LOCAL_IRQ: 497 case VM_GET_X2APIC_STATE: 498 case VM_RTC_READ: 499 case VM_RTC_WRITE: 500 case VM_RTC_SETTIME: 501 case VM_RTC_GETTIME: 502 case VM_PPTDEV_DISABLE_MSIX: 503 case VM_DEVMEM_GETOFFSET: 504 case VM_TRACK_DIRTY_PAGES: 505 vmm_read_lock(sc); 506 lock_type = LOCK_READ_HOLD; 507 break; 508 509 case VM_DATA_READ: 510 case VM_DATA_WRITE: 511 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 512 return (EFAULT); 513 } 514 if (vcpu == -1) { 515 /* Access data for VM-wide devices */ 516 vmm_write_lock(sc); 517 lock_type = LOCK_WRITE_HOLD; 518 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 519 /* Access data associated with a specific vCPU */ 520 vcpu_lock_one(sc, vcpu); 521 lock_type = LOCK_VCPU; 522 } else { 523 return (EINVAL); 524 } 525 break; 526 527 case VM_GET_GPA_PMAP: 528 case VM_IOAPIC_PINCOUNT: 529 case VM_SUSPEND: 530 case VM_DESC_FPU_AREA: 531 case VM_SET_AUTODESTRUCT: 532 case VM_DESTROY_SELF: 533 case VM_DESTROY_PENDING: 534 default: 535 break; 536 } 537 538 /* Execute the primary logic for the ioctl. */ 539 switch (cmd) { 540 case VM_RUN: { 541 struct vm_entry entry; 542 543 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 544 error = EFAULT; 545 break; 546 } 547 548 if (!(curthread->t_schedflag & TS_VCPU)) 549 smt_mark_as_vcpu(); 550 551 error = vm_run(sc->vmm_vm, vcpu, &entry); 552 553 /* 554 * Unexpected states in vm_run() are expressed through positive 555 * errno-oriented return values. VM states which expect further 556 * processing in userspace (necessary context via exitinfo) are 557 * expressed through negative return values. 
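 * Viewed from userspace, the ioctl itself returns 0 in that latter case,
 * after the handler below copies the struct vm_exit out to the exit_data
 * pointer supplied in the struct vm_entry.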
For the time being 558 * a return value of 0 is not expected from vm_run(). 559 */ 560 ASSERT(error != 0); 561 if (error < 0) { 562 const struct vm_exit *vme; 563 void *outp = entry.exit_data; 564 565 error = 0; 566 vme = vm_exitinfo(sc->vmm_vm, vcpu); 567 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 568 error = EFAULT; 569 } 570 } 571 break; 572 } 573 case VM_SUSPEND: { 574 struct vm_suspend vmsuspend; 575 576 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 577 error = EFAULT; 578 break; 579 } 580 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 581 break; 582 } 583 case VM_REINIT: { 584 struct vm_reinit reinit; 585 586 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 587 error = EFAULT; 588 break; 589 } 590 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 591 /* 592 * The VM instance should be free of driver-attached 593 * hooks during the reinitialization process. 594 */ 595 break; 596 } 597 error = vm_reinit(sc->vmm_vm, reinit.flags); 598 (void) vmm_drv_block_hook(sc, B_FALSE); 599 break; 600 } 601 case VM_STAT_DESC: { 602 struct vm_stat_desc statdesc; 603 604 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 605 error = EFAULT; 606 break; 607 } 608 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 609 sizeof (statdesc.desc)); 610 if (error == 0 && 611 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 612 error = EFAULT; 613 break; 614 } 615 break; 616 } 617 case VM_STATS_IOC: { 618 struct vm_stats vmstats; 619 620 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 621 error = EFAULT; 622 break; 623 } 624 hrt2tv(gethrtime(), &vmstats.tv); 625 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 626 nitems(vmstats.statbuf), 627 &vmstats.num_entries, vmstats.statbuf); 628 if (error == 0 && 629 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 630 error = EFAULT; 631 break; 632 } 633 break; 634 } 635 636 case VM_PPTDEV_MSI: { 637 struct vm_pptdev_msi pptmsi; 638 639 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 640 error = EFAULT; 641 break; 642 } 643 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 644 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 645 break; 646 } 647 case VM_PPTDEV_MSIX: { 648 struct vm_pptdev_msix pptmsix; 649 650 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 651 error = EFAULT; 652 break; 653 } 654 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 655 pptmsix.idx, pptmsix.addr, pptmsix.msg, 656 pptmsix.vector_control); 657 break; 658 } 659 case VM_PPTDEV_DISABLE_MSIX: { 660 struct vm_pptdev pptdev; 661 662 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 663 error = EFAULT; 664 break; 665 } 666 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 667 break; 668 } 669 case VM_MAP_PPTDEV_MMIO: { 670 struct vm_pptdev_mmio pptmmio; 671 672 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 673 error = EFAULT; 674 break; 675 } 676 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 677 pptmmio.len, pptmmio.hpa); 678 break; 679 } 680 case VM_UNMAP_PPTDEV_MMIO: { 681 struct vm_pptdev_mmio pptmmio; 682 683 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 684 error = EFAULT; 685 break; 686 } 687 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 688 pptmmio.len); 689 break; 690 } 691 case VM_BIND_PPTDEV: { 692 struct vm_pptdev pptdev; 693 694 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 695 error = EFAULT; 696 break; 697 } 698 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 699 break; 700 } 701 case 
VM_UNBIND_PPTDEV: { 702 struct vm_pptdev pptdev; 703 704 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 705 error = EFAULT; 706 break; 707 } 708 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 709 break; 710 } 711 case VM_GET_PPTDEV_LIMITS: { 712 struct vm_pptdev_limits pptlimits; 713 714 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 715 error = EFAULT; 716 break; 717 } 718 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 719 &pptlimits.msi_limit, &pptlimits.msix_limit); 720 if (error == 0 && 721 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 722 error = EFAULT; 723 break; 724 } 725 break; 726 } 727 case VM_INJECT_EXCEPTION: { 728 struct vm_exception vmexc; 729 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 730 error = EFAULT; 731 break; 732 } 733 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 734 vmexc.error_code_valid != 0, vmexc.error_code, 735 vmexc.restart_instruction != 0); 736 break; 737 } 738 case VM_INJECT_NMI: { 739 struct vm_nmi vmnmi; 740 741 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 742 error = EFAULT; 743 break; 744 } 745 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 746 break; 747 } 748 case VM_LAPIC_IRQ: { 749 struct vm_lapic_irq vmirq; 750 751 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 752 error = EFAULT; 753 break; 754 } 755 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 756 break; 757 } 758 case VM_LAPIC_LOCAL_IRQ: { 759 struct vm_lapic_irq vmirq; 760 761 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 762 error = EFAULT; 763 break; 764 } 765 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 766 vmirq.vector); 767 break; 768 } 769 case VM_LAPIC_MSI: { 770 struct vm_lapic_msi vmmsi; 771 772 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 773 error = EFAULT; 774 break; 775 } 776 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 777 break; 778 } 779 780 case VM_IOAPIC_ASSERT_IRQ: { 781 struct vm_ioapic_irq ioapic_irq; 782 783 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 784 error = EFAULT; 785 break; 786 } 787 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 788 break; 789 } 790 case VM_IOAPIC_DEASSERT_IRQ: { 791 struct vm_ioapic_irq ioapic_irq; 792 793 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 794 error = EFAULT; 795 break; 796 } 797 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 798 break; 799 } 800 case VM_IOAPIC_PULSE_IRQ: { 801 struct vm_ioapic_irq ioapic_irq; 802 803 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 804 error = EFAULT; 805 break; 806 } 807 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 808 break; 809 } 810 case VM_IOAPIC_PINCOUNT: { 811 int pincount; 812 813 pincount = vioapic_pincount(sc->vmm_vm); 814 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 815 error = EFAULT; 816 break; 817 } 818 break; 819 } 820 case VM_DESC_FPU_AREA: { 821 struct vm_fpu_desc desc; 822 void *buf = NULL; 823 824 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 825 error = EFAULT; 826 break; 827 } 828 if (desc.vfd_num_entries > 64) { 829 error = EINVAL; 830 break; 831 } 832 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 833 desc.vfd_num_entries; 834 if (buf_sz != 0) { 835 buf = kmem_zalloc(buf_sz, KM_SLEEP); 836 } 837 838 /* 839 * For now, we are depending on vm_fpu_desc_entry and 840 * hma_xsave_state_desc_t having the same format. 
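 * The CTASSERT below pins the two structure sizes to one another at build
 * time, though a divergence in field layout would still have to be caught
 * by inspection.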
841 */ 842 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 843 sizeof (hma_xsave_state_desc_t)); 844 845 size_t req_size; 846 const uint_t max_entries = hma_fpu_describe_xsave_state( 847 (hma_xsave_state_desc_t *)buf, 848 desc.vfd_num_entries, 849 &req_size); 850 851 desc.vfd_req_size = req_size; 852 desc.vfd_num_entries = max_entries; 853 if (buf_sz != 0) { 854 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 855 error = EFAULT; 856 } 857 kmem_free(buf, buf_sz); 858 } 859 860 if (error == 0) { 861 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 862 error = EFAULT; 863 } 864 } 865 break; 866 } 867 case VM_SET_AUTODESTRUCT: { 868 /* 869 * Since this has to do with controlling the lifetime of the 870 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 871 * than the vcpu-centric or rwlock exclusion mechanisms. 872 */ 873 mutex_enter(&vmm_mtx); 874 if (arg != 0) { 875 sc->vmm_flags |= VMM_AUTODESTROY; 876 } else { 877 sc->vmm_flags &= ~VMM_AUTODESTROY; 878 } 879 mutex_exit(&vmm_mtx); 880 break; 881 } 882 case VM_DESTROY_SELF: { 883 bool hma_release = false; 884 885 /* 886 * Just like VMM_DESTROY_VM, but on the instance file descriptor 887 * itself, rather than having to perform a racy name lookup as 888 * part of the destroy process. 889 * 890 * Since vmm_destroy_locked() performs vCPU lock acquisition in 891 * order to kick the vCPUs out of guest context as part of any 892 * destruction, we do not need to worry about it ourself using 893 * the `lock_type` logic here. 894 */ 895 mutex_enter(&vmm_mtx); 896 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 897 mutex_exit(&vmm_mtx); 898 if (hma_release) { 899 vmm_hma_release(); 900 } 901 break; 902 } 903 case VM_DESTROY_PENDING: { 904 /* 905 * If we have made it this far, then destruction of the instance 906 * has not been initiated. 
907 */ 908 *rvalp = 0; 909 break; 910 } 911 912 case VM_ISA_ASSERT_IRQ: { 913 struct vm_isa_irq isa_irq; 914 915 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 916 error = EFAULT; 917 break; 918 } 919 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 920 if (error == 0 && isa_irq.ioapic_irq != -1) { 921 error = vioapic_assert_irq(sc->vmm_vm, 922 isa_irq.ioapic_irq); 923 } 924 break; 925 } 926 case VM_ISA_DEASSERT_IRQ: { 927 struct vm_isa_irq isa_irq; 928 929 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 930 error = EFAULT; 931 break; 932 } 933 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 934 if (error == 0 && isa_irq.ioapic_irq != -1) { 935 error = vioapic_deassert_irq(sc->vmm_vm, 936 isa_irq.ioapic_irq); 937 } 938 break; 939 } 940 case VM_ISA_PULSE_IRQ: { 941 struct vm_isa_irq isa_irq; 942 943 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 944 error = EFAULT; 945 break; 946 } 947 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 948 if (error == 0 && isa_irq.ioapic_irq != -1) { 949 error = vioapic_pulse_irq(sc->vmm_vm, 950 isa_irq.ioapic_irq); 951 } 952 break; 953 } 954 case VM_ISA_SET_IRQ_TRIGGER: { 955 struct vm_isa_irq_trigger isa_irq_trigger; 956 957 if (ddi_copyin(datap, &isa_irq_trigger, 958 sizeof (isa_irq_trigger), md)) { 959 error = EFAULT; 960 break; 961 } 962 error = vatpic_set_irq_trigger(sc->vmm_vm, 963 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 964 break; 965 } 966 967 case VM_MMAP_GETNEXT: { 968 struct vm_memmap mm; 969 970 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 971 error = EFAULT; 972 break; 973 } 974 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 975 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 976 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 977 error = EFAULT; 978 break; 979 } 980 break; 981 } 982 case VM_MMAP_MEMSEG: { 983 struct vm_memmap mm; 984 985 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 986 error = EFAULT; 987 break; 988 } 989 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 990 mm.len, mm.prot, mm.flags); 991 break; 992 } 993 case VM_MUNMAP_MEMSEG: { 994 struct vm_munmap mu; 995 996 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 997 error = EFAULT; 998 break; 999 } 1000 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1001 break; 1002 } 1003 case VM_ALLOC_MEMSEG: { 1004 struct vm_memseg vmseg; 1005 1006 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1007 error = EFAULT; 1008 break; 1009 } 1010 error = vmmdev_alloc_memseg(sc, &vmseg); 1011 break; 1012 } 1013 case VM_GET_MEMSEG: { 1014 struct vm_memseg vmseg; 1015 1016 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1017 error = EFAULT; 1018 break; 1019 } 1020 error = vmmdev_get_memseg(sc, &vmseg); 1021 if (error == 0 && 1022 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 1023 error = EFAULT; 1024 break; 1025 } 1026 break; 1027 } 1028 case VM_GET_REGISTER: { 1029 struct vm_register vmreg; 1030 1031 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1032 error = EFAULT; 1033 break; 1034 } 1035 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 1036 &vmreg.regval); 1037 if (error == 0 && 1038 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 1039 error = EFAULT; 1040 break; 1041 } 1042 break; 1043 } 1044 case VM_SET_REGISTER: { 1045 struct vm_register vmreg; 1046 1047 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 1048 error = EFAULT; 1049 break; 1050 } 1051 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 1052 vmreg.regval); 1053 break; 1054 } 1055 case 
VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
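			 * (A caller needing all-or-nothing behavior could,
			 * as one possible approach, snapshot the same
			 * registers via VM_GET_REGISTER_SET beforehand and
			 * restore that snapshot if this ioctl fails partway
			 * through.)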
1147 */ 1148 if (regnums[i] < 0) { 1149 error = EINVAL; 1150 break; 1151 } 1152 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1153 regvals[i]); 1154 } 1155 break; 1156 } 1157 case VM_RESET_CPU: { 1158 struct vm_vcpu_reset vvr; 1159 1160 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1161 error = EFAULT; 1162 break; 1163 } 1164 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1165 error = EINVAL; 1166 } 1167 1168 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1169 break; 1170 } 1171 case VM_GET_RUN_STATE: { 1172 struct vm_run_state vrs; 1173 1174 bzero(&vrs, sizeof (vrs)); 1175 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1176 &vrs.sipi_vector); 1177 if (error == 0) { 1178 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1179 error = EFAULT; 1180 break; 1181 } 1182 } 1183 break; 1184 } 1185 case VM_SET_RUN_STATE: { 1186 struct vm_run_state vrs; 1187 1188 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1189 error = EFAULT; 1190 break; 1191 } 1192 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1193 vrs.sipi_vector); 1194 break; 1195 } 1196 case VM_GET_FPU: { 1197 struct vm_fpu_state req; 1198 const size_t max_len = (PAGESIZE * 2); 1199 void *kbuf; 1200 1201 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1202 error = EFAULT; 1203 break; 1204 } 1205 if (req.len > max_len || req.len == 0) { 1206 error = EINVAL; 1207 break; 1208 } 1209 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1210 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1211 if (error == 0) { 1212 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1213 error = EFAULT; 1214 } 1215 } 1216 kmem_free(kbuf, req.len); 1217 break; 1218 } 1219 case VM_SET_FPU: { 1220 struct vm_fpu_state req; 1221 const size_t max_len = (PAGESIZE * 2); 1222 void *kbuf; 1223 1224 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1225 error = EFAULT; 1226 break; 1227 } 1228 if (req.len > max_len || req.len == 0) { 1229 error = EINVAL; 1230 break; 1231 } 1232 kbuf = kmem_alloc(req.len, KM_SLEEP); 1233 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1234 error = EFAULT; 1235 } else { 1236 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1237 } 1238 kmem_free(kbuf, req.len); 1239 break; 1240 } 1241 case VM_GET_CPUID: { 1242 struct vm_vcpu_cpuid_config cfg; 1243 struct vcpu_cpuid_entry *entries = NULL; 1244 1245 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1246 error = EFAULT; 1247 break; 1248 } 1249 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1250 error = EINVAL; 1251 break; 1252 } 1253 1254 const size_t entries_size = 1255 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1256 if (entries_size != 0) { 1257 entries = kmem_zalloc(entries_size, KM_SLEEP); 1258 } 1259 1260 vcpu_cpuid_config_t vm_cfg = { 1261 .vcc_nent = cfg.vvcc_nent, 1262 .vcc_entries = entries, 1263 }; 1264 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1265 1266 /* 1267 * Only attempt to copy out the resultant entries if we were 1268 * able to query them from the instance. The flags and number 1269 * of entries are emitted regardless. 
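 * The copied-out vvcc_nent can presumably double as sizing information,
 * letting a caller probe with a zero-entry request and use the returned
 * count to allocate before issuing a full query.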
1270 */ 1271 cfg.vvcc_flags = vm_cfg.vcc_flags; 1272 cfg.vvcc_nent = vm_cfg.vcc_nent; 1273 if (entries != NULL) { 1274 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1275 entries_size, md) != 0) { 1276 error = EFAULT; 1277 } 1278 1279 kmem_free(entries, entries_size); 1280 } 1281 1282 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1283 error = EFAULT; 1284 } 1285 break; 1286 } 1287 case VM_SET_CPUID: { 1288 struct vm_vcpu_cpuid_config cfg; 1289 struct vcpu_cpuid_entry *entries = NULL; 1290 size_t entries_size = 0; 1291 1292 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1293 error = EFAULT; 1294 break; 1295 } 1296 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1297 error = EFBIG; 1298 break; 1299 } 1300 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1301 /* 1302 * If we are being instructed to use "legacy" handling, 1303 * then no entries should be provided, since the static 1304 * in-kernel masking will be used. 1305 */ 1306 if (cfg.vvcc_nent != 0) { 1307 error = EINVAL; 1308 break; 1309 } 1310 } else if (cfg.vvcc_nent != 0) { 1311 entries_size = 1312 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1313 entries = kmem_alloc(entries_size, KM_SLEEP); 1314 1315 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1316 md) != 0) { 1317 error = EFAULT; 1318 kmem_free(entries, entries_size); 1319 break; 1320 } 1321 } 1322 1323 vcpu_cpuid_config_t vm_cfg = { 1324 .vcc_flags = cfg.vvcc_flags, 1325 .vcc_nent = cfg.vvcc_nent, 1326 .vcc_entries = entries, 1327 }; 1328 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1329 1330 if (entries != NULL) { 1331 kmem_free(entries, entries_size); 1332 } 1333 break; 1334 } 1335 case VM_LEGACY_CPUID: { 1336 struct vm_legacy_cpuid vlc; 1337 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1338 error = EFAULT; 1339 break; 1340 } 1341 vlc.vlc_vcpuid = vcpu; 1342 1343 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1344 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1345 1346 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1347 error = EFAULT; 1348 break; 1349 } 1350 break; 1351 } 1352 1353 case VM_SET_KERNEMU_DEV: 1354 case VM_GET_KERNEMU_DEV: { 1355 struct vm_readwrite_kernemu_device kemu; 1356 size_t size = 0; 1357 1358 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1359 error = EFAULT; 1360 break; 1361 } 1362 1363 if (kemu.access_width > 3) { 1364 error = EINVAL; 1365 break; 1366 } 1367 size = (1 << kemu.access_width); 1368 ASSERT(size >= 1 && size <= 8); 1369 1370 if (cmd == VM_SET_KERNEMU_DEV) { 1371 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1372 kemu.gpa, kemu.value, size); 1373 } else { 1374 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1375 kemu.gpa, &kemu.value, size); 1376 } 1377 1378 if (error == 0) { 1379 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1380 error = EFAULT; 1381 break; 1382 } 1383 } 1384 break; 1385 } 1386 1387 case VM_GET_CAPABILITY: { 1388 struct vm_capability vmcap; 1389 1390 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1391 error = EFAULT; 1392 break; 1393 } 1394 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1395 &vmcap.capval); 1396 if (error == 0 && 1397 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1398 error = EFAULT; 1399 break; 1400 } 1401 break; 1402 } 1403 case VM_SET_CAPABILITY: { 1404 struct vm_capability vmcap; 1405 1406 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1407 error = EFAULT; 1408 break; 1409 } 1410 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1411 vmcap.capval); 1412 break; 1413 } 1414 case VM_SET_X2APIC_STATE: { 
1415 struct vm_x2apic x2apic; 1416 1417 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1418 error = EFAULT; 1419 break; 1420 } 1421 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1422 break; 1423 } 1424 case VM_GET_X2APIC_STATE: { 1425 struct vm_x2apic x2apic; 1426 1427 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1428 error = EFAULT; 1429 break; 1430 } 1431 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1432 &x2apic.state); 1433 if (error == 0 && 1434 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1435 error = EFAULT; 1436 break; 1437 } 1438 break; 1439 } 1440 case VM_GET_GPA_PMAP: { 1441 /* 1442 * Until there is a necessity to leak EPT/RVI PTE values to 1443 * userspace, this will remain unimplemented 1444 */ 1445 error = EINVAL; 1446 break; 1447 } 1448 case VM_GET_HPET_CAPABILITIES: { 1449 struct vm_hpet_cap hpetcap; 1450 1451 error = vhpet_getcap(&hpetcap); 1452 if (error == 0 && 1453 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1454 error = EFAULT; 1455 break; 1456 } 1457 break; 1458 } 1459 case VM_GLA2GPA: { 1460 struct vm_gla2gpa gg; 1461 1462 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1463 error = EFAULT; 1464 break; 1465 } 1466 gg.vcpuid = vcpu; 1467 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1468 gg.prot, &gg.gpa, &gg.fault); 1469 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1470 error = EFAULT; 1471 break; 1472 } 1473 break; 1474 } 1475 case VM_GLA2GPA_NOFAULT: { 1476 struct vm_gla2gpa gg; 1477 1478 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1479 error = EFAULT; 1480 break; 1481 } 1482 gg.vcpuid = vcpu; 1483 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1484 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1485 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1486 error = EFAULT; 1487 break; 1488 } 1489 break; 1490 } 1491 1492 case VM_ACTIVATE_CPU: 1493 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1494 break; 1495 1496 case VM_SUSPEND_CPU: 1497 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1498 error = EFAULT; 1499 } else { 1500 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1501 } 1502 break; 1503 1504 case VM_RESUME_CPU: 1505 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1506 error = EFAULT; 1507 } else { 1508 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1509 } 1510 break; 1511 1512 case VM_GET_CPUS: { 1513 struct vm_cpuset vm_cpuset; 1514 cpuset_t tempset; 1515 void *srcp = &tempset; 1516 int size; 1517 1518 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1519 error = EFAULT; 1520 break; 1521 } 1522 1523 /* Be more generous about sizing since our cpuset_t is large. */ 1524 size = vm_cpuset.cpusetsize; 1525 if (size <= 0 || size > sizeof (cpuset_t)) { 1526 error = ERANGE; 1527 } 1528 /* 1529 * If they want a ulong_t or less, make sure they receive the 1530 * low bits with all the useful information. 
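 * For example, a caller passing cpusetsize equal to sizeof (ulong_t) is
 * copied tempset.cpub[0], the word covering the lowest-numbered CPUs.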
1531 */ 1532 if (size <= sizeof (tempset.cpub[0])) { 1533 srcp = &tempset.cpub[0]; 1534 } 1535 1536 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1537 tempset = vm_active_cpus(sc->vmm_vm); 1538 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1539 tempset = vm_suspended_cpus(sc->vmm_vm); 1540 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1541 tempset = vm_debug_cpus(sc->vmm_vm); 1542 } else { 1543 error = EINVAL; 1544 } 1545 1546 ASSERT(size > 0 && size <= sizeof (tempset)); 1547 if (error == 0 && 1548 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1549 error = EFAULT; 1550 break; 1551 } 1552 break; 1553 } 1554 case VM_SET_INTINFO: { 1555 struct vm_intinfo vmii; 1556 1557 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1558 error = EFAULT; 1559 break; 1560 } 1561 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1562 break; 1563 } 1564 case VM_GET_INTINFO: { 1565 struct vm_intinfo vmii; 1566 1567 vmii.vcpuid = vcpu; 1568 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1569 &vmii.info2); 1570 if (error == 0 && 1571 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1572 error = EFAULT; 1573 break; 1574 } 1575 break; 1576 } 1577 case VM_RTC_WRITE: { 1578 struct vm_rtc_data rtcdata; 1579 1580 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1581 error = EFAULT; 1582 break; 1583 } 1584 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1585 rtcdata.value); 1586 break; 1587 } 1588 case VM_RTC_READ: { 1589 struct vm_rtc_data rtcdata; 1590 1591 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1592 error = EFAULT; 1593 break; 1594 } 1595 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1596 &rtcdata.value); 1597 if (error == 0 && 1598 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1599 error = EFAULT; 1600 break; 1601 } 1602 break; 1603 } 1604 case VM_RTC_SETTIME: { 1605 struct vm_rtc_time rtctime; 1606 1607 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1608 error = EFAULT; 1609 break; 1610 } 1611 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1612 break; 1613 } 1614 case VM_RTC_GETTIME: { 1615 struct vm_rtc_time rtctime; 1616 1617 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1618 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1619 error = EFAULT; 1620 break; 1621 } 1622 break; 1623 } 1624 1625 case VM_PMTMR_LOCATE: { 1626 uint16_t port = arg; 1627 error = vpmtmr_set_location(sc->vmm_vm, port); 1628 break; 1629 } 1630 1631 case VM_RESTART_INSTRUCTION: 1632 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1633 break; 1634 1635 case VM_SET_TOPOLOGY: { 1636 struct vm_cpu_topology topo; 1637 1638 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1639 error = EFAULT; 1640 break; 1641 } 1642 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1643 topo.threads, topo.maxcpus); 1644 break; 1645 } 1646 case VM_GET_TOPOLOGY: { 1647 struct vm_cpu_topology topo; 1648 1649 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1650 &topo.threads, &topo.maxcpus); 1651 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1652 error = EFAULT; 1653 break; 1654 } 1655 break; 1656 } 1657 case VM_DEVMEM_GETOFFSET: { 1658 struct vm_devmem_offset vdo; 1659 vmm_devmem_entry_t *de; 1660 1661 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1662 error = EFAULT; 1663 break; 1664 } 1665 1666 de = vmmdev_devmem_find(sc, vdo.segid); 1667 if (de != NULL) { 1668 vdo.offset = de->vde_off; 1669 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1670 error = EFAULT; 1671 } 1672 } else { 1673 error = ENOENT; 1674 } 1675 break; 1676 } 1677 case 
VM_TRACK_DIRTY_PAGES: { 1678 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1679 struct vmm_dirty_tracker tracker; 1680 uint8_t *bitmap; 1681 size_t len; 1682 1683 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1684 error = EFAULT; 1685 break; 1686 } 1687 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1688 error = EINVAL; 1689 break; 1690 } 1691 if (tracker.vdt_len == 0) { 1692 break; 1693 } 1694 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1695 error = EINVAL; 1696 break; 1697 } 1698 if (tracker.vdt_len > max_track_region_len) { 1699 error = EINVAL; 1700 break; 1701 } 1702 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1703 bitmap = kmem_zalloc(len, KM_SLEEP); 1704 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1705 tracker.vdt_len, bitmap); 1706 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1707 error = EFAULT; 1708 } 1709 kmem_free(bitmap, len); 1710 1711 break; 1712 } 1713 case VM_WRLOCK_CYCLE: { 1714 /* 1715 * Present a test mechanism to acquire/release the write lock 1716 * on the VM without any other effects. 1717 */ 1718 break; 1719 } 1720 case VM_DATA_READ: { 1721 struct vm_data_xfer vdx; 1722 1723 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1724 error = EFAULT; 1725 break; 1726 } 1727 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1728 error = EINVAL; 1729 break; 1730 } 1731 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1732 error = EFBIG; 1733 break; 1734 } 1735 1736 const size_t len = vdx.vdx_len; 1737 void *buf = NULL; 1738 if (len != 0) { 1739 buf = kmem_alloc(len, KM_SLEEP); 1740 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1741 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1742 kmem_free(buf, len); 1743 error = EFAULT; 1744 break; 1745 } else { 1746 bzero(buf, len); 1747 } 1748 } 1749 1750 vdx.vdx_result_len = 0; 1751 vmm_data_req_t req = { 1752 .vdr_class = vdx.vdx_class, 1753 .vdr_version = vdx.vdx_version, 1754 .vdr_flags = vdx.vdx_flags, 1755 .vdr_len = len, 1756 .vdr_data = buf, 1757 .vdr_result_len = &vdx.vdx_result_len, 1758 }; 1759 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1760 1761 if (error == 0 && buf != NULL) { 1762 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1763 error = EFAULT; 1764 } 1765 } 1766 1767 /* 1768 * Copy out the transfer request so that the value of 1769 * vdx_result_len can be made available, regardless of any 1770 * error(s) which may have occurred. 1771 */ 1772 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1773 error = (error != 0) ? 
error : EFAULT; 1774 } 1775 1776 if (buf != NULL) { 1777 kmem_free(buf, len); 1778 } 1779 break; 1780 } 1781 case VM_DATA_WRITE: { 1782 struct vm_data_xfer vdx; 1783 1784 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1785 error = EFAULT; 1786 break; 1787 } 1788 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1789 error = EINVAL; 1790 break; 1791 } 1792 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1793 error = EFBIG; 1794 break; 1795 } 1796 1797 const size_t len = vdx.vdx_len; 1798 void *buf = NULL; 1799 if (len != 0) { 1800 buf = kmem_alloc(len, KM_SLEEP); 1801 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1802 kmem_free(buf, len); 1803 error = EFAULT; 1804 break; 1805 } 1806 } 1807 1808 vdx.vdx_result_len = 0; 1809 vmm_data_req_t req = { 1810 .vdr_class = vdx.vdx_class, 1811 .vdr_version = vdx.vdx_version, 1812 .vdr_flags = vdx.vdx_flags, 1813 .vdr_len = len, 1814 .vdr_data = buf, 1815 .vdr_result_len = &vdx.vdx_result_len, 1816 }; 1817 if (vmm_allow_state_writes == 0) { 1818 /* XXX: Play it safe for now */ 1819 error = EPERM; 1820 } else { 1821 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1822 &req); 1823 } 1824 1825 if (error == 0 && buf != NULL && 1826 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1827 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1828 error = EFAULT; 1829 } 1830 } 1831 1832 /* 1833 * Copy out the transfer request so that the value of 1834 * vdx_result_len can be made available, regardless of any 1835 * error(s) which may have occurred. 1836 */ 1837 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1838 error = (error != 0) ? error : EFAULT; 1839 } 1840 1841 if (buf != NULL) { 1842 kmem_free(buf, len); 1843 } 1844 break; 1845 } 1846 1847 default: 1848 error = ENOTTY; 1849 break; 1850 } 1851 1852 /* Release exclusion resources */ 1853 switch (lock_type) { 1854 case LOCK_NONE: 1855 break; 1856 case LOCK_VCPU: 1857 vcpu_unlock_one(sc, vcpu); 1858 break; 1859 case LOCK_READ_HOLD: 1860 vmm_read_unlock(sc); 1861 break; 1862 case LOCK_WRITE_HOLD: 1863 vmm_write_unlock(sc); 1864 break; 1865 default: 1866 panic("unexpected lock type"); 1867 break; 1868 } 1869 1870 return (error); 1871 } 1872 1873 static vmm_softc_t * 1874 vmm_lookup(const char *name) 1875 { 1876 list_t *vml = &vmm_list; 1877 vmm_softc_t *sc; 1878 1879 ASSERT(MUTEX_HELD(&vmm_mtx)); 1880 1881 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1882 if (strcmp(sc->vmm_name, name) == 0) { 1883 break; 1884 } 1885 } 1886 1887 return (sc); 1888 } 1889 1890 /* 1891 * Acquire an HMA registration if not already held. 1892 */ 1893 static boolean_t 1894 vmm_hma_acquire(void) 1895 { 1896 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1897 1898 mutex_enter(&vmmdev_mtx); 1899 1900 if (vmmdev_hma_reg == NULL) { 1901 VERIFY3U(vmmdev_hma_ref, ==, 0); 1902 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1903 if (vmmdev_hma_reg == NULL) { 1904 cmn_err(CE_WARN, "%s HMA registration failed.", 1905 vmmdev_hvm_name); 1906 mutex_exit(&vmmdev_mtx); 1907 return (B_FALSE); 1908 } 1909 } 1910 1911 vmmdev_hma_ref++; 1912 1913 mutex_exit(&vmmdev_mtx); 1914 1915 return (B_TRUE); 1916 } 1917 1918 /* 1919 * Release the HMA registration if held and there are no remaining VMs. 
1920 */ 1921 static void 1922 vmm_hma_release(void) 1923 { 1924 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1925 1926 mutex_enter(&vmmdev_mtx); 1927 1928 VERIFY3U(vmmdev_hma_ref, !=, 0); 1929 1930 vmmdev_hma_ref--; 1931 1932 if (vmmdev_hma_ref == 0) { 1933 VERIFY(vmmdev_hma_reg != NULL); 1934 hma_unregister(vmmdev_hma_reg); 1935 vmmdev_hma_reg = NULL; 1936 } 1937 mutex_exit(&vmmdev_mtx); 1938 } 1939 1940 static int 1941 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1942 { 1943 vmm_softc_t *sc = NULL; 1944 minor_t minor; 1945 int error = ENOMEM; 1946 size_t len; 1947 const char *name = req->name; 1948 1949 len = strnlen(name, VM_MAX_NAMELEN); 1950 if (len == 0) { 1951 return (EINVAL); 1952 } 1953 if (len >= VM_MAX_NAMELEN) { 1954 return (ENAMETOOLONG); 1955 } 1956 if (strchr(name, '/') != NULL) { 1957 return (EINVAL); 1958 } 1959 1960 if (!vmm_hma_acquire()) 1961 return (ENXIO); 1962 1963 mutex_enter(&vmm_mtx); 1964 1965 /* Look for duplicate names */ 1966 if (vmm_lookup(name) != NULL) { 1967 mutex_exit(&vmm_mtx); 1968 vmm_hma_release(); 1969 return (EEXIST); 1970 } 1971 1972 /* Allow only one instance per non-global zone. */ 1973 if (!INGLOBALZONE(curproc)) { 1974 for (sc = list_head(&vmm_list); sc != NULL; 1975 sc = list_next(&vmm_list, sc)) { 1976 if (sc->vmm_zone == curzone) { 1977 mutex_exit(&vmm_mtx); 1978 vmm_hma_release(); 1979 return (EINVAL); 1980 } 1981 } 1982 } 1983 1984 minor = id_alloc(vmm_minors); 1985 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1986 goto fail; 1987 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1988 ddi_soft_state_free(vmm_statep, minor); 1989 goto fail; 1990 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 1991 DDI_PSEUDO, 0) != DDI_SUCCESS) { 1992 goto fail; 1993 } 1994 1995 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 1996 goto fail; 1997 } 1998 1999 error = vm_create(req->flags, &sc->vmm_vm); 2000 if (error == 0) { 2001 /* Complete VM intialization and report success. */ 2002 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 2003 sc->vmm_minor = minor; 2004 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 2005 offsetof(vmm_devmem_entry_t, vde_node)); 2006 2007 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 2008 offsetof(vmm_hold_t, vmh_node)); 2009 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 2010 2011 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 2012 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 2013 offsetof(vmm_lease_t, vml_node)); 2014 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 2015 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 2016 2017 sc->vmm_zone = crgetzone(cr); 2018 zone_hold(sc->vmm_zone); 2019 vmm_zsd_add_vm(sc); 2020 vmm_kstat_init(sc); 2021 2022 list_insert_tail(&vmm_list, sc); 2023 mutex_exit(&vmm_mtx); 2024 return (0); 2025 } 2026 2027 vmm_kstat_fini(sc); 2028 ddi_remove_minor_node(vmmdev_dip, name); 2029 fail: 2030 id_free(vmm_minors, minor); 2031 if (sc != NULL) { 2032 ddi_soft_state_free(vmm_statep, minor); 2033 } 2034 mutex_exit(&vmm_mtx); 2035 vmm_hma_release(); 2036 2037 return (error); 2038 } 2039 2040 /* 2041 * Bhyve 'Driver' Interface 2042 * 2043 * While many devices are emulated in the bhyve userspace process, there are 2044 * others with performance constraints which require that they run mostly or 2045 * entirely in-kernel. For those not integrated directly into bhyve, an API is 2046 * needed so they can query/manipulate the portions of VM state needed to 2047 * fulfill their purpose. 
2048 * 2049 * This includes: 2050 * - Translating guest-physical addresses to host-virtual pointers 2051 * - Injecting MSIs 2052 * - Hooking IO port addresses 2053 * 2054 * The vmm_drv interface exists to provide that functionality to its consumers. 2055 * (At this time, 'viona' is the only user) 2056 */ 2057 int 2058 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2059 { 2060 vnode_t *vp = fp->f_vnode; 2061 const dev_t dev = vp->v_rdev; 2062 vmm_softc_t *sc; 2063 vmm_hold_t *hold; 2064 int err = 0; 2065 2066 if (vp->v_type != VCHR) { 2067 return (ENXIO); 2068 } 2069 const major_t major = getmajor(dev); 2070 const minor_t minor = getminor(dev); 2071 2072 mutex_enter(&vmmdev_mtx); 2073 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2074 mutex_exit(&vmmdev_mtx); 2075 return (ENOENT); 2076 } 2077 mutex_enter(&vmm_mtx); 2078 mutex_exit(&vmmdev_mtx); 2079 2080 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2081 err = ENOENT; 2082 goto out; 2083 } 2084 /* XXXJOY: check cred permissions against instance */ 2085 2086 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2087 err = EBUSY; 2088 goto out; 2089 } 2090 2091 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2092 hold->vmh_sc = sc; 2093 hold->vmh_release_req = B_FALSE; 2094 2095 list_insert_tail(&sc->vmm_holds, hold); 2096 sc->vmm_flags |= VMM_HELD; 2097 *holdp = hold; 2098 2099 out: 2100 mutex_exit(&vmm_mtx); 2101 return (err); 2102 } 2103 2104 void 2105 vmm_drv_rele(vmm_hold_t *hold) 2106 { 2107 vmm_softc_t *sc; 2108 bool hma_release = false; 2109 2110 ASSERT(hold != NULL); 2111 ASSERT(hold->vmh_sc != NULL); 2112 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2113 2114 mutex_enter(&vmm_mtx); 2115 sc = hold->vmh_sc; 2116 list_remove(&sc->vmm_holds, hold); 2117 kmem_free(hold, sizeof (*hold)); 2118 2119 if (list_is_empty(&sc->vmm_holds)) { 2120 sc->vmm_flags &= ~VMM_HELD; 2121 2122 /* 2123 * Since outstanding holds would prevent instance destruction 2124 * from completing, attempt to finish it now if it was already 2125 * set in motion. 
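 * (The consumer side of this contract, sketched: a driver holding a
 * vmm_hold_t is expected to notice vmm_drv_release_reqd() returning true,
 * quiesce its use of the VM, and call vmm_drv_rele(), which is what lets a
 * pending destroy complete here.)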
2126 */ 2127 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2128 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2129 &hma_release)); 2130 } 2131 } 2132 mutex_exit(&vmm_mtx); 2133 2134 if (hma_release) { 2135 vmm_hma_release(); 2136 } 2137 } 2138 2139 boolean_t 2140 vmm_drv_release_reqd(vmm_hold_t *hold) 2141 { 2142 ASSERT(hold != NULL); 2143 2144 return (hold->vmh_release_req); 2145 } 2146 2147 vmm_lease_t * 2148 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2149 { 2150 vmm_softc_t *sc = hold->vmh_sc; 2151 vmm_lease_t *lease; 2152 2153 ASSERT3P(expiref, !=, NULL); 2154 2155 if (hold->vmh_release_req) { 2156 return (NULL); 2157 } 2158 2159 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2160 list_link_init(&lease->vml_node); 2161 lease->vml_expire_func = expiref; 2162 lease->vml_expire_arg = arg; 2163 lease->vml_expired = B_FALSE; 2164 lease->vml_break_deferred = B_FALSE; 2165 lease->vml_hold = hold; 2166 /* cache the VM pointer for one less pointer chase */ 2167 lease->vml_vm = sc->vmm_vm; 2168 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2169 2170 mutex_enter(&sc->vmm_lease_lock); 2171 while (sc->vmm_lease_blocker != 0) { 2172 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2173 } 2174 list_insert_tail(&sc->vmm_lease_list, lease); 2175 vmm_read_lock(sc); 2176 mutex_exit(&sc->vmm_lease_lock); 2177 2178 return (lease); 2179 } 2180 2181 static void 2182 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2183 { 2184 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2185 2186 list_remove(&sc->vmm_lease_list, lease); 2187 vmm_read_unlock(sc); 2188 vmc_destroy(lease->vml_vmclient); 2189 kmem_free(lease, sizeof (*lease)); 2190 } 2191 2192 static void 2193 vmm_lease_block(vmm_softc_t *sc) 2194 { 2195 mutex_enter(&sc->vmm_lease_lock); 2196 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2197 sc->vmm_lease_blocker++; 2198 if (sc->vmm_lease_blocker == 1) { 2199 list_t *list = &sc->vmm_lease_list; 2200 vmm_lease_t *lease = list_head(list); 2201 2202 while (lease != NULL) { 2203 void *arg = lease->vml_expire_arg; 2204 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2205 boolean_t sync_break = B_FALSE; 2206 2207 /* 2208 * Since the lease expiration notification may 2209 * need to take locks which would deadlock with 2210 * vmm_lease_lock, drop it across the call. 2211 * 2212 * We are the only one allowed to manipulate 2213 * vmm_lease_list right now, so it is safe to 2214 * continue iterating through it after 2215 * reacquiring the lock. 2216 */ 2217 lease->vml_expired = B_TRUE; 2218 mutex_exit(&sc->vmm_lease_lock); 2219 sync_break = expiref(arg); 2220 mutex_enter(&sc->vmm_lease_lock); 2221 2222 if (sync_break) { 2223 vmm_lease_t *next; 2224 2225 /* 2226 * These leases which are synchronously broken 2227 * result in vmm_read_unlock() calls from a 2228 * different thread than the corresponding 2229 * vmm_read_lock(). This is acceptable, given 2230 * that the rwlock underpinning the whole 2231 * mechanism tolerates the behavior. This 2232 * flexibility is _only_ afforded to VM read 2233 * lock (RW_READER) holders. 2234 */ 2235 next = list_next(list, lease); 2236 vmm_lease_break_locked(sc, lease); 2237 lease = next; 2238 } else { 2239 lease = list_next(list, lease); 2240 } 2241 } 2242 2243 /* Process leases which were not broken synchronously. */ 2244 while (!list_is_empty(list)) { 2245 /* 2246 * Although the nested loops are quadratic, the number 2247 * of leases is small. 

static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	vmc_destroy(lease->vml_vmclient);
	kmem_free(lease, sizeof (*lease));
}

static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		while (lease != NULL) {
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock(). This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior. This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				vmm_lease_t *next = list_next(list, lease);
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}

static void
vmm_lease_unblock(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

void
vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
{
	vmm_softc_t *sc = hold->vmh_sc;

	VERIFY3P(hold, ==, lease->vml_hold);
	VERIFY(!lease->vml_break_deferred);

	mutex_enter(&sc->vmm_lease_lock);
	if (sc->vmm_lease_blocker == 0) {
		vmm_lease_break_locked(sc, lease);
	} else {
		/*
		 * Defer the lease-breaking to whichever thread is currently
		 * cleaning up all leases as part of a vmm_lease_block() call.
		 */
		lease->vml_break_deferred = B_TRUE;
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

boolean_t
vmm_drv_lease_expired(vmm_lease_t *lease)
{
	return (lease->vml_expired);
}

vmm_page_t *
vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
{
	ASSERT(lease != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
}

void
vmm_drv_page_release(vmm_page_t *vmmp)
{
	(void) vmp_release((vm_page_t *)vmmp);
}

void
vmm_drv_page_release_chain(vmm_page_t *vmmp)
{
	(void) vmp_release_chain((vm_page_t *)vmmp);
}

const void *
vmm_drv_page_readable(const vmm_page_t *vmmp)
{
	return (vmp_get_readable((const vm_page_t *)vmmp));
}

void *
vmm_drv_page_writable(const vmm_page_t *vmmp)
{
	return (vmp_get_writable((const vm_page_t *)vmmp));
}

void
vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
{
	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
}

vmm_page_t *
vmm_drv_page_next(const vmm_page_t *vmmp)
{
	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
}
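
/*
 * Illustrative sketch (not part of this driver): holding a page of guest
 * memory through a lease, copying data out of it, and releasing it.  The
 * guest-physical address and destination buffer are hypothetical.
 *
 *	vmm_page_t *vmp;
 *
 *	vmp = vmm_drv_page_hold(lease, gpa & PAGEMASK, PROT_READ);
 *	if (vmp == NULL)
 *		return (EFAULT);
 *	bcopy(vmm_drv_page_readable(vmp), buf, PAGESIZE);
 *	vmm_drv_page_release(vmp);
 *
 * Multi-page transfers can chain pages together with vmm_drv_page_chain(),
 * walk them with vmm_drv_page_next(), and drop the whole chain at once with
 * vmm_drv_page_release_chain().
 */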

int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
    void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
	    arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}
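
/*
 * Illustrative sketch (not part of this driver): hooking and unhooking a
 * guest IO port from a vmm_drv consumer.  my_notify_handler is a
 * hypothetical callback of type vmm_drv_iop_cb_t supplied by the consumer,
 * and the port number is likewise made up.
 *
 *	void *cookie = NULL;
 *	int err;
 *
 *	err = vmm_drv_ioport_hook(hold, 0x2f8, my_notify_handler, dev,
 *	    &cookie);
 *	if (err != 0)
 *		return (err);	// EBUSY if hook installation is blocked
 *
 *	... guest accesses to the port are now routed to the callback ...
 *
 *	vmm_drv_ioport_unhook(hold, &cookie);
 */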

static void
vmm_drv_purge(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}

		/*
		 * Require that all leases on the instance be broken, now that
		 * all associated holds have been marked as needing release.
		 *
		 * Dropping vmm_mtx is not strictly necessary, but if any of
		 * the lessees are slow to respond, it would be nice to leave
		 * it available for other parties.
		 */
		mutex_exit(&vmm_mtx);
		vmm_lease_block(sc);
		vmm_lease_unblock(sc);
		mutex_enter(&vmm_mtx);
	}
}

static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}


static void
vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));
	ASSERT0(sc->vmm_flags & VMM_DESTROY);

	sc->vmm_flags |= VMM_DESTROY;

	/*
	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
	 * of guest context, being unable to return now that the instance is
	 * marked for destruction.
	 */
	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
		vcpu_lock_one(sc, vcpu);
		vcpu_unlock_one(sc, vcpu);
	}

	vmmdev_devmem_purge(sc);
	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
		/*
		 * The ZSD should be cleaned up now, unless destruction of the
		 * instance was initiated by destruction of the containing
		 * zone, in which case the ZSD has already been removed.
		 */
		vmm_zsd_rem_vm(sc);
	}
	zone_rele(sc->vmm_zone);

	vmm_drv_purge(sc);
}

static bool
vmm_destroy_ready(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
		VERIFY(list_is_empty(&sc->vmm_holds));
		return (true);
	}

	return (false);
}

static void
vmm_destroy_finish(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));
	ASSERT(vmm_destroy_ready(sc));

	list_remove(&vmm_list, sc);
	vmm_kstat_fini(sc);
	vm_destroy(sc->vmm_vm);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);

	const minor_t minor = sc->vmm_minor;
	ddi_soft_state_free(vmm_statep, minor);
	id_free(vmm_minors, minor);
}

/*
 * Initiate or attempt to finish destruction of a VMM instance.
 *
 * This is called from several contexts:
 * - An explicit destroy ioctl is made
 * - A vmm_drv consumer releases its hold (being the last on the instance)
 * - The vmm device is closed, and auto-destruct is enabled
 */
static int
vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
    bool *hma_release)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = false;

	/*
	 * Once instance destruction begins, the instance is marked so that
	 * any further requests to operate on it will fail.
	 */
	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
		vmm_destroy_begin(sc, opts);
	}

	if (vmm_destroy_ready(sc)) {

		/*
		 * Notify anyone waiting for the destruction to finish. They
		 * must be clear before we can safely tear down the softc.
		 */
		if (sc->vmm_destroy_waiters != 0) {
			cv_broadcast(&sc->vmm_cv);
			while (sc->vmm_destroy_waiters != 0) {
				cv_wait(&sc->vmm_cv, &vmm_mtx);
			}
		}

		/*
		 * Finish destruction of the instance. After this point, the
		 * softc is freed and cannot be accessed again.
		 *
		 * With destruction complete, the HMA hold can be released.
		 */
		vmm_destroy_finish(sc);
		*hma_release = true;
		return (0);
	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
		int err = 0;

		sc->vmm_destroy_waiters++;
		while (!vmm_destroy_ready(sc) && err == 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				err = EINTR;
			}
		}
		sc->vmm_destroy_waiters--;

		if (sc->vmm_destroy_waiters == 0) {
			/*
			 * If we were the last waiter, it could be that VM
			 * destruction is waiting on _us_ to proceed with the
			 * final clean-up.
			 */
			cv_signal(&sc->vmm_cv);
		}
		return (err);
	} else {
		/*
		 * Since the instance is not ready for destruction, and the
		 * caller did not ask to wait, consider it a success for now.
		 */
		return (0);
	}
}

void
vmm_zone_vm_destroy(vmm_softc_t *sc)
{
	bool hma_release = false;
	int err;

	mutex_enter(&vmm_mtx);
	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
	mutex_exit(&vmm_mtx);

	VERIFY0(err);

	if (hma_release) {
		vmm_hma_release();
	}
}

static int
vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
	vmm_softc_t *sc;
	bool hma_release = false;
	int err;

	if (crgetuid(cr) != 0) {
		return (EPERM);
	}

	mutex_enter(&vmm_mtx);
	sc = vmm_lookup(req->name);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also used
	 * for validation during create, and currently vmm names must be
	 * unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}

	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release) {
		vmm_hma_release();
	}

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
		    KSTAT_DATA_UINT64);
		ksp->ks_private = sc->vmm_vm;
		ksp->ks_update = vmm_kstat_update_vcpu;
	}

	kstat_install(sc->vmm_kstat_vm);
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		kstat_install(sc->vmm_kstat_vcpu[i]);
	}
}
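
/*
 * Illustrative sketch (not part of this driver): reading the per-vCPU kstats
 * published above from a userland C program via libkstat.  This assumes the
 * module name is "vmm"; the instance number and error handling are
 * hypothetical.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "vmm", instance, "vcpu0");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "time_run");
 *		if (kn != NULL)
 *			(void) printf("vcpu0 time_run: %llu\n",
 *			    (u_longlong_t)kn->value.ui64);
 *	}
 *	(void) kstat_close(kc);
 */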

static void
vmm_kstat_fini(vmm_softc_t *sc)
{
	ASSERT(sc->vmm_kstat_vm != NULL);

	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		kstat_delete(sc->vmm_kstat_vcpu[i]);
		sc->vmm_kstat_vcpu[i] = NULL;
	}
}

static int
vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t minor;
	vmm_softc_t *sc;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested
	 * and verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	minor = getminor(*devp);
	if (minor == VMM_CTL_MINOR) {
		/*
		 * Master control device must be opened exclusively.
		 */
		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
			return (EINVAL);
		}

		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	sc->vmm_flags |= VMM_IS_OPEN;
	mutex_exit(&vmm_mtx);

	return (0);
}

static int
vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	const minor_t minor = getminor(dev);
	vmm_softc_t *sc;
	bool hma_release = false;

	if (minor == VMM_CTL_MINOR) {
		return (0);
	}

	mutex_enter(&vmm_mtx);
	sc = ddi_get_soft_state(vmm_statep, minor);
	if (sc == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENXIO);
	}

	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
	sc->vmm_flags &= ~VMM_IS_OPEN;

	/*
	 * If the instance was marked for auto-destruction, begin that now.
	 * Instance destruction may have been initiated already, so try to
	 * make progress in that case, since closure of the device is one of
	 * its requirements.
	 */
	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
	}
	mutex_exit(&vmm_mtx);

	if (hma_release) {
		vmm_hma_release();
	}

	return (0);
}

static int
vmm_is_supported(intptr_t arg)
{
	int r;
	const char *msg;

	if (vmm_is_intel()) {
		r = vmx_x86_supported(&msg);
	} else if (vmm_is_svm()) {
		/*
		 * HMA already ensured that the features necessary for SVM
		 * operation were present and online during vmm_attach().
		 */
		r = 0;
	} else {
		r = ENXIO;
		msg = "Unsupported CPU vendor";
	}

	if (r != 0 && arg != (intptr_t)NULL) {
		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
			return (EFAULT);
	}
	return (r);
}

static int
vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	void *argp = (void *)arg;

	switch (cmd) {
	case VMM_CREATE_VM: {
		struct vm_create_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_create(&req, cr));
	}
	case VMM_DESTROY_VM: {
		struct vm_destroy_req req;

		if ((md & FWRITE) == 0) {
			return (EPERM);
		}
		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
			return (EFAULT);
		}
		return (vmmdev_do_vm_destroy(&req, cr));
	}
	case VMM_VM_SUPPORTED:
		return (vmm_is_supported(arg));
	case VMM_CHECK_IOMMU:
		if (!vmm_check_iommu()) {
			return (ENXIO);
		}
		return (0);
	case VMM_RESV_QUERY:
	case VMM_RESV_ADD:
	case VMM_RESV_REMOVE:
		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
	default:
		break;
	}
	/* No other actions are legal on ctl device */
	return (ENOTTY);
}
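
/*
 * Illustrative sketch (not part of this driver): how a privileged userland
 * program might exercise the control ioctls above.  This assumes the control
 * node is linked at /dev/vmmctl and that struct vm_create_req carries the
 * name of the VM to create; the VM name "testvm" is made up.  The exclusive
 * open matches the FEXCL requirement enforced in vmm_open().
 *
 *	int ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	if (ctlfd < 0)
 *		err(EXIT_FAILURE, "could not open control device");
 *
 *	struct vm_create_req req = { 0 };
 *	(void) strlcpy(req.name, "testvm", sizeof (req.name));
 *	if (ioctl(ctlfd, VMM_CREATE_VM, &req) != 0)
 *		err(EXIT_FAILURE, "VMM_CREATE_VM failed");
 */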

static int
vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	vmm_softc_t *sc;
	minor_t minor;

	/*
	 * Forbid running bhyve in a 32-bit process until it has been tested
	 * and verified to be safe.
	 */
	if (curproc->p_model != DATAMODEL_LP64) {
		return (EFBIG);
	}

	/* The structs in bhyve ioctls assume a 64-bit datamodel */
	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
		return (ENOTSUP);
	}

	/*
	 * Regardless of minor (vmmctl or instance), we respond to queries of
	 * the interface version.
	 */
	if (cmd == VMM_INTERFACE_VERSION) {
		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
		return (0);
	}

	minor = getminor(dev);

	if (minor == VMM_CTL_MINOR) {
		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc != NULL);

	/*
	 * Turn away any ioctls against an instance when it is being destroyed.
	 * (Except for the ioctl inquiring about that destroy-in-progress.)
	 */
	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
		if (cmd == VM_DESTROY_PENDING) {
			*rvalp = 1;
			return (0);
		}
		return (ENXIO);
	}

	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
}

static int
vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
{
	vmm_softc_t *sc;
	const minor_t minor = getminor(dev);
	int err;

	if (minor == VMM_CTL_MINOR) {
		return (ENODEV);
	}
	if (off < 0 || (off + len) <= 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (EACCES);
	}

	sc = ddi_get_soft_state(vmm_statep, minor);
	ASSERT(sc);

	if (sc->vmm_flags & VMM_DESTROY)
		return (ENXIO);

	/* Grab read lock on the VM to prevent any changes to the memory map */
	vmm_read_lock(sc);

	if (off >= VM_DEVMEM_START) {
		int segid;
		off_t segoff;

		/* Mapping a devmem "device" */
		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
			err = ENODEV;
		} else {
			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
			    addrp, prot, maxprot, flags);
		}
	} else {
		/* Mapping a part of the guest physical space */
		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
		    maxprot, flags);
	}

	vmm_read_unlock(sc);
	return (err);
}
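
/*
 * Illustrative sketch (not part of this driver): mapping guest-physical
 * memory from userland through the segmap entry point above.  The VM device
 * path, mapping length, and guest-physical address are hypothetical; devmem
 * segments are instead mapped at their assigned offsets at or beyond
 * VM_DEVMEM_START.
 *
 *	int vmfd = open("/dev/vmm/testvm", O_RDWR);
 *	size_t len = 1024 * 1024;
 *	off_t gpa = 0x100000;		// below VM_DEVMEM_START
 *
 *	void *guest_base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, vmfd, gpa);
 */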

static sdev_plugin_validate_t
vmm_sdev_validate(sdev_ctx_t ctx)
{
	const char *name = sdev_ctx_name(ctx);
	vmm_softc_t *sc;
	sdev_plugin_validate_t ret;
	minor_t minor;

	if (sdev_ctx_vtype(ctx) != VCHR)
		return (SDEV_VTOR_INVALID);

	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);

	mutex_enter(&vmm_mtx);
	if ((sc = vmm_lookup(name)) == NULL)
		ret = SDEV_VTOR_INVALID;
	else if (sc->vmm_minor != minor)
		ret = SDEV_VTOR_STALE;
	else
		ret = SDEV_VTOR_VALID;
	mutex_exit(&vmm_mtx);

	return (ret);
}

static int
vmm_sdev_filldir(sdev_ctx_t ctx)
{
	vmm_softc_t *sc;
	int ret;

	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
		return (EINVAL);
	}

	mutex_enter(&vmm_mtx);
	ASSERT(vmmdev_dip != NULL);
	for (sc = list_head(&vmm_list); sc != NULL;
	    sc = list_next(&vmm_list, sc)) {
		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
			    S_IFCHR | 0600,
			    makedevice(ddi_driver_major(vmmdev_dip),
			    sc->vmm_minor));
		} else {
			continue;
		}
		if (ret != 0 && ret != EEXIST)
			goto out;
	}

	ret = 0;

out:
	mutex_exit(&vmm_mtx);
	return (ret);
}

/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t vmm_sdev_ops = {
	.spo_version = SDEV_PLUGIN_VERSION,
	.spo_flags = SDEV_PLUGIN_SUBDIR,
	.spo_validate = vmm_sdev_validate,
	.spo_filldir = vmm_sdev_filldir,
	.spo_inactive = vmm_sdev_inactive
};

/* ARGSUSED */
static int
vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)vmmdev_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	sdev_plugin_hdl_t sph;
	hma_reg_t *reg = NULL;
	boolean_t vmm_loaded = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vmmdev_mtx);
	/* Ensure we are not already attached. */
	if (vmmdev_dip != NULL) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	vmm_sol_glue_init();

	/*
	 * Perform temporary HMA registration to determine if the system
	 * is capable.
	 */
	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
		goto fail;
	} else if (vmm_mod_load() != 0) {
		goto fail;
	}
	vmm_loaded = B_TRUE;
	hma_unregister(reg);
	reg = NULL;

	/* Create control node. Other nodes will be created on demand. */
	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
		goto fail;
	}

	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
	if (sph == (sdev_plugin_hdl_t)NULL) {
		ddi_remove_minor_node(dip, NULL);
		goto fail;
	}

	ddi_report_dev(dip);
	vmmdev_sdev_hdl = sph;
	vmmdev_dip = dip;
	mutex_exit(&vmmdev_mtx);
	return (DDI_SUCCESS);

fail:
	if (vmm_loaded) {
		VERIFY0(vmm_mod_unload());
	}
	if (reg != NULL) {
		hma_unregister(reg);
	}
	vmm_sol_glue_cleanup();
	mutex_exit(&vmmdev_mtx);
	return (DDI_FAILURE);
}

static int
vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that all resources have been cleaned up.
	 *
	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
	 * devinfo locked as iommu_cleanup() tries to recursively lock each
	 * devinfo, including our own, while holding vmmdev_mtx.
	 */
	if (mutex_tryenter(&vmmdev_mtx) == 0)
		return (DDI_FAILURE);

	mutex_enter(&vmm_mtx);
	if (!list_is_empty(&vmm_list)) {
		mutex_exit(&vmm_mtx);
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	mutex_exit(&vmm_mtx);

	if (!vmmr_is_empty()) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}

	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
		mutex_exit(&vmmdev_mtx);
		return (DDI_FAILURE);
	}
	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;

	/* Remove the control node. */
	ddi_remove_minor_node(dip, "ctl");
	vmmdev_dip = NULL;

	VERIFY0(vmm_mod_unload());
	VERIFY3U(vmmdev_hma_reg, ==, NULL);
	vmm_sol_glue_cleanup();

	mutex_exit(&vmmdev_mtx);

	return (DDI_SUCCESS);
}

static struct cb_ops vmm_cb_ops = {
	vmm_open,
	vmm_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	vmm_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	vmm_segmap,
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_DEVMAP
};

static struct dev_ops vmm_ops = {
	DEVO_REV,
	0,
	vmm_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	vmm_attach,
	vmm_detach,
	nodev,		/* reset */
	&vmm_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve vmm",
	&vmm_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	sysinit();

	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&vmm_list, sizeof (vmm_softc_t),
	    offsetof(vmm_softc_t, vmm_node));
	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);

	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
	if (error) {
		return (error);
	}

	vmm_zsd_init();
	vmmr_init();

	error = mod_install(&modlinkage);
	if (error) {
		ddi_soft_state_fini(&vmm_statep);
		vmm_zsd_fini();
		vmmr_fini();
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error) {
		return (error);
	}

	vmm_zsd_fini();
	vmmr_fini();

	ddi_soft_state_fini(&vmm_statep);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}