/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

static kmutex_t		vmmdev_mtx;
static dev_info_t	*vmmdev_dip;
static hma_reg_t	*vmmdev_hma_reg;
static uint_t		vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

static kmutex_t		vmm_mtx;
static list_t		vmm_list;
static id_space_t	*vmm_minors;
static void		*vmm_statep;

/*
 * Until device emulation in bhyve had been adequately scrutinized and tested,
 * there was (justified) concern that unusual or corrupt device state payloads
 * could crash the host when loaded via the vmm-data interface.
 *
 * Now that those concerns have been mitigated, this protection is loosened to
 * default-allow, but the switch is left in place, in case there is a need to
 * once again clamp down on vmm-data writes.
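 *
 * Since this is an ordinary int tunable in the vmm module, the default could
 * presumably be overridden with something like
 * "set vmm:vmm_allow_state_writes = 0" in /etc/system, or adjusted on a live
 * system via mdb -kw (illustrative suggestion only).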
 */
int vmm_allow_state_writes = 1;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT	"/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

struct vmm_lease {
	list_node_t		vml_node;
	struct vm		*vml_vm;
	vm_client_t		*vml_vmclient;
	boolean_t		vml_expired;
	boolean_t		vml_break_deferred;
	boolean_t		(*vml_expire_func)(void *);
	void			*vml_expire_arg;
	struct vmm_hold		*vml_hold;
};

/* Options for vmm_destroy_locked */
typedef enum vmm_destroy_opts {
	VDO_DEFAULT		= 0,
	/*
	 * Indicate that zone-specific-data (ZSD) associated with this VM
	 * should not be cleaned up as part of the destroy.  Skipping ZSD
	 * clean-up is necessary when the VM is being destroyed as part of
	 * zone destruction, when said ZSD is already being cleaned up.
	 */
	VDO_NO_CLEAN_ZSD	= (1 << 0),
	/*
	 * Attempt to wait for VM destruction to complete.  This is opt-in,
	 * since there are many normal conditions which could lead to
	 * destruction being stalled pending other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= (1 << 1),
} vmm_destroy_opts_t;

static void vmm_hma_release(void);
static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
 * Since this would be a hassle from an sdev perspective and would require a
 * new cdev interface (or complicate the existing one), we choose to implement
 * this in a different manner.  Direct access to the underlying vm memory
 * segments is exposed by placing them in a range of offsets beyond the normal
 * guest memory space.  Userspace can query the appropriate offset to mmap()
 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
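 *
 * As a rough illustrative sketch only (the 'vmfd', 'segid', and 'seg_len'
 * names are hypothetical), a userspace consumer might locate and map such a
 * segment like so:
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *p = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}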
159 */ 160 161 static vmm_devmem_entry_t * 162 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 163 { 164 vmm_devmem_entry_t *ent = NULL; 165 list_t *dl = &sc->vmm_devmem_list; 166 167 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 168 if (ent->vde_segid == segid) { 169 return (ent); 170 } 171 } 172 return (NULL); 173 } 174 175 static int 176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 177 { 178 int error; 179 bool sysmem; 180 181 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 182 NULL); 183 if (error || mseg->len == 0) 184 return (error); 185 186 if (!sysmem) { 187 vmm_devmem_entry_t *de; 188 189 de = vmmdev_devmem_find(sc, mseg->segid); 190 if (de != NULL) { 191 (void) strlcpy(mseg->name, de->vde_name, 192 sizeof (mseg->name)); 193 } 194 } else { 195 bzero(mseg->name, sizeof (mseg->name)); 196 } 197 198 return (error); 199 } 200 201 static int 202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 203 { 204 off_t map_offset; 205 vmm_devmem_entry_t *entry; 206 207 if (list_is_empty(&sc->vmm_devmem_list)) { 208 map_offset = VM_DEVMEM_START; 209 } else { 210 entry = list_tail(&sc->vmm_devmem_list); 211 map_offset = entry->vde_off + entry->vde_len; 212 if (map_offset < entry->vde_off) { 213 /* Do not tolerate overflow */ 214 return (ERANGE); 215 } 216 /* 217 * XXXJOY: We could choose to search the list for duplicate 218 * names and toss an error. Since we're using the offset 219 * method for now, it does not make much of a difference. 220 */ 221 } 222 223 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 224 entry->vde_segid = mseg->segid; 225 entry->vde_len = mseg->len; 226 entry->vde_off = map_offset; 227 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 228 list_insert_tail(&sc->vmm_devmem_list, entry); 229 230 return (0); 231 } 232 233 static boolean_t 234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 235 off_t *map_offp) 236 { 237 list_t *dl = &sc->vmm_devmem_list; 238 vmm_devmem_entry_t *de = NULL; 239 const off_t map_end = off + len; 240 241 VERIFY(off >= VM_DEVMEM_START); 242 243 if (map_end < off) { 244 /* No match on overflow */ 245 return (B_FALSE); 246 } 247 248 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 249 const off_t item_end = de->vde_off + de->vde_len; 250 251 if (de->vde_off <= off && item_end >= map_end) { 252 *segidp = de->vde_segid; 253 *map_offp = off - de->vde_off; 254 return (B_TRUE); 255 } 256 } 257 return (B_FALSE); 258 } 259 260 /* 261 * When an instance is being destroyed, the devmem list of named memory objects 262 * can be torn down, as no new mappings are allowed. 263 */ 264 static void 265 vmmdev_devmem_purge(vmm_softc_t *sc) 266 { 267 vmm_devmem_entry_t *entry; 268 269 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 270 kmem_free(entry, sizeof (*entry)); 271 } 272 } 273 274 static int 275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 276 { 277 int error; 278 bool sysmem = true; 279 280 if (VM_MEMSEG_NAME(mseg)) { 281 sysmem = false; 282 } 283 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 284 285 if (error == 0) { 286 /* 287 * Rather than create a whole fresh device from which userspace 288 * can mmap this segment, instead make it available at an 289 * offset above where the main guest memory resides. 
290 */ 291 error = vmmdev_devmem_create(sc, mseg, mseg->name); 292 if (error != 0) { 293 vm_free_memseg(sc->vmm_vm, mseg->segid); 294 } 295 } 296 return (error); 297 } 298 299 /* 300 * Resource Locking and Exclusion 301 * 302 * Much of bhyve depends on key portions of VM state, such as the guest memory 303 * map, to remain unchanged while the guest is running. As ported from 304 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 305 * access to the instance vCPUs. Threads acting on a single vCPU, like those 306 * performing the work of actually running the guest in VMX/SVM, would lock 307 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 308 * state, all of the vCPUs would be first locked, ensuring that the 309 * operation(s) could complete without any other threads stumbling into 310 * intermediate states. 311 * 312 * This approach is largely effective for bhyve. Common operations, such as 313 * running the vCPUs, steer clear of lock contention. The model begins to 314 * break down for operations which do not occur in the context of a specific 315 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 316 * thread in the bhyve process. In order to properly protect those vCPU-less 317 * operations from encountering invalid states, additional locking is required. 318 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 319 * It does mean that class of operations will be serialized on locking the 320 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 321 * undue contention on the VM_MAXCPU-1 vCPU. 322 * 323 * In order to address the shortcomings of this model, the concept of a 324 * read/write lock has been added to bhyve. Operations which change 325 * fundamental aspects of a VM (such as the memory map) must acquire the write 326 * lock, which also implies locking all of the vCPUs and waiting for all read 327 * lock holders to release. While it increases the cost and waiting time for 328 * those few operations, it allows most hot-path operations on the VM (which 329 * depend on its configuration remaining stable) to occur with minimal locking. 330 * 331 * Consumers of the Driver API (see below) are a special case when it comes to 332 * this locking, since they may hold a read lock via the drv_lease mechanism 333 * for an extended period of time. Rather than forcing those consumers to 334 * continuously poll for a write lock attempt, the lease system forces them to 335 * provide a release callback to trigger their clean-up (and potential later 336 * reacquisition) of the read lock. 337 */ 338 339 static void 340 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 341 { 342 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 343 344 /* 345 * Since this state transition is utilizing from_idle=true, it should 346 * not fail, but rather block until it can be successful. 
347 */ 348 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 349 } 350 351 static void 352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 353 { 354 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 355 356 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 357 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 358 } 359 360 static void 361 vmm_read_lock(vmm_softc_t *sc) 362 { 363 rw_enter(&sc->vmm_rwlock, RW_READER); 364 } 365 366 static void 367 vmm_read_unlock(vmm_softc_t *sc) 368 { 369 rw_exit(&sc->vmm_rwlock); 370 } 371 372 static void 373 vmm_write_lock(vmm_softc_t *sc) 374 { 375 int maxcpus; 376 377 /* First lock all the vCPUs */ 378 maxcpus = vm_get_maxcpus(sc->vmm_vm); 379 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 380 vcpu_lock_one(sc, vcpu); 381 } 382 383 /* 384 * Block vmm_drv leases from being acquired or held while the VM write 385 * lock is held. 386 */ 387 vmm_lease_block(sc); 388 389 rw_enter(&sc->vmm_rwlock, RW_WRITER); 390 /* 391 * For now, the 'maxcpus' value for an instance is fixed at the 392 * compile-time constant of VM_MAXCPU at creation. If this changes in 393 * the future, allowing for dynamic vCPU resource sizing, acquisition 394 * of the write lock will need to be wary of such changes. 395 */ 396 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 397 } 398 399 static void 400 vmm_write_unlock(vmm_softc_t *sc) 401 { 402 int maxcpus; 403 404 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 405 vmm_lease_unblock(sc); 406 407 /* 408 * The VM write lock _must_ be released from the same thread it was 409 * acquired in, unlike the read lock. 410 */ 411 VERIFY(rw_write_held(&sc->vmm_rwlock)); 412 rw_exit(&sc->vmm_rwlock); 413 414 /* Unlock all the vCPUs */ 415 maxcpus = vm_get_maxcpus(sc->vmm_vm); 416 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 417 vcpu_unlock_one(sc, vcpu); 418 } 419 } 420 421 static int 422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 423 cred_t *credp, int *rvalp) 424 { 425 int error = 0, vcpu = -1; 426 void *datap = (void *)arg; 427 enum vm_lock_type { 428 LOCK_NONE = 0, 429 LOCK_VCPU, 430 LOCK_READ_HOLD, 431 LOCK_WRITE_HOLD 432 } lock_type = LOCK_NONE; 433 434 /* Acquire any exclusion resources needed for the operation. */ 435 switch (cmd) { 436 case VM_RUN: 437 case VM_GET_REGISTER: 438 case VM_SET_REGISTER: 439 case VM_GET_SEGMENT_DESCRIPTOR: 440 case VM_SET_SEGMENT_DESCRIPTOR: 441 case VM_GET_REGISTER_SET: 442 case VM_SET_REGISTER_SET: 443 case VM_INJECT_EXCEPTION: 444 case VM_GET_CAPABILITY: 445 case VM_SET_CAPABILITY: 446 case VM_PPTDEV_MSI: 447 case VM_PPTDEV_MSIX: 448 case VM_SET_X2APIC_STATE: 449 case VM_GLA2GPA: 450 case VM_GLA2GPA_NOFAULT: 451 case VM_ACTIVATE_CPU: 452 case VM_SET_INTINFO: 453 case VM_GET_INTINFO: 454 case VM_RESTART_INSTRUCTION: 455 case VM_SET_KERNEMU_DEV: 456 case VM_GET_KERNEMU_DEV: 457 case VM_RESET_CPU: 458 case VM_GET_RUN_STATE: 459 case VM_SET_RUN_STATE: 460 case VM_GET_FPU: 461 case VM_SET_FPU: 462 case VM_GET_CPUID: 463 case VM_SET_CPUID: 464 case VM_LEGACY_CPUID: 465 /* 466 * Copy in the ID of the vCPU chosen for this operation. 467 * Since a nefarious caller could update their struct between 468 * this locking and when the rest of the ioctl data is copied 469 * in, it is _critical_ that this local 'vcpu' variable be used 470 * rather than the in-struct one when performing the ioctl. 
471 */ 472 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 473 return (EFAULT); 474 } 475 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) { 476 return (EINVAL); 477 } 478 vcpu_lock_one(sc, vcpu); 479 lock_type = LOCK_VCPU; 480 break; 481 482 case VM_REINIT: 483 case VM_BIND_PPTDEV: 484 case VM_UNBIND_PPTDEV: 485 case VM_MAP_PPTDEV_MMIO: 486 case VM_UNMAP_PPTDEV_MMIO: 487 case VM_ALLOC_MEMSEG: 488 case VM_MMAP_MEMSEG: 489 case VM_MUNMAP_MEMSEG: 490 case VM_WRLOCK_CYCLE: 491 case VM_PMTMR_LOCATE: 492 case VM_PAUSE: 493 case VM_RESUME: 494 vmm_write_lock(sc); 495 lock_type = LOCK_WRITE_HOLD; 496 break; 497 498 case VM_GET_MEMSEG: 499 case VM_MMAP_GETNEXT: 500 case VM_LAPIC_IRQ: 501 case VM_INJECT_NMI: 502 case VM_IOAPIC_ASSERT_IRQ: 503 case VM_IOAPIC_DEASSERT_IRQ: 504 case VM_IOAPIC_PULSE_IRQ: 505 case VM_LAPIC_MSI: 506 case VM_LAPIC_LOCAL_IRQ: 507 case VM_GET_X2APIC_STATE: 508 case VM_RTC_READ: 509 case VM_RTC_WRITE: 510 case VM_RTC_SETTIME: 511 case VM_RTC_GETTIME: 512 case VM_PPTDEV_DISABLE_MSIX: 513 case VM_DEVMEM_GETOFFSET: 514 case VM_TRACK_DIRTY_PAGES: 515 vmm_read_lock(sc); 516 lock_type = LOCK_READ_HOLD; 517 break; 518 519 case VM_DATA_READ: 520 case VM_DATA_WRITE: 521 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 522 return (EFAULT); 523 } 524 if (vcpu == -1) { 525 /* Access data for VM-wide devices */ 526 vmm_write_lock(sc); 527 lock_type = LOCK_WRITE_HOLD; 528 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 529 /* Access data associated with a specific vCPU */ 530 vcpu_lock_one(sc, vcpu); 531 lock_type = LOCK_VCPU; 532 } else { 533 return (EINVAL); 534 } 535 break; 536 537 case VM_GET_GPA_PMAP: 538 case VM_IOAPIC_PINCOUNT: 539 case VM_SUSPEND: 540 case VM_DESC_FPU_AREA: 541 case VM_SET_AUTODESTRUCT: 542 case VM_DESTROY_SELF: 543 case VM_DESTROY_PENDING: 544 case VM_VCPU_BARRIER: 545 default: 546 break; 547 } 548 549 /* Execute the primary logic for the ioctl. */ 550 switch (cmd) { 551 case VM_RUN: { 552 struct vm_entry entry; 553 554 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 555 error = EFAULT; 556 break; 557 } 558 559 if (!(curthread->t_schedflag & TS_VCPU)) 560 smt_mark_as_vcpu(); 561 562 error = vm_run(sc->vmm_vm, vcpu, &entry); 563 564 /* 565 * Unexpected states in vm_run() are expressed through positive 566 * errno-oriented return values. VM states which expect further 567 * processing in userspace (necessary context via exitinfo) are 568 * expressed through negative return values. For the time being 569 * a return value of 0 is not expected from vm_run(). 570 */ 571 ASSERT(error != 0); 572 if (error < 0) { 573 const struct vm_exit *vme; 574 void *outp = entry.exit_data; 575 576 error = 0; 577 vme = vm_exitinfo(sc->vmm_vm, vcpu); 578 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 579 error = EFAULT; 580 } 581 } 582 break; 583 } 584 case VM_SUSPEND: { 585 struct vm_suspend vmsuspend; 586 587 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 588 error = EFAULT; 589 break; 590 } 591 error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source); 592 break; 593 } 594 case VM_REINIT: { 595 struct vm_reinit reinit; 596 597 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 598 error = EFAULT; 599 break; 600 } 601 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 602 /* 603 * The VM instance should be free of driver-attached 604 * hooks during the reinitialization process. 
605 */ 606 break; 607 } 608 error = vm_reinit(sc->vmm_vm, reinit.flags); 609 (void) vmm_drv_block_hook(sc, B_FALSE); 610 break; 611 } 612 case VM_STAT_DESC: { 613 struct vm_stat_desc statdesc; 614 615 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 616 error = EFAULT; 617 break; 618 } 619 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 620 sizeof (statdesc.desc)); 621 if (error == 0 && 622 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 623 error = EFAULT; 624 break; 625 } 626 break; 627 } 628 case VM_STATS_IOC: { 629 struct vm_stats vmstats; 630 631 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 632 error = EFAULT; 633 break; 634 } 635 hrt2tv(gethrtime(), &vmstats.tv); 636 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 637 nitems(vmstats.statbuf), 638 &vmstats.num_entries, vmstats.statbuf); 639 if (error == 0 && 640 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 641 error = EFAULT; 642 break; 643 } 644 break; 645 } 646 647 case VM_PPTDEV_MSI: { 648 struct vm_pptdev_msi pptmsi; 649 650 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 651 error = EFAULT; 652 break; 653 } 654 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 655 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 656 break; 657 } 658 case VM_PPTDEV_MSIX: { 659 struct vm_pptdev_msix pptmsix; 660 661 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 662 error = EFAULT; 663 break; 664 } 665 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 666 pptmsix.idx, pptmsix.addr, pptmsix.msg, 667 pptmsix.vector_control); 668 break; 669 } 670 case VM_PPTDEV_DISABLE_MSIX: { 671 struct vm_pptdev pptdev; 672 673 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 674 error = EFAULT; 675 break; 676 } 677 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 678 break; 679 } 680 case VM_MAP_PPTDEV_MMIO: { 681 struct vm_pptdev_mmio pptmmio; 682 683 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 684 error = EFAULT; 685 break; 686 } 687 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 688 pptmmio.len, pptmmio.hpa); 689 break; 690 } 691 case VM_UNMAP_PPTDEV_MMIO: { 692 struct vm_pptdev_mmio pptmmio; 693 694 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 695 error = EFAULT; 696 break; 697 } 698 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 699 pptmmio.len); 700 break; 701 } 702 case VM_BIND_PPTDEV: { 703 struct vm_pptdev pptdev; 704 705 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 706 error = EFAULT; 707 break; 708 } 709 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 710 break; 711 } 712 case VM_UNBIND_PPTDEV: { 713 struct vm_pptdev pptdev; 714 715 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 716 error = EFAULT; 717 break; 718 } 719 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 720 break; 721 } 722 case VM_GET_PPTDEV_LIMITS: { 723 struct vm_pptdev_limits pptlimits; 724 725 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 726 error = EFAULT; 727 break; 728 } 729 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 730 &pptlimits.msi_limit, &pptlimits.msix_limit); 731 if (error == 0 && 732 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 733 error = EFAULT; 734 break; 735 } 736 break; 737 } 738 case VM_INJECT_EXCEPTION: { 739 struct vm_exception vmexc; 740 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 741 error = EFAULT; 742 break; 743 } 744 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 745 vmexc.error_code_valid != 0, vmexc.error_code, 746 
vmexc.restart_instruction != 0); 747 break; 748 } 749 case VM_INJECT_NMI: { 750 struct vm_nmi vmnmi; 751 752 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 753 error = EFAULT; 754 break; 755 } 756 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 757 break; 758 } 759 case VM_LAPIC_IRQ: { 760 struct vm_lapic_irq vmirq; 761 762 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 763 error = EFAULT; 764 break; 765 } 766 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 767 break; 768 } 769 case VM_LAPIC_LOCAL_IRQ: { 770 struct vm_lapic_irq vmirq; 771 772 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 773 error = EFAULT; 774 break; 775 } 776 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 777 vmirq.vector); 778 break; 779 } 780 case VM_LAPIC_MSI: { 781 struct vm_lapic_msi vmmsi; 782 783 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 784 error = EFAULT; 785 break; 786 } 787 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 788 break; 789 } 790 791 case VM_IOAPIC_ASSERT_IRQ: { 792 struct vm_ioapic_irq ioapic_irq; 793 794 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 795 error = EFAULT; 796 break; 797 } 798 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 799 break; 800 } 801 case VM_IOAPIC_DEASSERT_IRQ: { 802 struct vm_ioapic_irq ioapic_irq; 803 804 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 805 error = EFAULT; 806 break; 807 } 808 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 809 break; 810 } 811 case VM_IOAPIC_PULSE_IRQ: { 812 struct vm_ioapic_irq ioapic_irq; 813 814 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 815 error = EFAULT; 816 break; 817 } 818 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 819 break; 820 } 821 case VM_IOAPIC_PINCOUNT: { 822 int pincount; 823 824 pincount = vioapic_pincount(sc->vmm_vm); 825 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 826 error = EFAULT; 827 break; 828 } 829 break; 830 } 831 case VM_DESC_FPU_AREA: { 832 struct vm_fpu_desc desc; 833 void *buf = NULL; 834 835 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 836 error = EFAULT; 837 break; 838 } 839 if (desc.vfd_num_entries > 64) { 840 error = EINVAL; 841 break; 842 } 843 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 844 desc.vfd_num_entries; 845 if (buf_sz != 0) { 846 buf = kmem_zalloc(buf_sz, KM_SLEEP); 847 } 848 849 /* 850 * For now, we are depending on vm_fpu_desc_entry and 851 * hma_xsave_state_desc_t having the same format. 852 */ 853 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 854 sizeof (hma_xsave_state_desc_t)); 855 856 size_t req_size; 857 const uint_t max_entries = hma_fpu_describe_xsave_state( 858 (hma_xsave_state_desc_t *)buf, 859 desc.vfd_num_entries, 860 &req_size); 861 862 desc.vfd_req_size = req_size; 863 desc.vfd_num_entries = max_entries; 864 if (buf_sz != 0) { 865 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 866 error = EFAULT; 867 } 868 kmem_free(buf, buf_sz); 869 } 870 871 if (error == 0) { 872 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 873 error = EFAULT; 874 } 875 } 876 break; 877 } 878 case VM_SET_AUTODESTRUCT: { 879 /* 880 * Since this has to do with controlling the lifetime of the 881 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather 882 * than the vcpu-centric or rwlock exclusion mechanisms. 
883 */ 884 mutex_enter(&vmm_mtx); 885 if (arg != 0) { 886 sc->vmm_flags |= VMM_AUTODESTROY; 887 } else { 888 sc->vmm_flags &= ~VMM_AUTODESTROY; 889 } 890 mutex_exit(&vmm_mtx); 891 break; 892 } 893 case VM_DESTROY_SELF: { 894 bool hma_release = false; 895 896 /* 897 * Just like VMM_DESTROY_VM, but on the instance file descriptor 898 * itself, rather than having to perform a racy name lookup as 899 * part of the destroy process. 900 * 901 * Since vmm_destroy_locked() performs vCPU lock acquisition in 902 * order to kick the vCPUs out of guest context as part of any 903 * destruction, we do not need to worry about it ourself using 904 * the `lock_type` logic here. 905 */ 906 mutex_enter(&vmm_mtx); 907 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 908 mutex_exit(&vmm_mtx); 909 if (hma_release) { 910 vmm_hma_release(); 911 } 912 break; 913 } 914 case VM_DESTROY_PENDING: { 915 /* 916 * If we have made it this far, then destruction of the instance 917 * has not been initiated. 918 */ 919 *rvalp = 0; 920 break; 921 } 922 923 case VM_ISA_ASSERT_IRQ: { 924 struct vm_isa_irq isa_irq; 925 926 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 927 error = EFAULT; 928 break; 929 } 930 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 931 if (error == 0 && isa_irq.ioapic_irq != -1) { 932 error = vioapic_assert_irq(sc->vmm_vm, 933 isa_irq.ioapic_irq); 934 } 935 break; 936 } 937 case VM_ISA_DEASSERT_IRQ: { 938 struct vm_isa_irq isa_irq; 939 940 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 941 error = EFAULT; 942 break; 943 } 944 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 945 if (error == 0 && isa_irq.ioapic_irq != -1) { 946 error = vioapic_deassert_irq(sc->vmm_vm, 947 isa_irq.ioapic_irq); 948 } 949 break; 950 } 951 case VM_ISA_PULSE_IRQ: { 952 struct vm_isa_irq isa_irq; 953 954 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 955 error = EFAULT; 956 break; 957 } 958 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 959 if (error == 0 && isa_irq.ioapic_irq != -1) { 960 error = vioapic_pulse_irq(sc->vmm_vm, 961 isa_irq.ioapic_irq); 962 } 963 break; 964 } 965 case VM_ISA_SET_IRQ_TRIGGER: { 966 struct vm_isa_irq_trigger isa_irq_trigger; 967 968 if (ddi_copyin(datap, &isa_irq_trigger, 969 sizeof (isa_irq_trigger), md)) { 970 error = EFAULT; 971 break; 972 } 973 error = vatpic_set_irq_trigger(sc->vmm_vm, 974 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 975 break; 976 } 977 978 case VM_MMAP_GETNEXT: { 979 struct vm_memmap mm; 980 981 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 982 error = EFAULT; 983 break; 984 } 985 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 986 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 987 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 988 error = EFAULT; 989 break; 990 } 991 break; 992 } 993 case VM_MMAP_MEMSEG: { 994 struct vm_memmap mm; 995 996 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 997 error = EFAULT; 998 break; 999 } 1000 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 1001 mm.len, mm.prot, mm.flags); 1002 break; 1003 } 1004 case VM_MUNMAP_MEMSEG: { 1005 struct vm_munmap mu; 1006 1007 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 1008 error = EFAULT; 1009 break; 1010 } 1011 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 1012 break; 1013 } 1014 case VM_ALLOC_MEMSEG: { 1015 struct vm_memseg vmseg; 1016 1017 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 1018 error = EFAULT; 1019 break; 1020 } 1021 error = vmmdev_alloc_memseg(sc, &vmseg); 1022 
		break;
	}
	case VM_GET_MEMSEG: {
		struct vm_memseg vmseg;

		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		error = vmmdev_get_memseg(sc, &vmseg);
		if (error == 0 &&
		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state.  Callers
			 * should be wary of this.
1158 */ 1159 if (regnums[i] < 0) { 1160 error = EINVAL; 1161 break; 1162 } 1163 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1164 regvals[i]); 1165 } 1166 break; 1167 } 1168 case VM_RESET_CPU: { 1169 struct vm_vcpu_reset vvr; 1170 1171 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1172 error = EFAULT; 1173 break; 1174 } 1175 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1176 error = EINVAL; 1177 } 1178 1179 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1180 break; 1181 } 1182 case VM_GET_RUN_STATE: { 1183 struct vm_run_state vrs; 1184 1185 bzero(&vrs, sizeof (vrs)); 1186 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1187 &vrs.sipi_vector); 1188 if (error == 0) { 1189 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1190 error = EFAULT; 1191 break; 1192 } 1193 } 1194 break; 1195 } 1196 case VM_SET_RUN_STATE: { 1197 struct vm_run_state vrs; 1198 1199 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1200 error = EFAULT; 1201 break; 1202 } 1203 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1204 vrs.sipi_vector); 1205 break; 1206 } 1207 case VM_GET_FPU: { 1208 struct vm_fpu_state req; 1209 const size_t max_len = (PAGESIZE * 2); 1210 void *kbuf; 1211 1212 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1213 error = EFAULT; 1214 break; 1215 } 1216 if (req.len > max_len || req.len == 0) { 1217 error = EINVAL; 1218 break; 1219 } 1220 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1221 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1222 if (error == 0) { 1223 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1224 error = EFAULT; 1225 } 1226 } 1227 kmem_free(kbuf, req.len); 1228 break; 1229 } 1230 case VM_SET_FPU: { 1231 struct vm_fpu_state req; 1232 const size_t max_len = (PAGESIZE * 2); 1233 void *kbuf; 1234 1235 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1236 error = EFAULT; 1237 break; 1238 } 1239 if (req.len > max_len || req.len == 0) { 1240 error = EINVAL; 1241 break; 1242 } 1243 kbuf = kmem_alloc(req.len, KM_SLEEP); 1244 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1245 error = EFAULT; 1246 } else { 1247 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1248 } 1249 kmem_free(kbuf, req.len); 1250 break; 1251 } 1252 case VM_GET_CPUID: { 1253 struct vm_vcpu_cpuid_config cfg; 1254 struct vcpu_cpuid_entry *entries = NULL; 1255 1256 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1257 error = EFAULT; 1258 break; 1259 } 1260 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1261 error = EINVAL; 1262 break; 1263 } 1264 1265 const size_t entries_size = 1266 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1267 if (entries_size != 0) { 1268 entries = kmem_zalloc(entries_size, KM_SLEEP); 1269 } 1270 1271 vcpu_cpuid_config_t vm_cfg = { 1272 .vcc_nent = cfg.vvcc_nent, 1273 .vcc_entries = entries, 1274 }; 1275 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1276 1277 /* 1278 * Only attempt to copy out the resultant entries if we were 1279 * able to query them from the instance. The flags and number 1280 * of entries are emitted regardless. 
1281 */ 1282 cfg.vvcc_flags = vm_cfg.vcc_flags; 1283 cfg.vvcc_nent = vm_cfg.vcc_nent; 1284 if (entries != NULL) { 1285 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries, 1286 entries_size, md) != 0) { 1287 error = EFAULT; 1288 } 1289 1290 kmem_free(entries, entries_size); 1291 } 1292 1293 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) { 1294 error = EFAULT; 1295 } 1296 break; 1297 } 1298 case VM_SET_CPUID: { 1299 struct vm_vcpu_cpuid_config cfg; 1300 struct vcpu_cpuid_entry *entries = NULL; 1301 size_t entries_size = 0; 1302 1303 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) { 1304 error = EFAULT; 1305 break; 1306 } 1307 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) { 1308 error = EFBIG; 1309 break; 1310 } 1311 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) { 1312 /* 1313 * If we are being instructed to use "legacy" handling, 1314 * then no entries should be provided, since the static 1315 * in-kernel masking will be used. 1316 */ 1317 if (cfg.vvcc_nent != 0) { 1318 error = EINVAL; 1319 break; 1320 } 1321 } else if (cfg.vvcc_nent != 0) { 1322 entries_size = 1323 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry); 1324 entries = kmem_alloc(entries_size, KM_SLEEP); 1325 1326 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size, 1327 md) != 0) { 1328 error = EFAULT; 1329 kmem_free(entries, entries_size); 1330 break; 1331 } 1332 } 1333 1334 vcpu_cpuid_config_t vm_cfg = { 1335 .vcc_flags = cfg.vvcc_flags, 1336 .vcc_nent = cfg.vvcc_nent, 1337 .vcc_entries = entries, 1338 }; 1339 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg); 1340 1341 if (entries != NULL) { 1342 kmem_free(entries, entries_size); 1343 } 1344 break; 1345 } 1346 case VM_LEGACY_CPUID: { 1347 struct vm_legacy_cpuid vlc; 1348 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) { 1349 error = EFAULT; 1350 break; 1351 } 1352 vlc.vlc_vcpuid = vcpu; 1353 1354 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax, 1355 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx); 1356 1357 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) { 1358 error = EFAULT; 1359 break; 1360 } 1361 break; 1362 } 1363 1364 case VM_SET_KERNEMU_DEV: 1365 case VM_GET_KERNEMU_DEV: { 1366 struct vm_readwrite_kernemu_device kemu; 1367 size_t size = 0; 1368 1369 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1370 error = EFAULT; 1371 break; 1372 } 1373 1374 if (kemu.access_width > 3) { 1375 error = EINVAL; 1376 break; 1377 } 1378 size = (1 << kemu.access_width); 1379 ASSERT(size >= 1 && size <= 8); 1380 1381 if (cmd == VM_SET_KERNEMU_DEV) { 1382 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1383 kemu.gpa, kemu.value, size); 1384 } else { 1385 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1386 kemu.gpa, &kemu.value, size); 1387 } 1388 1389 if (error == 0) { 1390 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1391 error = EFAULT; 1392 break; 1393 } 1394 } 1395 break; 1396 } 1397 1398 case VM_GET_CAPABILITY: { 1399 struct vm_capability vmcap; 1400 1401 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1402 error = EFAULT; 1403 break; 1404 } 1405 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1406 &vmcap.capval); 1407 if (error == 0 && 1408 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1409 error = EFAULT; 1410 break; 1411 } 1412 break; 1413 } 1414 case VM_SET_CAPABILITY: { 1415 struct vm_capability vmcap; 1416 1417 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1418 error = EFAULT; 1419 break; 1420 } 1421 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1422 vmcap.capval); 1423 break; 1424 } 1425 case VM_SET_X2APIC_STATE: { 
1426 struct vm_x2apic x2apic; 1427 1428 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1429 error = EFAULT; 1430 break; 1431 } 1432 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1433 break; 1434 } 1435 case VM_GET_X2APIC_STATE: { 1436 struct vm_x2apic x2apic; 1437 1438 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1439 error = EFAULT; 1440 break; 1441 } 1442 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1443 &x2apic.state); 1444 if (error == 0 && 1445 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1446 error = EFAULT; 1447 break; 1448 } 1449 break; 1450 } 1451 case VM_GET_GPA_PMAP: { 1452 /* 1453 * Until there is a necessity to leak EPT/RVI PTE values to 1454 * userspace, this will remain unimplemented 1455 */ 1456 error = EINVAL; 1457 break; 1458 } 1459 case VM_GET_HPET_CAPABILITIES: { 1460 struct vm_hpet_cap hpetcap; 1461 1462 error = vhpet_getcap(&hpetcap); 1463 if (error == 0 && 1464 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1465 error = EFAULT; 1466 break; 1467 } 1468 break; 1469 } 1470 case VM_GLA2GPA: { 1471 struct vm_gla2gpa gg; 1472 1473 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1474 error = EFAULT; 1475 break; 1476 } 1477 gg.vcpuid = vcpu; 1478 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1479 gg.prot, &gg.gpa, &gg.fault); 1480 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1481 error = EFAULT; 1482 break; 1483 } 1484 break; 1485 } 1486 case VM_GLA2GPA_NOFAULT: { 1487 struct vm_gla2gpa gg; 1488 1489 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1490 error = EFAULT; 1491 break; 1492 } 1493 gg.vcpuid = vcpu; 1494 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1495 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1496 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1497 error = EFAULT; 1498 break; 1499 } 1500 break; 1501 } 1502 1503 case VM_ACTIVATE_CPU: 1504 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1505 break; 1506 1507 case VM_SUSPEND_CPU: 1508 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1509 error = EFAULT; 1510 } else { 1511 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1512 } 1513 break; 1514 1515 case VM_RESUME_CPU: 1516 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1517 error = EFAULT; 1518 } else { 1519 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1520 } 1521 break; 1522 1523 case VM_VCPU_BARRIER: 1524 vcpu = arg; 1525 error = vm_vcpu_barrier(sc->vmm_vm, vcpu); 1526 break; 1527 1528 case VM_GET_CPUS: { 1529 struct vm_cpuset vm_cpuset; 1530 cpuset_t tempset; 1531 void *srcp = &tempset; 1532 int size; 1533 1534 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1535 error = EFAULT; 1536 break; 1537 } 1538 1539 /* Be more generous about sizing since our cpuset_t is large. */ 1540 size = vm_cpuset.cpusetsize; 1541 if (size <= 0 || size > sizeof (cpuset_t)) { 1542 error = ERANGE; 1543 } 1544 /* 1545 * If they want a ulong_t or less, make sure they receive the 1546 * low bits with all the useful information. 
1547 */ 1548 if (size <= sizeof (tempset.cpub[0])) { 1549 srcp = &tempset.cpub[0]; 1550 } 1551 1552 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1553 tempset = vm_active_cpus(sc->vmm_vm); 1554 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1555 tempset = vm_debug_cpus(sc->vmm_vm); 1556 } else { 1557 error = EINVAL; 1558 } 1559 1560 ASSERT(size > 0 && size <= sizeof (tempset)); 1561 if (error == 0 && 1562 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1563 error = EFAULT; 1564 break; 1565 } 1566 break; 1567 } 1568 case VM_SET_INTINFO: { 1569 struct vm_intinfo vmii; 1570 1571 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1572 error = EFAULT; 1573 break; 1574 } 1575 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1576 break; 1577 } 1578 case VM_GET_INTINFO: { 1579 struct vm_intinfo vmii; 1580 1581 vmii.vcpuid = vcpu; 1582 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1583 &vmii.info2); 1584 if (error == 0 && 1585 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1586 error = EFAULT; 1587 break; 1588 } 1589 break; 1590 } 1591 case VM_RTC_WRITE: { 1592 struct vm_rtc_data rtcdata; 1593 1594 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1595 error = EFAULT; 1596 break; 1597 } 1598 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1599 rtcdata.value); 1600 break; 1601 } 1602 case VM_RTC_READ: { 1603 struct vm_rtc_data rtcdata; 1604 1605 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1606 error = EFAULT; 1607 break; 1608 } 1609 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1610 &rtcdata.value); 1611 if (error == 0 && 1612 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1613 error = EFAULT; 1614 break; 1615 } 1616 break; 1617 } 1618 case VM_RTC_SETTIME: { 1619 timespec_t ts; 1620 1621 if (ddi_copyin(datap, &ts, sizeof (ts), md)) { 1622 error = EFAULT; 1623 break; 1624 } 1625 error = vrtc_set_time(sc->vmm_vm, &ts); 1626 break; 1627 } 1628 case VM_RTC_GETTIME: { 1629 timespec_t ts; 1630 1631 vrtc_get_time(sc->vmm_vm, &ts); 1632 if (ddi_copyout(&ts, datap, sizeof (ts), md)) { 1633 error = EFAULT; 1634 break; 1635 } 1636 break; 1637 } 1638 1639 case VM_PMTMR_LOCATE: { 1640 uint16_t port = arg; 1641 error = vpmtmr_set_location(sc->vmm_vm, port); 1642 break; 1643 } 1644 1645 case VM_RESTART_INSTRUCTION: 1646 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1647 break; 1648 1649 case VM_SET_TOPOLOGY: { 1650 struct vm_cpu_topology topo; 1651 1652 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1653 error = EFAULT; 1654 break; 1655 } 1656 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1657 topo.threads, topo.maxcpus); 1658 break; 1659 } 1660 case VM_GET_TOPOLOGY: { 1661 struct vm_cpu_topology topo; 1662 1663 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1664 &topo.threads, &topo.maxcpus); 1665 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1666 error = EFAULT; 1667 break; 1668 } 1669 break; 1670 } 1671 case VM_DEVMEM_GETOFFSET: { 1672 struct vm_devmem_offset vdo; 1673 vmm_devmem_entry_t *de; 1674 1675 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1676 error = EFAULT; 1677 break; 1678 } 1679 1680 de = vmmdev_devmem_find(sc, vdo.segid); 1681 if (de != NULL) { 1682 vdo.offset = de->vde_off; 1683 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1684 error = EFAULT; 1685 } 1686 } else { 1687 error = ENOENT; 1688 } 1689 break; 1690 } 1691 case VM_TRACK_DIRTY_PAGES: { 1692 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1693 struct vmm_dirty_tracker tracker; 1694 uint8_t *bitmap; 1695 size_t 
len; 1696 1697 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1698 error = EFAULT; 1699 break; 1700 } 1701 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1702 error = EINVAL; 1703 break; 1704 } 1705 if (tracker.vdt_len == 0) { 1706 break; 1707 } 1708 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1709 error = EINVAL; 1710 break; 1711 } 1712 if (tracker.vdt_len > max_track_region_len) { 1713 error = EINVAL; 1714 break; 1715 } 1716 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1717 bitmap = kmem_zalloc(len, KM_SLEEP); 1718 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1719 tracker.vdt_len, bitmap); 1720 if (error == 0 && 1721 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1722 error = EFAULT; 1723 } 1724 kmem_free(bitmap, len); 1725 1726 break; 1727 } 1728 case VM_WRLOCK_CYCLE: { 1729 /* 1730 * Present a test mechanism to acquire/release the write lock 1731 * on the VM without any other effects. 1732 */ 1733 break; 1734 } 1735 case VM_DATA_READ: { 1736 struct vm_data_xfer vdx; 1737 1738 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1739 error = EFAULT; 1740 break; 1741 } 1742 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1743 error = EINVAL; 1744 break; 1745 } 1746 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1747 error = EFBIG; 1748 break; 1749 } 1750 1751 const size_t len = vdx.vdx_len; 1752 void *buf = NULL; 1753 if (len != 0) { 1754 const void *udata = vdx.vdx_data; 1755 1756 buf = kmem_alloc(len, KM_SLEEP); 1757 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) { 1758 bzero(buf, len); 1759 } else if (ddi_copyin(udata, buf, len, md) != 0) { 1760 kmem_free(buf, len); 1761 error = EFAULT; 1762 break; 1763 } 1764 } 1765 1766 vdx.vdx_result_len = 0; 1767 vmm_data_req_t req = { 1768 .vdr_class = vdx.vdx_class, 1769 .vdr_version = vdx.vdx_version, 1770 .vdr_flags = vdx.vdx_flags, 1771 .vdr_len = len, 1772 .vdr_data = buf, 1773 .vdr_result_len = &vdx.vdx_result_len, 1774 }; 1775 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1776 1777 if (error == 0 && buf != NULL) { 1778 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1779 error = EFAULT; 1780 } 1781 } 1782 1783 /* 1784 * Copy out the transfer request so that the value of 1785 * vdx_result_len can be made available, regardless of any 1786 * error(s) which may have occurred. 1787 */ 1788 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1789 error = (error != 0) ? 
error : EFAULT; 1790 } 1791 1792 if (buf != NULL) { 1793 kmem_free(buf, len); 1794 } 1795 break; 1796 } 1797 case VM_DATA_WRITE: { 1798 struct vm_data_xfer vdx; 1799 1800 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1801 error = EFAULT; 1802 break; 1803 } 1804 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1805 error = EINVAL; 1806 break; 1807 } 1808 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1809 error = EFBIG; 1810 break; 1811 } 1812 1813 const size_t len = vdx.vdx_len; 1814 void *buf = NULL; 1815 if (len != 0) { 1816 buf = kmem_alloc(len, KM_SLEEP); 1817 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1818 kmem_free(buf, len); 1819 error = EFAULT; 1820 break; 1821 } 1822 } 1823 1824 vdx.vdx_result_len = 0; 1825 vmm_data_req_t req = { 1826 .vdr_class = vdx.vdx_class, 1827 .vdr_version = vdx.vdx_version, 1828 .vdr_flags = vdx.vdx_flags, 1829 .vdr_len = len, 1830 .vdr_data = buf, 1831 .vdr_result_len = &vdx.vdx_result_len, 1832 }; 1833 if (vmm_allow_state_writes != 0) { 1834 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1835 &req); 1836 } else { 1837 /* 1838 * Reject the write if somone has thrown the switch back 1839 * into the "disallow" position. 1840 */ 1841 error = EPERM; 1842 } 1843 1844 if (error == 0 && buf != NULL && 1845 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1846 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1847 error = EFAULT; 1848 } 1849 } 1850 1851 /* 1852 * Copy out the transfer request so that the value of 1853 * vdx_result_len can be made available, regardless of any 1854 * error(s) which may have occurred. 1855 */ 1856 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1857 error = (error != 0) ? error : EFAULT; 1858 } 1859 1860 if (buf != NULL) { 1861 kmem_free(buf, len); 1862 } 1863 break; 1864 } 1865 1866 case VM_PAUSE: { 1867 error = vm_pause_instance(sc->vmm_vm); 1868 break; 1869 } 1870 case VM_RESUME: { 1871 error = vm_resume_instance(sc->vmm_vm); 1872 break; 1873 } 1874 1875 default: 1876 error = ENOTTY; 1877 break; 1878 } 1879 1880 /* Release exclusion resources */ 1881 switch (lock_type) { 1882 case LOCK_NONE: 1883 break; 1884 case LOCK_VCPU: 1885 vcpu_unlock_one(sc, vcpu); 1886 break; 1887 case LOCK_READ_HOLD: 1888 vmm_read_unlock(sc); 1889 break; 1890 case LOCK_WRITE_HOLD: 1891 vmm_write_unlock(sc); 1892 break; 1893 default: 1894 panic("unexpected lock type"); 1895 break; 1896 } 1897 1898 return (error); 1899 } 1900 1901 static vmm_softc_t * 1902 vmm_lookup(const char *name) 1903 { 1904 list_t *vml = &vmm_list; 1905 vmm_softc_t *sc; 1906 1907 ASSERT(MUTEX_HELD(&vmm_mtx)); 1908 1909 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1910 if (strcmp(sc->vmm_name, name) == 0) { 1911 break; 1912 } 1913 } 1914 1915 return (sc); 1916 } 1917 1918 /* 1919 * Acquire an HMA registration if not already held. 1920 */ 1921 static boolean_t 1922 vmm_hma_acquire(void) 1923 { 1924 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1925 1926 mutex_enter(&vmmdev_mtx); 1927 1928 if (vmmdev_hma_reg == NULL) { 1929 VERIFY3U(vmmdev_hma_ref, ==, 0); 1930 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1931 if (vmmdev_hma_reg == NULL) { 1932 cmn_err(CE_WARN, "%s HMA registration failed.", 1933 vmmdev_hvm_name); 1934 mutex_exit(&vmmdev_mtx); 1935 return (B_FALSE); 1936 } 1937 } 1938 1939 vmmdev_hma_ref++; 1940 1941 mutex_exit(&vmmdev_mtx); 1942 1943 return (B_TRUE); 1944 } 1945 1946 /* 1947 * Release the HMA registration if held and there are no remaining VMs. 
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

static int
vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
{
	vmm_softc_t *sc = NULL;
	minor_t minor;
	int error = ENOMEM;
	size_t len;
	const char *name = req->name;

	len = strnlen(name, VM_MAX_NAMELEN);
	if (len == 0) {
		return (EINVAL);
	}
	if (len >= VM_MAX_NAMELEN) {
		return (ENAMETOOLONG);
	}
	if (strchr(name, '/') != NULL) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone. */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
		goto fail;
	}

	error = vm_create(req->flags, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);
		vmm_kstat_init(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	vmm_kstat_fini(sc);
	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel.  For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
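 *
 * As a purely illustrative sketch (the 'fp', 'cr', 'expire_cb', 'arg', and
 * 'gpa' names are hypothetical), a consumer might combine the operations
 * listed below roughly as follows:
 *
 *	vmm_hold_t *hold;
 *
 *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
 *		vmm_lease_t *lease =
 *		    vmm_drv_lease_sign(hold, expire_cb, arg);
 *		vmm_page_t *pg =
 *		    vmm_drv_page_hold(lease, gpa, PROT_READ | PROT_WRITE);
 *
 *		... use vmm_drv_page_writable(pg), vmm_drv_msi(), etc ...
 *
 *		vmm_drv_page_release(pg);
 *		vmm_drv_lease_break(hold, lease);
 *		vmm_drv_rele(hold);
 *	}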
2076 * 2077 * This includes: 2078 * - Translating guest-physical addresses to host-virtual pointers 2079 * - Injecting MSIs 2080 * - Hooking IO port addresses 2081 * 2082 * The vmm_drv interface exists to provide that functionality to its consumers. 2083 * (At this time, 'viona' is the only user) 2084 */ 2085 int 2086 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 2087 { 2088 vnode_t *vp = fp->f_vnode; 2089 const dev_t dev = vp->v_rdev; 2090 vmm_softc_t *sc; 2091 vmm_hold_t *hold; 2092 int err = 0; 2093 2094 if (vp->v_type != VCHR) { 2095 return (ENXIO); 2096 } 2097 const major_t major = getmajor(dev); 2098 const minor_t minor = getminor(dev); 2099 2100 mutex_enter(&vmmdev_mtx); 2101 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 2102 mutex_exit(&vmmdev_mtx); 2103 return (ENOENT); 2104 } 2105 mutex_enter(&vmm_mtx); 2106 mutex_exit(&vmmdev_mtx); 2107 2108 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 2109 err = ENOENT; 2110 goto out; 2111 } 2112 /* XXXJOY: check cred permissions against instance */ 2113 2114 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2115 err = EBUSY; 2116 goto out; 2117 } 2118 2119 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 2120 hold->vmh_sc = sc; 2121 hold->vmh_release_req = B_FALSE; 2122 2123 list_insert_tail(&sc->vmm_holds, hold); 2124 sc->vmm_flags |= VMM_HELD; 2125 *holdp = hold; 2126 2127 out: 2128 mutex_exit(&vmm_mtx); 2129 return (err); 2130 } 2131 2132 void 2133 vmm_drv_rele(vmm_hold_t *hold) 2134 { 2135 vmm_softc_t *sc; 2136 bool hma_release = false; 2137 2138 ASSERT(hold != NULL); 2139 ASSERT(hold->vmh_sc != NULL); 2140 VERIFY(hold->vmh_ioport_hook_cnt == 0); 2141 2142 mutex_enter(&vmm_mtx); 2143 sc = hold->vmh_sc; 2144 list_remove(&sc->vmm_holds, hold); 2145 kmem_free(hold, sizeof (*hold)); 2146 2147 if (list_is_empty(&sc->vmm_holds)) { 2148 sc->vmm_flags &= ~VMM_HELD; 2149 2150 /* 2151 * Since outstanding holds would prevent instance destruction 2152 * from completing, attempt to finish it now if it was already 2153 * set in motion. 
2154 */ 2155 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 2156 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, 2157 &hma_release)); 2158 } 2159 } 2160 mutex_exit(&vmm_mtx); 2161 2162 if (hma_release) { 2163 vmm_hma_release(); 2164 } 2165 } 2166 2167 boolean_t 2168 vmm_drv_release_reqd(vmm_hold_t *hold) 2169 { 2170 ASSERT(hold != NULL); 2171 2172 return (hold->vmh_release_req); 2173 } 2174 2175 vmm_lease_t * 2176 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 2177 { 2178 vmm_softc_t *sc = hold->vmh_sc; 2179 vmm_lease_t *lease; 2180 2181 ASSERT3P(expiref, !=, NULL); 2182 2183 if (hold->vmh_release_req) { 2184 return (NULL); 2185 } 2186 2187 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 2188 list_link_init(&lease->vml_node); 2189 lease->vml_expire_func = expiref; 2190 lease->vml_expire_arg = arg; 2191 lease->vml_expired = B_FALSE; 2192 lease->vml_break_deferred = B_FALSE; 2193 lease->vml_hold = hold; 2194 /* cache the VM pointer for one less pointer chase */ 2195 lease->vml_vm = sc->vmm_vm; 2196 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 2197 2198 mutex_enter(&sc->vmm_lease_lock); 2199 while (sc->vmm_lease_blocker != 0) { 2200 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2201 } 2202 list_insert_tail(&sc->vmm_lease_list, lease); 2203 vmm_read_lock(sc); 2204 mutex_exit(&sc->vmm_lease_lock); 2205 2206 return (lease); 2207 } 2208 2209 static void 2210 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 2211 { 2212 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 2213 2214 list_remove(&sc->vmm_lease_list, lease); 2215 vmm_read_unlock(sc); 2216 vmc_destroy(lease->vml_vmclient); 2217 kmem_free(lease, sizeof (*lease)); 2218 } 2219 2220 static void 2221 vmm_lease_block(vmm_softc_t *sc) 2222 { 2223 mutex_enter(&sc->vmm_lease_lock); 2224 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); 2225 sc->vmm_lease_blocker++; 2226 if (sc->vmm_lease_blocker == 1) { 2227 list_t *list = &sc->vmm_lease_list; 2228 vmm_lease_t *lease = list_head(list); 2229 2230 while (lease != NULL) { 2231 void *arg = lease->vml_expire_arg; 2232 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2233 boolean_t sync_break = B_FALSE; 2234 2235 /* 2236 * Since the lease expiration notification may 2237 * need to take locks which would deadlock with 2238 * vmm_lease_lock, drop it across the call. 2239 * 2240 * We are the only one allowed to manipulate 2241 * vmm_lease_list right now, so it is safe to 2242 * continue iterating through it after 2243 * reacquiring the lock. 2244 */ 2245 lease->vml_expired = B_TRUE; 2246 mutex_exit(&sc->vmm_lease_lock); 2247 sync_break = expiref(arg); 2248 mutex_enter(&sc->vmm_lease_lock); 2249 2250 if (sync_break) { 2251 vmm_lease_t *next; 2252 2253 /* 2254 * These leases which are synchronously broken 2255 * result in vmm_read_unlock() calls from a 2256 * different thread than the corresponding 2257 * vmm_read_lock(). This is acceptable, given 2258 * that the rwlock underpinning the whole 2259 * mechanism tolerates the behavior. This 2260 * flexibility is _only_ afforded to VM read 2261 * lock (RW_READER) holders. 2262 */ 2263 next = list_next(list, lease); 2264 vmm_lease_break_locked(sc, lease); 2265 lease = next; 2266 } else { 2267 lease = list_next(list, lease); 2268 } 2269 } 2270 2271 /* Process leases which were not broken synchronously. */ 2272 while (!list_is_empty(list)) { 2273 /* 2274 * Although the nested loops are quadratic, the number 2275 * of leases is small. 
2276 */ 2277 lease = list_head(list); 2278 while (lease != NULL) { 2279 vmm_lease_t *next = list_next(list, lease); 2280 if (lease->vml_break_deferred) { 2281 vmm_lease_break_locked(sc, lease); 2282 } 2283 lease = next; 2284 } 2285 if (list_is_empty(list)) { 2286 break; 2287 } 2288 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2289 } 2290 /* Wake anyone else waiting for the lease list to be empty */ 2291 cv_broadcast(&sc->vmm_lease_cv); 2292 } else { 2293 list_t *list = &sc->vmm_lease_list; 2294 2295 /* 2296 * Some other thread beat us to the duty of lease cleanup. 2297 * Wait until that is complete. 2298 */ 2299 while (!list_is_empty(list)) { 2300 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2301 } 2302 } 2303 mutex_exit(&sc->vmm_lease_lock); 2304 } 2305 2306 static void 2307 vmm_lease_unblock(vmm_softc_t *sc) 2308 { 2309 mutex_enter(&sc->vmm_lease_lock); 2310 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2311 sc->vmm_lease_blocker--; 2312 if (sc->vmm_lease_blocker == 0) { 2313 cv_broadcast(&sc->vmm_lease_cv); 2314 } 2315 mutex_exit(&sc->vmm_lease_lock); 2316 } 2317 2318 void 2319 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2320 { 2321 vmm_softc_t *sc = hold->vmh_sc; 2322 2323 VERIFY3P(hold, ==, lease->vml_hold); 2324 VERIFY(!lease->vml_break_deferred); 2325 2326 mutex_enter(&sc->vmm_lease_lock); 2327 if (sc->vmm_lease_blocker == 0) { 2328 vmm_lease_break_locked(sc, lease); 2329 } else { 2330 /* 2331 * Defer the lease-breaking to whichever thread is currently 2332 * cleaning up all leases as part of a vmm_lease_block() call. 2333 */ 2334 lease->vml_break_deferred = B_TRUE; 2335 cv_broadcast(&sc->vmm_lease_cv); 2336 } 2337 mutex_exit(&sc->vmm_lease_lock); 2338 } 2339 2340 boolean_t 2341 vmm_drv_lease_expired(vmm_lease_t *lease) 2342 { 2343 return (lease->vml_expired); 2344 } 2345 2346 vmm_page_t * 2347 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2348 { 2349 ASSERT(lease != NULL); 2350 ASSERT0(gpa & PAGEOFFSET); 2351 2352 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2353 } 2354 2355 2356 /* Ensure that flags mirrored by vmm_drv interface properly match up */ 2357 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY); 2358 2359 vmm_page_t * 2360 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags) 2361 { 2362 ASSERT(lease != NULL); 2363 ASSERT0(gpa & PAGEOFFSET); 2364 2365 vmm_page_t *page = 2366 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags); 2367 return (page); 2368 } 2369 2370 void 2371 vmm_drv_page_release(vmm_page_t *vmmp) 2372 { 2373 (void) vmp_release((vm_page_t *)vmmp); 2374 } 2375 2376 void 2377 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2378 { 2379 (void) vmp_release_chain((vm_page_t *)vmmp); 2380 } 2381 2382 const void * 2383 vmm_drv_page_readable(const vmm_page_t *vmmp) 2384 { 2385 return (vmp_get_readable((const vm_page_t *)vmmp)); 2386 } 2387 2388 void * 2389 vmm_drv_page_writable(const vmm_page_t *vmmp) 2390 { 2391 return (vmp_get_writable((const vm_page_t *)vmmp)); 2392 } 2393 2394 void 2395 vmm_drv_page_mark_dirty(vmm_page_t *vmmp) 2396 { 2397 return (vmp_mark_dirty((vm_page_t *)vmmp)); 2398 } 2399 2400 void 2401 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2402 { 2403 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2404 } 2405 2406 vmm_page_t * 2407 vmm_drv_page_next(const vmm_page_t *vmmp) 2408 { 2409 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2410 } 2411 2412 int 2413 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2414 { 2415 ASSERT(lease 
!= NULL); 2416 2417 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2418 } 2419 2420 int 2421 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2422 void *arg, void **cookie) 2423 { 2424 vmm_softc_t *sc; 2425 int err; 2426 2427 ASSERT(hold != NULL); 2428 ASSERT(cookie != NULL); 2429 2430 sc = hold->vmh_sc; 2431 mutex_enter(&vmm_mtx); 2432 /* Confirm that hook installation is not blocked */ 2433 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2434 mutex_exit(&vmm_mtx); 2435 return (EBUSY); 2436 } 2437 /* 2438 * Optimistically record an installed hook which will prevent a block 2439 * from being asserted while the mutex is dropped. 2440 */ 2441 hold->vmh_ioport_hook_cnt++; 2442 mutex_exit(&vmm_mtx); 2443 2444 vmm_write_lock(sc); 2445 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2446 arg, cookie); 2447 vmm_write_unlock(sc); 2448 2449 if (err != 0) { 2450 mutex_enter(&vmm_mtx); 2451 /* Walk back optimism about the hook installation */ 2452 hold->vmh_ioport_hook_cnt--; 2453 mutex_exit(&vmm_mtx); 2454 } 2455 return (err); 2456 } 2457 2458 void 2459 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2460 { 2461 vmm_softc_t *sc; 2462 2463 ASSERT(hold != NULL); 2464 ASSERT(cookie != NULL); 2465 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2466 2467 sc = hold->vmh_sc; 2468 vmm_write_lock(sc); 2469 vm_ioport_unhook(sc->vmm_vm, cookie); 2470 vmm_write_unlock(sc); 2471 2472 mutex_enter(&vmm_mtx); 2473 hold->vmh_ioport_hook_cnt--; 2474 mutex_exit(&vmm_mtx); 2475 } 2476 2477 static void 2478 vmm_drv_purge(vmm_softc_t *sc) 2479 { 2480 ASSERT(MUTEX_HELD(&vmm_mtx)); 2481 2482 if ((sc->vmm_flags & VMM_HELD) != 0) { 2483 vmm_hold_t *hold; 2484 2485 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2486 hold = list_next(&sc->vmm_holds, hold)) { 2487 hold->vmh_release_req = B_TRUE; 2488 } 2489 2490 /* 2491 * Require that all leases on the instance be broken, now that 2492 * all associated holds have been marked as needing release. 2493 * 2494 * Dropping vmm_mtx is not strictly necessary, but if any of the 2495 * lessees are slow to respond, it would be nice to leave it 2496 * available for other parties. 2497 */ 2498 mutex_exit(&vmm_mtx); 2499 vmm_lease_block(sc); 2500 vmm_lease_unblock(sc); 2501 mutex_enter(&vmm_mtx); 2502 } 2503 } 2504 2505 static int 2506 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2507 { 2508 int err = 0; 2509 2510 mutex_enter(&vmm_mtx); 2511 if (!enable_block) { 2512 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2513 2514 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2515 goto done; 2516 } 2517 2518 /* If any holds have hooks installed, the block is a failure */ 2519 if (!list_is_empty(&sc->vmm_holds)) { 2520 vmm_hold_t *hold; 2521 2522 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2523 hold = list_next(&sc->vmm_holds, hold)) { 2524 if (hold->vmh_ioport_hook_cnt != 0) { 2525 err = EBUSY; 2526 goto done; 2527 } 2528 } 2529 } 2530 sc->vmm_flags |= VMM_BLOCK_HOOK; 2531 2532 done: 2533 mutex_exit(&vmm_mtx); 2534 return (err); 2535 } 2536 2537 2538 static void 2539 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts) 2540 { 2541 ASSERT(MUTEX_HELD(&vmm_mtx)); 2542 ASSERT0(sc->vmm_flags & VMM_DESTROY); 2543 2544 sc->vmm_flags |= VMM_DESTROY; 2545 2546 /* 2547 * Lock and unlock all of the vCPUs to ensure that they are kicked out 2548 * of guest context, being unable to return now that the instance is 2549 * marked for destruction. 
2550 */ 2551 const int maxcpus = vm_get_maxcpus(sc->vmm_vm); 2552 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 2553 vcpu_lock_one(sc, vcpu); 2554 vcpu_unlock_one(sc, vcpu); 2555 } 2556 2557 vmmdev_devmem_purge(sc); 2558 if ((opts & VDO_NO_CLEAN_ZSD) == 0) { 2559 /* 2560 * The ZSD should be cleaned up now, unless destruction of the 2561 * instance was initiated by destruction of the containing zone, 2562 * in which case the ZSD has already been removed. 2563 */ 2564 vmm_zsd_rem_vm(sc); 2565 } 2566 zone_rele(sc->vmm_zone); 2567 2568 vmm_drv_purge(sc); 2569 } 2570 2571 static bool 2572 vmm_destroy_ready(vmm_softc_t *sc) 2573 { 2574 ASSERT(MUTEX_HELD(&vmm_mtx)); 2575 2576 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) { 2577 VERIFY(list_is_empty(&sc->vmm_holds)); 2578 return (true); 2579 } 2580 2581 return (false); 2582 } 2583 2584 static void 2585 vmm_destroy_finish(vmm_softc_t *sc) 2586 { 2587 ASSERT(MUTEX_HELD(&vmm_mtx)); 2588 ASSERT(vmm_destroy_ready(sc)); 2589 2590 list_remove(&vmm_list, sc); 2591 vmm_kstat_fini(sc); 2592 vm_destroy(sc->vmm_vm); 2593 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2594 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE); 2595 2596 const minor_t minor = sc->vmm_minor; 2597 ddi_soft_state_free(vmm_statep, minor); 2598 id_free(vmm_minors, minor); 2599 } 2600 2601 /* 2602 * Initiate or attempt to finish destruction of a VMM instance. 2603 * 2604 * This is called from several contexts: 2605 * - An explicit destroy ioctl is made 2606 * - A vmm_drv consumer releases its hold (being the last on the instance) 2607 * - The vmm device is closed, and auto-destruct is enabled 2608 */ 2609 static int 2610 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts, 2611 bool *hma_release) 2612 { 2613 ASSERT(MUTEX_HELD(&vmm_mtx)); 2614 2615 *hma_release = false; 2616 2617 /* 2618 * When instance destruction begins, the instance is marked so that any 2619 * further requests to operate on it will fail. 2620 */ 2621 if ((sc->vmm_flags & VMM_DESTROY) == 0) { 2622 vmm_destroy_begin(sc, opts); 2623 } 2624 2625 if (vmm_destroy_ready(sc)) { 2626 2627 /* 2628 * Notify anyone waiting for the destruction to finish. They 2629 * must be clear before we can safely tear down the softc. 2630 */ 2631 if (sc->vmm_destroy_waiters != 0) { 2632 cv_broadcast(&sc->vmm_cv); 2633 while (sc->vmm_destroy_waiters != 0) { 2634 cv_wait(&sc->vmm_cv, &vmm_mtx); 2635 } 2636 } 2637 2638 /* 2639 * Finish destruction of instance. After this point, the softc 2640 * is freed and cannot be accessed again. 2641 * 2642 * With destruction complete, the HMA hold can be released. 2643 */ 2644 vmm_destroy_finish(sc); 2645 *hma_release = true; 2646 return (0); 2647 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) { 2648 int err = 0; 2649 2650 sc->vmm_destroy_waiters++; 2651 while (!vmm_destroy_ready(sc) && err == 0) { 2652 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2653 err = EINTR; 2654 } 2655 } 2656 sc->vmm_destroy_waiters--; 2657 2658 if (sc->vmm_destroy_waiters == 0) { 2659 /* 2660 * If we were the last waiter, it could be that VM 2661 * destruction is waiting on _us_ to proceed with the 2662 * final clean-up. 2663 */ 2664 cv_signal(&sc->vmm_cv); 2665 } 2666 return (err); 2667 } else { 2668 /* 2669 * Since the instance is not ready for destruction, and the 2670 * caller did not ask to wait, consider it a success for now.
2671 */ 2672 return (0); 2673 } 2674 } 2675 2676 void 2677 vmm_zone_vm_destroy(vmm_softc_t *sc) 2678 { 2679 bool hma_release = false; 2680 int err; 2681 2682 mutex_enter(&vmm_mtx); 2683 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release); 2684 mutex_exit(&vmm_mtx); 2685 2686 VERIFY0(err); 2687 2688 if (hma_release) { 2689 vmm_hma_release(); 2690 } 2691 } 2692 2693 static int 2694 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2695 { 2696 vmm_softc_t *sc; 2697 bool hma_release = false; 2698 int err; 2699 2700 if (crgetuid(cr) != 0) { 2701 return (EPERM); 2702 } 2703 2704 mutex_enter(&vmm_mtx); 2705 sc = vmm_lookup(req->name); 2706 if (sc == NULL) { 2707 mutex_exit(&vmm_mtx); 2708 return (ENOENT); 2709 } 2710 /* 2711 * We don't check this in vmm_lookup() since that function is also used 2712 * for validation during create and currently vmm names must be unique. 2713 */ 2714 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2715 mutex_exit(&vmm_mtx); 2716 return (EPERM); 2717 } 2718 2719 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release); 2720 mutex_exit(&vmm_mtx); 2721 2722 if (hma_release) { 2723 vmm_hma_release(); 2724 } 2725 2726 return (err); 2727 } 2728 2729 #define VCPU_NAME_BUFLEN 32 2730 2731 static int 2732 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2733 { 2734 zoneid_t zid = crgetzoneid(cr); 2735 int instance = minor; 2736 kstat_t *ksp; 2737 2738 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2739 2740 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2741 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2742 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2743 2744 if (ksp == NULL) { 2745 return (-1); 2746 } 2747 sc->vmm_kstat_vm = ksp; 2748 2749 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2750 char namebuf[VCPU_NAME_BUFLEN]; 2751 2752 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2753 2754 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2755 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2756 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2757 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2758 0, zid); 2759 if (ksp == NULL) { 2760 goto fail; 2761 } 2762 2763 sc->vmm_kstat_vcpu[i] = ksp; 2764 } 2765 2766 /* 2767 * If this instance is associated with a non-global zone, make its 2768 * kstats visible from the GZ. 
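 * (Illustrative: once installed, a global-zone observer should then be able
 * to read these stats with the kstat command, e.g. `kstat -m vmm -n vcpu0`,
 * assuming VMM_MODULE_NAME expands to "vmm".)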
2769 */ 2770 if (zid != GLOBAL_ZONEID) { 2771 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2772 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2773 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2774 } 2775 } 2776 2777 return (0); 2778 2779 fail: 2780 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2781 if (sc->vmm_kstat_vcpu[i] != NULL) { 2782 kstat_delete(sc->vmm_kstat_vcpu[i]); 2783 sc->vmm_kstat_vcpu[i] = NULL; 2784 } else { 2785 break; 2786 } 2787 } 2788 kstat_delete(sc->vmm_kstat_vm); 2789 sc->vmm_kstat_vm = NULL; 2790 return (-1); 2791 } 2792 2793 static void 2794 vmm_kstat_init(vmm_softc_t *sc) 2795 { 2796 kstat_t *ksp; 2797 2798 ASSERT3P(sc->vmm_vm, !=, NULL); 2799 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2800 2801 ksp = sc->vmm_kstat_vm; 2802 vmm_kstats_t *vk = ksp->ks_data; 2803 ksp->ks_private = sc->vmm_vm; 2804 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2805 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2806 2807 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2808 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2809 2810 ksp = sc->vmm_kstat_vcpu[i]; 2811 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2812 2813 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2814 vvk->vvk_vcpu.value.ui32 = i; 2815 kstat_named_init(&vvk->vvk_time_init, "time_init", 2816 KSTAT_DATA_UINT64); 2817 kstat_named_init(&vvk->vvk_time_run, "time_run", 2818 KSTAT_DATA_UINT64); 2819 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2820 KSTAT_DATA_UINT64); 2821 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2822 KSTAT_DATA_UINT64); 2823 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2824 KSTAT_DATA_UINT64); 2825 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2826 KSTAT_DATA_UINT64); 2827 ksp->ks_private = sc->vmm_vm; 2828 ksp->ks_update = vmm_kstat_update_vcpu; 2829 } 2830 2831 kstat_install(sc->vmm_kstat_vm); 2832 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2833 kstat_install(sc->vmm_kstat_vcpu[i]); 2834 } 2835 } 2836 2837 static void 2838 vmm_kstat_fini(vmm_softc_t *sc) 2839 { 2840 ASSERT(sc->vmm_kstat_vm != NULL); 2841 2842 kstat_delete(sc->vmm_kstat_vm); 2843 sc->vmm_kstat_vm = NULL; 2844 2845 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2846 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2847 2848 kstat_delete(sc->vmm_kstat_vcpu[i]); 2849 sc->vmm_kstat_vcpu[i] = NULL; 2850 } 2851 } 2852 2853 static int 2854 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2855 { 2856 minor_t minor; 2857 vmm_softc_t *sc; 2858 2859 /* 2860 * Forbid running bhyve in a 32-bit process until it has been tested and 2861 * verified to be safe. 2862 */ 2863 if (curproc->p_model != DATAMODEL_LP64) { 2864 return (EFBIG); 2865 } 2866 2867 minor = getminor(*devp); 2868 if (minor == VMM_CTL_MINOR) { 2869 /* 2870 * Master control device must be opened exclusively. 
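 * (That is, a userspace consumer is expected to pass O_EXCL, which arrives
 * here as FEXCL, when opening the control node.)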
2871 */ 2872 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2873 return (EINVAL); 2874 } 2875 2876 return (0); 2877 } 2878 2879 mutex_enter(&vmm_mtx); 2880 sc = ddi_get_soft_state(vmm_statep, minor); 2881 if (sc == NULL) { 2882 mutex_exit(&vmm_mtx); 2883 return (ENXIO); 2884 } 2885 2886 sc->vmm_flags |= VMM_IS_OPEN; 2887 mutex_exit(&vmm_mtx); 2888 2889 return (0); 2890 } 2891 2892 static int 2893 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2894 { 2895 const minor_t minor = getminor(dev); 2896 vmm_softc_t *sc; 2897 bool hma_release = false; 2898 2899 if (minor == VMM_CTL_MINOR) { 2900 return (0); 2901 } 2902 2903 mutex_enter(&vmm_mtx); 2904 sc = ddi_get_soft_state(vmm_statep, minor); 2905 if (sc == NULL) { 2906 mutex_exit(&vmm_mtx); 2907 return (ENXIO); 2908 } 2909 2910 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0); 2911 sc->vmm_flags &= ~VMM_IS_OPEN; 2912 2913 /* 2914 * If the instance was marked for auto-destruction, begin that now. 2915 * Instance destruction may have been initiated already, so try to make 2916 * progress in that case, since closure of the device is one of its requirements. 2917 */ 2918 if ((sc->vmm_flags & VMM_DESTROY) != 0 || 2919 (sc->vmm_flags & VMM_AUTODESTROY) != 0) { 2920 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release)); 2921 } 2922 mutex_exit(&vmm_mtx); 2923 2924 if (hma_release) { 2925 vmm_hma_release(); 2926 } 2927 2928 return (0); 2929 } 2930 2931 static int 2932 vmm_is_supported(intptr_t arg) 2933 { 2934 int r; 2935 const char *msg; 2936 2937 if (vmm_is_intel()) { 2938 r = vmx_x86_supported(&msg); 2939 } else if (vmm_is_svm()) { 2940 /* 2941 * HMA already ensured that the features necessary for SVM 2942 * operation were present and online during vmm_attach(). 2943 */ 2944 r = 0; 2945 } else { 2946 r = ENXIO; 2947 msg = "Unsupported CPU vendor"; 2948 } 2949 2950 if (r != 0 && arg != (intptr_t)NULL) { 2951 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2952 return (EFAULT); 2953 } 2954 return (r); 2955 } 2956 2957 static int 2958 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2959 { 2960 void *argp = (void *)arg; 2961 2962 switch (cmd) { 2963 case VMM_CREATE_VM: { 2964 struct vm_create_req req; 2965 2966 if ((md & FWRITE) == 0) { 2967 return (EPERM); 2968 } 2969 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2970 return (EFAULT); 2971 } 2972 return (vmmdev_do_vm_create(&req, cr)); 2973 } 2974 case VMM_DESTROY_VM: { 2975 struct vm_destroy_req req; 2976 2977 if ((md & FWRITE) == 0) { 2978 return (EPERM); 2979 } 2980 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2981 return (EFAULT); 2982 } 2983 return (vmmdev_do_vm_destroy(&req, cr)); 2984 } 2985 case VMM_VM_SUPPORTED: 2986 return (vmm_is_supported(arg)); 2987 case VMM_CHECK_IOMMU: 2988 if (!vmm_check_iommu()) { 2989 return (ENXIO); 2990 } 2991 return (0); 2992 case VMM_RESV_QUERY: 2993 case VMM_RESV_SET_TARGET: 2994 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2995 default: 2996 break; 2997 } 2998 /* No other actions are legal on ctl device */ 2999 return (ENOTTY); 3000 } 3001 3002 static int 3003 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 3004 int *rvalp) 3005 { 3006 vmm_softc_t *sc; 3007 minor_t minor; 3008 3009 /* 3010 * Forbid running bhyve in a 32-bit process until it has been tested and 3011 * verified to be safe.
3012 */ 3013 if (curproc->p_model != DATAMODEL_LP64) { 3014 return (EFBIG); 3015 } 3016 3017 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 3018 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 3019 return (ENOTSUP); 3020 } 3021 3022 /* 3023 * Regardless of minor (vmmctl or instance), we respond to queries of 3024 * the interface version. 3025 */ 3026 if (cmd == VMM_INTERFACE_VERSION) { 3027 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 3028 return (0); 3029 } 3030 3031 minor = getminor(dev); 3032 3033 if (minor == VMM_CTL_MINOR) { 3034 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 3035 } 3036 3037 sc = ddi_get_soft_state(vmm_statep, minor); 3038 ASSERT(sc != NULL); 3039 3040 /* 3041 * Turn away any ioctls against an instance when it is being destroyed. 3042 * (Except for the ioctl inquiring about that destroy-in-progress.) 3043 */ 3044 if ((sc->vmm_flags & VMM_DESTROY) != 0) { 3045 if (cmd == VM_DESTROY_PENDING) { 3046 *rvalp = 1; 3047 return (0); 3048 } 3049 return (ENXIO); 3050 } 3051 3052 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 3053 } 3054 3055 static int 3056 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 3057 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 3058 { 3059 vmm_softc_t *sc; 3060 const minor_t minor = getminor(dev); 3061 int err; 3062 3063 if (minor == VMM_CTL_MINOR) { 3064 return (ENODEV); 3065 } 3066 if (off < 0 || (off + len) <= 0) { 3067 return (EINVAL); 3068 } 3069 if ((prot & PROT_USER) == 0) { 3070 return (EACCES); 3071 } 3072 3073 sc = ddi_get_soft_state(vmm_statep, minor); 3074 ASSERT(sc); 3075 3076 if (sc->vmm_flags & VMM_DESTROY) 3077 return (ENXIO); 3078 3079 /* Grab read lock on the VM to prevent any changes to the memory map */ 3080 vmm_read_lock(sc); 3081 3082 if (off >= VM_DEVMEM_START) { 3083 int segid; 3084 off_t segoff; 3085 3086 /* Mapping a devmem "device" */ 3087 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 3088 err = ENODEV; 3089 } else { 3090 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 3091 addrp, prot, maxprot, flags); 3092 } 3093 } else { 3094 /* Mapping a part of the guest physical space */ 3095 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 3096 maxprot, flags); 3097 } 3098 3099 vmm_read_unlock(sc); 3100 return (err); 3101 } 3102 3103 static sdev_plugin_validate_t 3104 vmm_sdev_validate(sdev_ctx_t ctx) 3105 { 3106 const char *name = sdev_ctx_name(ctx); 3107 vmm_softc_t *sc; 3108 sdev_plugin_validate_t ret; 3109 minor_t minor; 3110 3111 if (sdev_ctx_vtype(ctx) != VCHR) 3112 return (SDEV_VTOR_INVALID); 3113 3114 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 3115 3116 mutex_enter(&vmm_mtx); 3117 if ((sc = vmm_lookup(name)) == NULL) 3118 ret = SDEV_VTOR_INVALID; 3119 else if (sc->vmm_minor != minor) 3120 ret = SDEV_VTOR_STALE; 3121 else 3122 ret = SDEV_VTOR_VALID; 3123 mutex_exit(&vmm_mtx); 3124 3125 return (ret); 3126 } 3127 3128 static int 3129 vmm_sdev_filldir(sdev_ctx_t ctx) 3130 { 3131 vmm_softc_t *sc; 3132 int ret; 3133 3134 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 3135 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 3136 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 3137 return (EINVAL); 3138 } 3139 3140 mutex_enter(&vmm_mtx); 3141 ASSERT(vmmdev_dip != NULL); 3142 for (sc = list_head(&vmm_list); sc != NULL; 3143 sc = list_next(&vmm_list, sc)) { 3144 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 3145 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 3146 S_IFCHR | 0600, 3147 
makedevice(ddi_driver_major(vmmdev_dip), 3148 sc->vmm_minor)); 3149 } else { 3150 continue; 3151 } 3152 if (ret != 0 && ret != EEXIST) 3153 goto out; 3154 } 3155 3156 ret = 0; 3157 3158 out: 3159 mutex_exit(&vmm_mtx); 3160 return (ret); 3161 } 3162 3163 /* ARGSUSED */ 3164 static void 3165 vmm_sdev_inactive(sdev_ctx_t ctx) 3166 { 3167 } 3168 3169 static sdev_plugin_ops_t vmm_sdev_ops = { 3170 .spo_version = SDEV_PLUGIN_VERSION, 3171 .spo_flags = SDEV_PLUGIN_SUBDIR, 3172 .spo_validate = vmm_sdev_validate, 3173 .spo_filldir = vmm_sdev_filldir, 3174 .spo_inactive = vmm_sdev_inactive 3175 }; 3176 3177 /* ARGSUSED */ 3178 static int 3179 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 3180 { 3181 int error; 3182 3183 switch (cmd) { 3184 case DDI_INFO_DEVT2DEVINFO: 3185 *result = (void *)vmmdev_dip; 3186 error = DDI_SUCCESS; 3187 break; 3188 case DDI_INFO_DEVT2INSTANCE: 3189 *result = (void *)0; 3190 error = DDI_SUCCESS; 3191 break; 3192 default: 3193 error = DDI_FAILURE; 3194 break; 3195 } 3196 return (error); 3197 } 3198 3199 static int 3200 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3201 { 3202 sdev_plugin_hdl_t sph; 3203 hma_reg_t *reg = NULL; 3204 boolean_t vmm_loaded = B_FALSE; 3205 3206 if (cmd != DDI_ATTACH) { 3207 return (DDI_FAILURE); 3208 } 3209 3210 mutex_enter(&vmmdev_mtx); 3211 /* Ensure we are not already attached. */ 3212 if (vmmdev_dip != NULL) { 3213 mutex_exit(&vmmdev_mtx); 3214 return (DDI_FAILURE); 3215 } 3216 3217 vmm_sol_glue_init(); 3218 3219 /* 3220 * Perform temporary HMA registration to determine if the system 3221 * is capable. 3222 */ 3223 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 3224 goto fail; 3225 } else if (vmm_mod_load() != 0) { 3226 goto fail; 3227 } 3228 vmm_loaded = B_TRUE; 3229 hma_unregister(reg); 3230 reg = NULL; 3231 3232 /* Create control node. Other nodes will be created on demand. */ 3233 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 3234 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 3235 goto fail; 3236 } 3237 3238 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 3239 if (sph == (sdev_plugin_hdl_t)NULL) { 3240 ddi_remove_minor_node(dip, NULL); 3241 goto fail; 3242 } 3243 3244 ddi_report_dev(dip); 3245 vmmdev_sdev_hdl = sph; 3246 vmmdev_dip = dip; 3247 mutex_exit(&vmmdev_mtx); 3248 return (DDI_SUCCESS); 3249 3250 fail: 3251 if (vmm_loaded) { 3252 VERIFY0(vmm_mod_unload()); 3253 } 3254 if (reg != NULL) { 3255 hma_unregister(reg); 3256 } 3257 vmm_sol_glue_cleanup(); 3258 mutex_exit(&vmmdev_mtx); 3259 return (DDI_FAILURE); 3260 } 3261 3262 static int 3263 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3264 { 3265 if (cmd != DDI_DETACH) { 3266 return (DDI_FAILURE); 3267 } 3268 3269 /* 3270 * Ensure that all resources have been cleaned up. 3271 * 3272 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 3273 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 3274 * devinfo locked as iommu_cleanup() tries to recursively lock each 3275 * devinfo, including our own, while holding vmmdev_mtx. 
3276 */ 3277 if (mutex_tryenter(&vmmdev_mtx) == 0) 3278 return (DDI_FAILURE); 3279 3280 mutex_enter(&vmm_mtx); 3281 if (!list_is_empty(&vmm_list)) { 3282 mutex_exit(&vmm_mtx); 3283 mutex_exit(&vmmdev_mtx); 3284 return (DDI_FAILURE); 3285 } 3286 mutex_exit(&vmm_mtx); 3287 3288 if (!vmmr_is_empty()) { 3289 mutex_exit(&vmmdev_mtx); 3290 return (DDI_FAILURE); 3291 } 3292 3293 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 3294 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 3295 mutex_exit(&vmmdev_mtx); 3296 return (DDI_FAILURE); 3297 } 3298 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 3299 3300 /* Remove the control node. */ 3301 ddi_remove_minor_node(dip, "ctl"); 3302 vmmdev_dip = NULL; 3303 3304 VERIFY0(vmm_mod_unload()); 3305 VERIFY3U(vmmdev_hma_reg, ==, NULL); 3306 vmm_sol_glue_cleanup(); 3307 3308 mutex_exit(&vmmdev_mtx); 3309 3310 return (DDI_SUCCESS); 3311 } 3312 3313 static struct cb_ops vmm_cb_ops = { 3314 vmm_open, 3315 vmm_close, 3316 nodev, /* strategy */ 3317 nodev, /* print */ 3318 nodev, /* dump */ 3319 nodev, /* read */ 3320 nodev, /* write */ 3321 vmm_ioctl, 3322 nodev, /* devmap */ 3323 nodev, /* mmap */ 3324 vmm_segmap, 3325 nochpoll, /* poll */ 3326 ddi_prop_op, 3327 NULL, 3328 D_NEW | D_MP | D_DEVMAP 3329 }; 3330 3331 static struct dev_ops vmm_ops = { 3332 DEVO_REV, 3333 0, 3334 vmm_info, 3335 nulldev, /* identify */ 3336 nulldev, /* probe */ 3337 vmm_attach, 3338 vmm_detach, 3339 nodev, /* reset */ 3340 &vmm_cb_ops, 3341 (struct bus_ops *)NULL 3342 }; 3343 3344 static struct modldrv modldrv = { 3345 &mod_driverops, 3346 "bhyve vmm", 3347 &vmm_ops 3348 }; 3349 3350 static struct modlinkage modlinkage = { 3351 MODREV_1, 3352 &modldrv, 3353 NULL 3354 }; 3355 3356 int 3357 _init(void) 3358 { 3359 int error; 3360 3361 sysinit(); 3362 3363 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3364 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3365 list_create(&vmm_list, sizeof (vmm_softc_t), 3366 offsetof(vmm_softc_t, vmm_node)); 3367 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3368 3369 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3370 if (error) { 3371 return (error); 3372 } 3373 3374 error = vmmr_init(); 3375 if (error) { 3376 ddi_soft_state_fini(&vmm_statep); 3377 return (error); 3378 } 3379 3380 vmm_zsd_init(); 3381 3382 error = mod_install(&modlinkage); 3383 if (error) { 3384 ddi_soft_state_fini(&vmm_statep); 3385 vmm_zsd_fini(); 3386 vmmr_fini(); 3387 } 3388 3389 return (error); 3390 } 3391 3392 int 3393 _fini(void) 3394 { 3395 int error; 3396 3397 error = mod_remove(&modlinkage); 3398 if (error) { 3399 return (error); 3400 } 3401 3402 vmm_zsd_fini(); 3403 vmmr_fini(); 3404 3405 ddi_soft_state_fini(&vmm_statep); 3406 3407 return (0); 3408 } 3409 3410 int 3411 _info(struct modinfo *modinfop) 3412 { 3413 return (mod_info(&modlinkage, modinfop)); 3414 } 3415
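/*
 * Illustrative usage sketches (editorial, not compiled):
 *
 * The fragment below is a rough sketch of how a userspace consumer might
 * drive the control-device interface implemented above.  The control node
 * path (/dev/vmmctl) is an assumption, the VM name is arbitrary, and error
 * handling is elided.
 *
 *	int ctl = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	if (ioctl(ctl, VMM_INTERFACE_VERSION, 0) !=
 *	    VMM_CURRENT_INTERFACE_VERSION) {
 *		return (-1);
 *	}
 *	struct vm_create_req creq = { .name = "example" };
 *	if (ioctl(ctl, VMM_CREATE_VM, &creq) == 0) {
 *		int vmfd = open("/dev/vmm/example", O_RDWR);
 *		(operate the instance via the vmmdev_do_ioctl-backed ioctls)
 *		(void) close(vmfd);
 *		struct vm_destroy_req dreq = { .name = "example" };
 *		(void) ioctl(ctl, VMM_DESTROY_VM, &dreq);
 *	}
 *
 * Similarly, an in-kernel vmm_drv consumer (in the style of viona) would
 * follow roughly this lifecycle, given a file_t referring to an open
 * instance device.  The expire callback, its argument, and the gpa value
 * are hypothetical placeholders.
 *
 *	vmm_hold_t *hold;
 *	if (vmm_drv_hold(fp, credp, &hold) == 0) {
 *		vmm_lease_t *lease =
 *		    vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *		if (lease != NULL) {
 *			vmm_page_t *vmp =
 *			    vmm_drv_page_hold(lease, gpa, PROT_READ);
 *			(access guest memory via vmm_drv_page_readable())
 *			vmm_drv_page_release(vmp);
 *			vmm_drv_lease_break(hold, lease);
 *		}
 *		vmm_drv_rele(hold);
 *	}
 */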