1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2015 Pluribus Networks Inc. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 17 * Copyright 2022 Oxide Computer Company 18 */ 19 20 #include <sys/types.h> 21 #include <sys/conf.h> 22 #include <sys/cpuvar.h> 23 #include <sys/ioccom.h> 24 #include <sys/stat.h> 25 #include <sys/vmsystm.h> 26 #include <sys/ddi.h> 27 #include <sys/mkdev.h> 28 #include <sys/sunddi.h> 29 #include <sys/fs/dv_node.h> 30 #include <sys/cpuset.h> 31 #include <sys/id_space.h> 32 #include <sys/fs/sdev_plugin.h> 33 #include <sys/smt.h> 34 #include <sys/kstat.h> 35 36 #include <sys/kernel.h> 37 #include <sys/hma.h> 38 #include <sys/x86_archext.h> 39 #include <x86/apicreg.h> 40 41 #include <sys/vmm.h> 42 #include <sys/vmm_kernel.h> 43 #include <sys/vmm_instruction_emul.h> 44 #include <sys/vmm_dev.h> 45 #include <sys/vmm_impl.h> 46 #include <sys/vmm_drv.h> 47 #include <sys/vmm_vm.h> 48 #include <sys/vmm_reservoir.h> 49 50 #include <vm/seg_dev.h> 51 52 #include "io/ppt.h" 53 #include "io/vatpic.h" 54 #include "io/vioapic.h" 55 #include "io/vrtc.h" 56 #include "io/vhpet.h" 57 #include "io/vpmtmr.h" 58 #include "vmm_lapic.h" 59 #include "vmm_stat.h" 60 #include "vmm_util.h" 61 62 /* 63 * Locking details: 64 * 65 * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is 66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data 67 * (vmm_*) are protected by vmm_mtx. 
Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */

/* Driver-wide state, protected by vmmdev_mtx (see locking notes above). */
static kmutex_t vmmdev_mtx;
static dev_info_t *vmmdev_dip;
static hma_reg_t *vmmdev_hma_reg;
static uint_t vmmdev_hma_ref;
static sdev_plugin_hdl_t vmmdev_sdev_hdl;

/* Per-instance bookkeeping, protected by vmm_mtx (see locking notes above). */
static kmutex_t vmm_mtx;
static list_t vmm_list;
static list_t vmm_destroy_list;
static id_space_t *vmm_minors;
static void *vmm_statep;

/* temporary safety switch */
int vmm_allow_state_writes;

static const char *vmmdev_hvm_name = "bhyve";

/* For sdev plugin (/dev) */
#define	VMM_SDEV_ROOT		"/dev/vmm"

/* From uts/intel/io/vmm/intel/vmx.c */
extern int vmx_x86_supported(const char **);

/* Holds and hooks from drivers external to vmm */
struct vmm_hold {
	list_node_t	vmh_node;
	vmm_softc_t	*vmh_sc;
	boolean_t	vmh_release_req;
	uint_t		vmh_ioport_hook_cnt;
};

/*
 * A lease grants a vmm_drv consumer extended read-lock access to a VM;
 * vml_expire_func is the consumer's release callback (see locking notes
 * around vmm_write_lock below).
 */
struct vmm_lease {
	list_node_t	vml_node;
	struct vm	*vml_vm;
	vm_client_t	*vml_vmclient;
	boolean_t	vml_expired;
	boolean_t	vml_break_deferred;
	boolean_t	(*vml_expire_func)(void *);
	void		*vml_expire_arg;
	struct vmm_hold	*vml_hold;
};

static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
static void vmm_lease_block(vmm_softc_t *);
static void vmm_lease_unblock(vmm_softc_t *);
static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
static void vmm_kstat_init(vmm_softc_t *);
static void vmm_kstat_fini(vmm_softc_t *);

/*
 * The 'devmem' hack:
 *
 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
 * in the vm which appear with their own name related to the vm under /dev.
126 * Since this would be a hassle from an sdev perspective and would require a 127 * new cdev interface (or complicate the existing one), we choose to implement 128 * this in a different manner. Direct access to the underlying vm memory 129 * segments is exposed by placing them in a range of offsets beyond the normal 130 * guest memory space. Userspace can query the appropriate offset to mmap() 131 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 132 */ 133 134 static vmm_devmem_entry_t * 135 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 136 { 137 vmm_devmem_entry_t *ent = NULL; 138 list_t *dl = &sc->vmm_devmem_list; 139 140 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 141 if (ent->vde_segid == segid) { 142 return (ent); 143 } 144 } 145 return (NULL); 146 } 147 148 static int 149 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 150 { 151 int error; 152 bool sysmem; 153 154 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 155 NULL); 156 if (error || mseg->len == 0) 157 return (error); 158 159 if (!sysmem) { 160 vmm_devmem_entry_t *de; 161 162 de = vmmdev_devmem_find(sc, mseg->segid); 163 if (de != NULL) { 164 (void) strlcpy(mseg->name, de->vde_name, 165 sizeof (mseg->name)); 166 } 167 } else { 168 bzero(mseg->name, sizeof (mseg->name)); 169 } 170 171 return (error); 172 } 173 174 static int 175 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 176 { 177 off_t map_offset; 178 vmm_devmem_entry_t *entry; 179 180 if (list_is_empty(&sc->vmm_devmem_list)) { 181 map_offset = VM_DEVMEM_START; 182 } else { 183 entry = list_tail(&sc->vmm_devmem_list); 184 map_offset = entry->vde_off + entry->vde_len; 185 if (map_offset < entry->vde_off) { 186 /* Do not tolerate overflow */ 187 return (ERANGE); 188 } 189 /* 190 * XXXJOY: We could choose to search the list for duplicate 191 * names and toss an error. 
Since we're using the offset 192 * method for now, it does not make much of a difference. 193 */ 194 } 195 196 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 197 entry->vde_segid = mseg->segid; 198 entry->vde_len = mseg->len; 199 entry->vde_off = map_offset; 200 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 201 list_insert_tail(&sc->vmm_devmem_list, entry); 202 203 return (0); 204 } 205 206 static boolean_t 207 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 208 off_t *map_offp) 209 { 210 list_t *dl = &sc->vmm_devmem_list; 211 vmm_devmem_entry_t *de = NULL; 212 const off_t map_end = off + len; 213 214 VERIFY(off >= VM_DEVMEM_START); 215 216 if (map_end < off) { 217 /* No match on overflow */ 218 return (B_FALSE); 219 } 220 221 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 222 const off_t item_end = de->vde_off + de->vde_len; 223 224 if (de->vde_off <= off && item_end >= map_end) { 225 *segidp = de->vde_segid; 226 *map_offp = off - de->vde_off; 227 return (B_TRUE); 228 } 229 } 230 return (B_FALSE); 231 } 232 233 static void 234 vmmdev_devmem_purge(vmm_softc_t *sc) 235 { 236 vmm_devmem_entry_t *entry; 237 238 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 239 kmem_free(entry, sizeof (*entry)); 240 } 241 } 242 243 static int 244 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 245 { 246 int error; 247 bool sysmem = true; 248 249 if (VM_MEMSEG_NAME(mseg)) { 250 sysmem = false; 251 } 252 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 253 254 if (error == 0) { 255 /* 256 * Rather than create a whole fresh device from which userspace 257 * can mmap this segment, instead make it available at an 258 * offset above where the main guest memory resides. 
259 */ 260 error = vmmdev_devmem_create(sc, mseg, mseg->name); 261 if (error != 0) { 262 vm_free_memseg(sc->vmm_vm, mseg->segid); 263 } 264 } 265 return (error); 266 } 267 268 /* 269 * Resource Locking and Exclusion 270 * 271 * Much of bhyve depends on key portions of VM state, such as the guest memory 272 * map, to remain unchanged while the guest is running. As ported from 273 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 274 * access to the instance vCPUs. Threads acting on a single vCPU, like those 275 * performing the work of actually running the guest in VMX/SVM, would lock 276 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 277 * state, all of the vCPUs would be first locked, ensuring that the 278 * operation(s) could complete without any other threads stumbling into 279 * intermediate states. 280 * 281 * This approach is largely effective for bhyve. Common operations, such as 282 * running the vCPUs, steer clear of lock contention. The model begins to 283 * break down for operations which do not occur in the context of a specific 284 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 285 * thread in the bhyve process. In order to properly protect those vCPU-less 286 * operations from encountering invalid states, additional locking is required. 287 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 288 * It does mean that class of operations will be serialized on locking the 289 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 290 * undue contention on the VM_MAXCPU-1 vCPU. 291 * 292 * In order to address the shortcomings of this model, the concept of a 293 * read/write lock has been added to bhyve. Operations which change 294 * fundamental aspects of a VM (such as the memory map) must acquire the write 295 * lock, which also implies locking all of the vCPUs and waiting for all read 296 * lock holders to release. 
While it increases the cost and waiting time for 297 * those few operations, it allows most hot-path operations on the VM (which 298 * depend on its configuration remaining stable) to occur with minimal locking. 299 * 300 * Consumers of the Driver API (see below) are a special case when it comes to 301 * this locking, since they may hold a read lock via the drv_lease mechanism 302 * for an extended period of time. Rather than forcing those consumers to 303 * continuously poll for a write lock attempt, the lease system forces them to 304 * provide a release callback to trigger their clean-up (and potential later 305 * reacquisition) of the read lock. 306 */ 307 308 static void 309 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 310 { 311 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 312 313 /* 314 * Since this state transition is utilizing from_idle=true, it should 315 * not fail, but rather block until it can be successful. 316 */ 317 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 318 } 319 320 static void 321 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 322 { 323 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 324 325 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 326 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 327 } 328 329 static void 330 vmm_read_lock(vmm_softc_t *sc) 331 { 332 rw_enter(&sc->vmm_rwlock, RW_READER); 333 } 334 335 static void 336 vmm_read_unlock(vmm_softc_t *sc) 337 { 338 rw_exit(&sc->vmm_rwlock); 339 } 340 341 static void 342 vmm_write_lock(vmm_softc_t *sc) 343 { 344 int maxcpus; 345 346 /* First lock all the vCPUs */ 347 maxcpus = vm_get_maxcpus(sc->vmm_vm); 348 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 349 vcpu_lock_one(sc, vcpu); 350 } 351 352 /* 353 * Block vmm_drv leases from being acquired or held while the VM write 354 * lock is held. 
355 */ 356 vmm_lease_block(sc); 357 358 rw_enter(&sc->vmm_rwlock, RW_WRITER); 359 /* 360 * For now, the 'maxcpus' value for an instance is fixed at the 361 * compile-time constant of VM_MAXCPU at creation. If this changes in 362 * the future, allowing for dynamic vCPU resource sizing, acquisition 363 * of the write lock will need to be wary of such changes. 364 */ 365 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 366 } 367 368 static void 369 vmm_write_unlock(vmm_softc_t *sc) 370 { 371 int maxcpus; 372 373 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 374 vmm_lease_unblock(sc); 375 376 /* 377 * The VM write lock _must_ be released from the same thread it was 378 * acquired in, unlike the read lock. 379 */ 380 VERIFY(rw_write_held(&sc->vmm_rwlock)); 381 rw_exit(&sc->vmm_rwlock); 382 383 /* Unlock all the vCPUs */ 384 maxcpus = vm_get_maxcpus(sc->vmm_vm); 385 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 386 vcpu_unlock_one(sc, vcpu); 387 } 388 } 389 390 static int 391 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 392 cred_t *credp, int *rvalp) 393 { 394 int error = 0, vcpu = -1; 395 void *datap = (void *)arg; 396 enum vm_lock_type { 397 LOCK_NONE = 0, 398 LOCK_VCPU, 399 LOCK_READ_HOLD, 400 LOCK_WRITE_HOLD 401 } lock_type = LOCK_NONE; 402 403 /* Acquire any exclusion resources needed for the operation. 
*/ 404 switch (cmd) { 405 case VM_RUN: 406 case VM_GET_REGISTER: 407 case VM_SET_REGISTER: 408 case VM_GET_SEGMENT_DESCRIPTOR: 409 case VM_SET_SEGMENT_DESCRIPTOR: 410 case VM_GET_REGISTER_SET: 411 case VM_SET_REGISTER_SET: 412 case VM_INJECT_EXCEPTION: 413 case VM_GET_CAPABILITY: 414 case VM_SET_CAPABILITY: 415 case VM_PPTDEV_MSI: 416 case VM_PPTDEV_MSIX: 417 case VM_SET_X2APIC_STATE: 418 case VM_GLA2GPA: 419 case VM_GLA2GPA_NOFAULT: 420 case VM_ACTIVATE_CPU: 421 case VM_SET_INTINFO: 422 case VM_GET_INTINFO: 423 case VM_RESTART_INSTRUCTION: 424 case VM_SET_KERNEMU_DEV: 425 case VM_GET_KERNEMU_DEV: 426 case VM_RESET_CPU: 427 case VM_GET_RUN_STATE: 428 case VM_SET_RUN_STATE: 429 case VM_GET_FPU: 430 case VM_SET_FPU: 431 /* 432 * Copy in the ID of the vCPU chosen for this operation. 433 * Since a nefarious caller could update their struct between 434 * this locking and when the rest of the ioctl data is copied 435 * in, it is _critical_ that this local 'vcpu' variable be used 436 * rather than the in-struct one when performing the ioctl. 
437 */ 438 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 439 return (EFAULT); 440 } 441 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 442 return (EINVAL); 443 } 444 vcpu_lock_one(sc, vcpu); 445 lock_type = LOCK_VCPU; 446 break; 447 448 case VM_REINIT: 449 case VM_BIND_PPTDEV: 450 case VM_UNBIND_PPTDEV: 451 case VM_MAP_PPTDEV_MMIO: 452 case VM_UNMAP_PPTDEV_MMIO: 453 case VM_ALLOC_MEMSEG: 454 case VM_MMAP_MEMSEG: 455 case VM_MUNMAP_MEMSEG: 456 case VM_WRLOCK_CYCLE: 457 case VM_PMTMR_LOCATE: 458 vmm_write_lock(sc); 459 lock_type = LOCK_WRITE_HOLD; 460 break; 461 462 case VM_GET_MEMSEG: 463 case VM_MMAP_GETNEXT: 464 case VM_LAPIC_IRQ: 465 case VM_INJECT_NMI: 466 case VM_IOAPIC_ASSERT_IRQ: 467 case VM_IOAPIC_DEASSERT_IRQ: 468 case VM_IOAPIC_PULSE_IRQ: 469 case VM_LAPIC_MSI: 470 case VM_LAPIC_LOCAL_IRQ: 471 case VM_GET_X2APIC_STATE: 472 case VM_RTC_READ: 473 case VM_RTC_WRITE: 474 case VM_RTC_SETTIME: 475 case VM_RTC_GETTIME: 476 case VM_PPTDEV_DISABLE_MSIX: 477 case VM_DEVMEM_GETOFFSET: 478 case VM_TRACK_DIRTY_PAGES: 479 vmm_read_lock(sc); 480 lock_type = LOCK_READ_HOLD; 481 break; 482 483 case VM_DATA_READ: 484 case VM_DATA_WRITE: 485 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 486 return (EFAULT); 487 } 488 if (vcpu == -1) { 489 /* Access data for VM-wide devices */ 490 vmm_write_lock(sc); 491 lock_type = LOCK_WRITE_HOLD; 492 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 493 /* Access data associated with a specific vCPU */ 494 vcpu_lock_one(sc, vcpu); 495 lock_type = LOCK_VCPU; 496 } else { 497 return (EINVAL); 498 } 499 break; 500 501 case VM_GET_GPA_PMAP: 502 case VM_IOAPIC_PINCOUNT: 503 case VM_SUSPEND: 504 case VM_DESC_FPU_AREA: 505 default: 506 break; 507 } 508 509 /* Execute the primary logic for the ioctl. 
*/ 510 switch (cmd) { 511 case VM_RUN: { 512 struct vm_entry entry; 513 514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 515 error = EFAULT; 516 break; 517 } 518 519 if (!(curthread->t_schedflag & TS_VCPU)) 520 smt_mark_as_vcpu(); 521 522 error = vm_run(sc->vmm_vm, vcpu, &entry); 523 524 /* 525 * Unexpected states in vm_run() are expressed through positive 526 * errno-oriented return values. VM states which expect further 527 * processing in userspace (necessary context via exitinfo) are 528 * expressed through negative return values. For the time being 529 * a return value of 0 is not expected from vm_run(). 530 */ 531 ASSERT(error != 0); 532 if (error < 0) { 533 const struct vm_exit *vme; 534 void *outp = entry.exit_data; 535 536 error = 0; 537 vme = vm_exitinfo(sc->vmm_vm, vcpu); 538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 539 error = EFAULT; 540 } 541 } 542 break; 543 } 544 case VM_SUSPEND: { 545 struct vm_suspend vmsuspend; 546 547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 548 error = EFAULT; 549 break; 550 } 551 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 552 break; 553 } 554 case VM_REINIT: { 555 struct vm_reinit reinit; 556 557 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 558 error = EFAULT; 559 break; 560 } 561 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 562 /* 563 * The VM instance should be free of driver-attached 564 * hooks during the reinitialization process. 
565 */ 566 break; 567 } 568 error = vm_reinit(sc->vmm_vm, reinit.flags); 569 (void) vmm_drv_block_hook(sc, B_FALSE); 570 break; 571 } 572 case VM_STAT_DESC: { 573 struct vm_stat_desc statdesc; 574 575 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 576 error = EFAULT; 577 break; 578 } 579 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 580 sizeof (statdesc.desc)); 581 if (error == 0 && 582 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 583 error = EFAULT; 584 break; 585 } 586 break; 587 } 588 case VM_STATS_IOC: { 589 struct vm_stats vmstats; 590 591 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 592 error = EFAULT; 593 break; 594 } 595 hrt2tv(gethrtime(), &vmstats.tv); 596 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 597 nitems(vmstats.statbuf), 598 &vmstats.num_entries, vmstats.statbuf); 599 if (error == 0 && 600 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 601 error = EFAULT; 602 break; 603 } 604 break; 605 } 606 607 case VM_PPTDEV_MSI: { 608 struct vm_pptdev_msi pptmsi; 609 610 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 611 error = EFAULT; 612 break; 613 } 614 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 615 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 616 break; 617 } 618 case VM_PPTDEV_MSIX: { 619 struct vm_pptdev_msix pptmsix; 620 621 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 622 error = EFAULT; 623 break; 624 } 625 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 626 pptmsix.idx, pptmsix.addr, pptmsix.msg, 627 pptmsix.vector_control); 628 break; 629 } 630 case VM_PPTDEV_DISABLE_MSIX: { 631 struct vm_pptdev pptdev; 632 633 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 634 error = EFAULT; 635 break; 636 } 637 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 638 break; 639 } 640 case VM_MAP_PPTDEV_MMIO: { 641 struct vm_pptdev_mmio pptmmio; 642 643 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 644 error = EFAULT; 645 
break; 646 } 647 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 648 pptmmio.len, pptmmio.hpa); 649 break; 650 } 651 case VM_UNMAP_PPTDEV_MMIO: { 652 struct vm_pptdev_mmio pptmmio; 653 654 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 655 error = EFAULT; 656 break; 657 } 658 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 659 pptmmio.len); 660 break; 661 } 662 case VM_BIND_PPTDEV: { 663 struct vm_pptdev pptdev; 664 665 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 666 error = EFAULT; 667 break; 668 } 669 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 670 break; 671 } 672 case VM_UNBIND_PPTDEV: { 673 struct vm_pptdev pptdev; 674 675 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 676 error = EFAULT; 677 break; 678 } 679 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 680 break; 681 } 682 case VM_GET_PPTDEV_LIMITS: { 683 struct vm_pptdev_limits pptlimits; 684 685 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 686 error = EFAULT; 687 break; 688 } 689 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 690 &pptlimits.msi_limit, &pptlimits.msix_limit); 691 if (error == 0 && 692 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 693 error = EFAULT; 694 break; 695 } 696 break; 697 } 698 case VM_INJECT_EXCEPTION: { 699 struct vm_exception vmexc; 700 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 701 error = EFAULT; 702 break; 703 } 704 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 705 vmexc.error_code_valid != 0, vmexc.error_code, 706 vmexc.restart_instruction != 0); 707 break; 708 } 709 case VM_INJECT_NMI: { 710 struct vm_nmi vmnmi; 711 712 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 713 error = EFAULT; 714 break; 715 } 716 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 717 break; 718 } 719 case VM_LAPIC_IRQ: { 720 struct vm_lapic_irq vmirq; 721 722 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 723 error = EFAULT; 724 break; 725 } 726 error = 
lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 727 break; 728 } 729 case VM_LAPIC_LOCAL_IRQ: { 730 struct vm_lapic_irq vmirq; 731 732 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 733 error = EFAULT; 734 break; 735 } 736 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 737 vmirq.vector); 738 break; 739 } 740 case VM_LAPIC_MSI: { 741 struct vm_lapic_msi vmmsi; 742 743 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 748 break; 749 } 750 751 case VM_IOAPIC_ASSERT_IRQ: { 752 struct vm_ioapic_irq ioapic_irq; 753 754 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 755 error = EFAULT; 756 break; 757 } 758 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 759 break; 760 } 761 case VM_IOAPIC_DEASSERT_IRQ: { 762 struct vm_ioapic_irq ioapic_irq; 763 764 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 765 error = EFAULT; 766 break; 767 } 768 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 769 break; 770 } 771 case VM_IOAPIC_PULSE_IRQ: { 772 struct vm_ioapic_irq ioapic_irq; 773 774 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 775 error = EFAULT; 776 break; 777 } 778 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 779 break; 780 } 781 case VM_IOAPIC_PINCOUNT: { 782 int pincount; 783 784 pincount = vioapic_pincount(sc->vmm_vm); 785 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 786 error = EFAULT; 787 break; 788 } 789 break; 790 } 791 case VM_DESC_FPU_AREA: { 792 struct vm_fpu_desc desc; 793 void *buf = NULL; 794 795 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 796 error = EFAULT; 797 break; 798 } 799 if (desc.vfd_num_entries > 64) { 800 error = EINVAL; 801 break; 802 } 803 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 804 desc.vfd_num_entries; 805 if (buf_sz != 0) { 806 buf = kmem_zalloc(buf_sz, KM_SLEEP); 807 } 808 809 /* 810 * For now, we are depending on 
vm_fpu_desc_entry and 811 * hma_xsave_state_desc_t having the same format. 812 */ 813 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 814 sizeof (hma_xsave_state_desc_t)); 815 816 size_t req_size; 817 const uint_t max_entries = hma_fpu_describe_xsave_state( 818 (hma_xsave_state_desc_t *)buf, 819 desc.vfd_num_entries, 820 &req_size); 821 822 desc.vfd_req_size = req_size; 823 desc.vfd_num_entries = max_entries; 824 if (buf_sz != 0) { 825 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 826 error = EFAULT; 827 } 828 kmem_free(buf, buf_sz); 829 } 830 831 if (error == 0) { 832 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 833 error = EFAULT; 834 } 835 } 836 break; 837 } 838 839 case VM_ISA_ASSERT_IRQ: { 840 struct vm_isa_irq isa_irq; 841 842 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 843 error = EFAULT; 844 break; 845 } 846 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 847 if (error == 0 && isa_irq.ioapic_irq != -1) { 848 error = vioapic_assert_irq(sc->vmm_vm, 849 isa_irq.ioapic_irq); 850 } 851 break; 852 } 853 case VM_ISA_DEASSERT_IRQ: { 854 struct vm_isa_irq isa_irq; 855 856 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 857 error = EFAULT; 858 break; 859 } 860 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 861 if (error == 0 && isa_irq.ioapic_irq != -1) { 862 error = vioapic_deassert_irq(sc->vmm_vm, 863 isa_irq.ioapic_irq); 864 } 865 break; 866 } 867 case VM_ISA_PULSE_IRQ: { 868 struct vm_isa_irq isa_irq; 869 870 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 871 error = EFAULT; 872 break; 873 } 874 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 875 if (error == 0 && isa_irq.ioapic_irq != -1) { 876 error = vioapic_pulse_irq(sc->vmm_vm, 877 isa_irq.ioapic_irq); 878 } 879 break; 880 } 881 case VM_ISA_SET_IRQ_TRIGGER: { 882 struct vm_isa_irq_trigger isa_irq_trigger; 883 884 if (ddi_copyin(datap, &isa_irq_trigger, 885 sizeof (isa_irq_trigger), md)) { 886 error = EFAULT; 887 break; 888 } 889 
error = vatpic_set_irq_trigger(sc->vmm_vm, 890 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 891 break; 892 } 893 894 case VM_MMAP_GETNEXT: { 895 struct vm_memmap mm; 896 897 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 898 error = EFAULT; 899 break; 900 } 901 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 902 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 903 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 904 error = EFAULT; 905 break; 906 } 907 break; 908 } 909 case VM_MMAP_MEMSEG: { 910 struct vm_memmap mm; 911 912 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 913 error = EFAULT; 914 break; 915 } 916 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 917 mm.len, mm.prot, mm.flags); 918 break; 919 } 920 case VM_MUNMAP_MEMSEG: { 921 struct vm_munmap mu; 922 923 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 924 error = EFAULT; 925 break; 926 } 927 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 928 break; 929 } 930 case VM_ALLOC_MEMSEG: { 931 struct vm_memseg vmseg; 932 933 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 934 error = EFAULT; 935 break; 936 } 937 error = vmmdev_alloc_memseg(sc, &vmseg); 938 break; 939 } 940 case VM_GET_MEMSEG: { 941 struct vm_memseg vmseg; 942 943 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 944 error = EFAULT; 945 break; 946 } 947 error = vmmdev_get_memseg(sc, &vmseg); 948 if (error == 0 && 949 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 950 error = EFAULT; 951 break; 952 } 953 break; 954 } 955 case VM_GET_REGISTER: { 956 struct vm_register vmreg; 957 958 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { 959 error = EFAULT; 960 break; 961 } 962 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, 963 &vmreg.regval); 964 if (error == 0 && 965 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { 966 error = EFAULT; 967 break; 968 } 969 break; 970 } 971 case VM_SET_REGISTER: { 972 struct vm_register vmreg; 973 974 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) 
{ 975 error = EFAULT; 976 break; 977 } 978 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, 979 vmreg.regval); 980 break; 981 } 982 case VM_SET_SEGMENT_DESCRIPTOR: { 983 struct vm_seg_desc vmsegd; 984 985 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 986 error = EFAULT; 987 break; 988 } 989 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 990 &vmsegd.desc); 991 break; 992 } 993 case VM_GET_SEGMENT_DESCRIPTOR: { 994 struct vm_seg_desc vmsegd; 995 996 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { 997 error = EFAULT; 998 break; 999 } 1000 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, 1001 &vmsegd.desc); 1002 if (error == 0 && 1003 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { 1004 error = EFAULT; 1005 break; 1006 } 1007 break; 1008 } 1009 case VM_GET_REGISTER_SET: { 1010 struct vm_register_set vrs; 1011 int regnums[VM_REG_LAST]; 1012 uint64_t regvals[VM_REG_LAST]; 1013 1014 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1015 error = EFAULT; 1016 break; 1017 } 1018 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1019 error = EINVAL; 1020 break; 1021 } 1022 if (ddi_copyin(vrs.regnums, regnums, 1023 sizeof (int) * vrs.count, md)) { 1024 error = EFAULT; 1025 break; 1026 } 1027 1028 error = 0; 1029 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1030 if (regnums[i] < 0) { 1031 error = EINVAL; 1032 break; 1033 } 1034 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], 1035 ®vals[i]); 1036 } 1037 if (error == 0 && ddi_copyout(regvals, vrs.regvals, 1038 sizeof (uint64_t) * vrs.count, md)) { 1039 error = EFAULT; 1040 } 1041 break; 1042 } 1043 case VM_SET_REGISTER_SET: { 1044 struct vm_register_set vrs; 1045 int regnums[VM_REG_LAST]; 1046 uint64_t regvals[VM_REG_LAST]; 1047 1048 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1049 error = EFAULT; 1050 break; 1051 } 1052 if (vrs.count > VM_REG_LAST || vrs.count == 0) { 1053 error = EINVAL; 1054 break; 1055 } 1056 if (ddi_copyin(vrs.regnums, regnums, 1057 sizeof (int) 
* vrs.count, md)) { 1058 error = EFAULT; 1059 break; 1060 } 1061 if (ddi_copyin(vrs.regvals, regvals, 1062 sizeof (uint64_t) * vrs.count, md)) { 1063 error = EFAULT; 1064 break; 1065 } 1066 1067 error = 0; 1068 for (uint_t i = 0; i < vrs.count && error == 0; i++) { 1069 /* 1070 * Setting registers in a set is not atomic, since a 1071 * failure in the middle of the set will cause a 1072 * bail-out and inconsistent register state. Callers 1073 * should be wary of this. 1074 */ 1075 if (regnums[i] < 0) { 1076 error = EINVAL; 1077 break; 1078 } 1079 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1080 regvals[i]); 1081 } 1082 break; 1083 } 1084 case VM_RESET_CPU: { 1085 struct vm_vcpu_reset vvr; 1086 1087 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1088 error = EFAULT; 1089 break; 1090 } 1091 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1092 error = EINVAL; 1093 } 1094 1095 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1096 break; 1097 } 1098 case VM_GET_RUN_STATE: { 1099 struct vm_run_state vrs; 1100 1101 bzero(&vrs, sizeof (vrs)); 1102 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1103 &vrs.sipi_vector); 1104 if (error == 0) { 1105 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1106 error = EFAULT; 1107 break; 1108 } 1109 } 1110 break; 1111 } 1112 case VM_SET_RUN_STATE: { 1113 struct vm_run_state vrs; 1114 1115 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1116 error = EFAULT; 1117 break; 1118 } 1119 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1120 vrs.sipi_vector); 1121 break; 1122 } 1123 case VM_GET_FPU: { 1124 struct vm_fpu_state req; 1125 const size_t max_len = (PAGESIZE * 2); 1126 void *kbuf; 1127 1128 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1129 error = EFAULT; 1130 break; 1131 } 1132 if (req.len > max_len || req.len == 0) { 1133 error = EINVAL; 1134 break; 1135 } 1136 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1137 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1138 if (error == 0) { 
1139 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1140 error = EFAULT; 1141 } 1142 } 1143 kmem_free(kbuf, req.len); 1144 break; 1145 } 1146 case VM_SET_FPU: { 1147 struct vm_fpu_state req; 1148 const size_t max_len = (PAGESIZE * 2); 1149 void *kbuf; 1150 1151 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1152 error = EFAULT; 1153 break; 1154 } 1155 if (req.len > max_len || req.len == 0) { 1156 error = EINVAL; 1157 break; 1158 } 1159 kbuf = kmem_alloc(req.len, KM_SLEEP); 1160 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1161 error = EFAULT; 1162 } else { 1163 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1164 } 1165 kmem_free(kbuf, req.len); 1166 break; 1167 } 1168 1169 case VM_SET_KERNEMU_DEV: 1170 case VM_GET_KERNEMU_DEV: { 1171 struct vm_readwrite_kernemu_device kemu; 1172 size_t size = 0; 1173 1174 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1175 error = EFAULT; 1176 break; 1177 } 1178 1179 if (kemu.access_width > 3) { 1180 error = EINVAL; 1181 break; 1182 } 1183 size = (1 << kemu.access_width); 1184 ASSERT(size >= 1 && size <= 8); 1185 1186 if (cmd == VM_SET_KERNEMU_DEV) { 1187 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1188 kemu.gpa, kemu.value, size); 1189 } else { 1190 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1191 kemu.gpa, &kemu.value, size); 1192 } 1193 1194 if (error == 0) { 1195 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1196 error = EFAULT; 1197 break; 1198 } 1199 } 1200 break; 1201 } 1202 1203 case VM_GET_CAPABILITY: { 1204 struct vm_capability vmcap; 1205 1206 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1207 error = EFAULT; 1208 break; 1209 } 1210 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1211 &vmcap.capval); 1212 if (error == 0 && 1213 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1214 error = EFAULT; 1215 break; 1216 } 1217 break; 1218 } 1219 case VM_SET_CAPABILITY: { 1220 struct vm_capability vmcap; 1221 1222 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1223 error = 
			    EFAULT;
			break;
		}
		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
		    vmcap.capval);
		break;
	}
	case VM_SET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
		break;
	}
	case VM_GET_X2APIC_STATE: {
		struct vm_x2apic x2apic;

		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		/*
		 * NOTE(review): the get path targets x2apic.cpuid from the
		 * request, while the set path above uses the ioctl 'vcpu' -
		 * confirm this asymmetry is intentional.
		 */
		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
		    &x2apic.state);
		if (error == 0 &&
		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_GPA_PMAP: {
		/*
		 * Until there is a necessity to leak EPT/RVI PTE values to
		 * userspace, this will remain unimplemented
		 */
		error = EINVAL;
		break;
	}
	case VM_GET_HPET_CAPABILITIES: {
		struct vm_hpet_cap hpetcap;

		error = vhpet_getcap(&hpetcap);
		if (error == 0 &&
		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		/* The translation is always performed for the locked vcpu. */
		gg.vcpuid = vcpu;
		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
		    gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GLA2GPA_NOFAULT: {
		struct vm_gla2gpa gg;

		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
			error = EFAULT;
			break;
		}
		gg.vcpuid = vcpu;
		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
			error = EFAULT;
			break;
1304 } 1305 break; 1306 } 1307 1308 case VM_ACTIVATE_CPU: 1309 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1310 break; 1311 1312 case VM_SUSPEND_CPU: 1313 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1314 error = EFAULT; 1315 } else { 1316 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1317 } 1318 break; 1319 1320 case VM_RESUME_CPU: 1321 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1322 error = EFAULT; 1323 } else { 1324 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1325 } 1326 break; 1327 1328 case VM_GET_CPUS: { 1329 struct vm_cpuset vm_cpuset; 1330 cpuset_t tempset; 1331 void *srcp = &tempset; 1332 int size; 1333 1334 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1335 error = EFAULT; 1336 break; 1337 } 1338 1339 /* Be more generous about sizing since our cpuset_t is large. */ 1340 size = vm_cpuset.cpusetsize; 1341 if (size <= 0 || size > sizeof (cpuset_t)) { 1342 error = ERANGE; 1343 } 1344 /* 1345 * If they want a ulong_t or less, make sure they receive the 1346 * low bits with all the useful information. 
		 */
		if (size <= sizeof (tempset.cpub[0])) {
			srcp = &tempset.cpub[0];
		}

		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
			tempset = vm_active_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
			tempset = vm_suspended_cpus(sc->vmm_vm);
		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
			tempset = vm_debug_cpus(sc->vmm_vm);
		} else {
			error = EINVAL;
		}

		ASSERT(size > 0 && size <= sizeof (tempset));
		if (error == 0 &&
		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_INTINFO: {
		struct vm_intinfo vmii;

		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
		break;
	}
	case VM_GET_INTINFO: {
		struct vm_intinfo vmii;

		vmii.vcpuid = vcpu;
		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
		    &vmii.info2);
		if (error == 0 &&
		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_WRITE: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		/* Single-byte access into the emulated RTC NVRAM. */
		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
		    rtcdata.value);
		break;
	}
	case VM_RTC_READ: {
		struct vm_rtc_data rtcdata;

		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
		    &rtcdata.value);
		if (error == 0 &&
		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_RTC_SETTIME: {
		struct vm_rtc_time rtctime;

		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		error =
		    vrtc_set_time(sc->vmm_vm, rtctime.secs);
		break;
	}
	case VM_RTC_GETTIME: {
		struct vm_rtc_time rtctime;

		rtctime.secs = vrtc_get_time(sc->vmm_vm);
		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
			error = EFAULT;
			break;
		}
		break;
	}

	case VM_PMTMR_LOCATE: {
		/* The ioctl argument itself carries the port number. */
		uint16_t port = arg;
		error = vpmtmr_set_location(sc->vmm_vm, port);
		break;
	}

	case VM_RESTART_INSTRUCTION:
		error = vm_restart_instruction(sc->vmm_vm, vcpu);
		break;

	case VM_SET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
		    topo.threads, topo.maxcpus);
		break;
	}
	case VM_GET_TOPOLOGY: {
		struct vm_cpu_topology topo;

		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
		    &topo.threads, &topo.maxcpus);
		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_DEVMEM_GETOFFSET: {
		struct vm_devmem_offset vdo;
		vmm_devmem_entry_t *de;

		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
			error = EFAULT;
			break;
		}

		/* Report the mmap(2) offset for the named devmem segment. */
		de = vmmdev_devmem_find(sc, vdo.segid);
		if (de != NULL) {
			vdo.offset = de->vde_off;
			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
				error = EFAULT;
			}
		} else {
			error = ENOENT;
		}
		break;
	}
	case VM_TRACK_DIRTY_PAGES: {
		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
		struct vmm_dirty_tracker tracker;
		uint8_t *bitmap;
		size_t len;

		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
			error = EFAULT;
			break;
		}
		/* Both the start GPA and the length must be page-aligned. */
		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
			error = EINVAL;
			break;
		}
		/* An empty region is a successful no-op. */
		if (tracker.vdt_len == 0) {
			break;
1509 } 1510 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1511 error = EINVAL; 1512 break; 1513 } 1514 if (tracker.vdt_len > max_track_region_len) { 1515 error = EINVAL; 1516 break; 1517 } 1518 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1519 bitmap = kmem_zalloc(len, KM_SLEEP); 1520 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1521 tracker.vdt_len, bitmap); 1522 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1523 error = EFAULT; 1524 } 1525 kmem_free(bitmap, len); 1526 1527 break; 1528 } 1529 case VM_WRLOCK_CYCLE: { 1530 /* 1531 * Present a test mechanism to acquire/release the write lock 1532 * on the VM without any other effects. 1533 */ 1534 break; 1535 } 1536 case VM_DATA_READ: { 1537 struct vm_data_xfer vdx; 1538 1539 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1540 error = EFAULT; 1541 break; 1542 } 1543 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1544 error = EINVAL; 1545 break; 1546 } 1547 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1548 error = EFBIG; 1549 break; 1550 } 1551 1552 const size_t len = vdx.vdx_len; 1553 void *buf = NULL; 1554 if (len != 0) { 1555 buf = kmem_alloc(len, KM_SLEEP); 1556 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1557 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1558 kmem_free(buf, len); 1559 error = EFAULT; 1560 break; 1561 } else { 1562 bzero(buf, len); 1563 } 1564 } 1565 1566 vdx.vdx_result_len = 0; 1567 vmm_data_req_t req = { 1568 .vdr_class = vdx.vdx_class, 1569 .vdr_version = vdx.vdx_version, 1570 .vdr_flags = vdx.vdx_flags, 1571 .vdr_len = len, 1572 .vdr_data = buf, 1573 .vdr_result_len = &vdx.vdx_result_len, 1574 }; 1575 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1576 1577 if (error == 0 && buf != NULL) { 1578 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1579 error = EFAULT; 1580 } 1581 } 1582 1583 /* 1584 * Copy out the transfer request so that the value of 1585 * vdx_result_len can be made available, regardless of any 1586 * error(s) which may have 
		 * occurred.
		 */
		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
			error = (error != 0) ? error : EFAULT;
		}

		if (buf != NULL) {
			kmem_free(buf, len);
		}
		break;
	}
	case VM_DATA_WRITE: {
		struct vm_data_xfer vdx;

		/* Copy in and validate the transfer request. */
		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
			error = EFAULT;
			break;
		}
		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
			error = EINVAL;
			break;
		}
		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
			error = EFBIG;
			break;
		}

		const size_t len = vdx.vdx_len;
		void *buf = NULL;
		if (len != 0) {
			buf = kmem_alloc(len, KM_SLEEP);
			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
				kmem_free(buf, len);
				error = EFAULT;
				break;
			}
		}

		vdx.vdx_result_len = 0;
		vmm_data_req_t req = {
			.vdr_class = vdx.vdx_class,
			.vdr_version = vdx.vdx_version,
			.vdr_flags = vdx.vdx_flags,
			.vdr_len = len,
			.vdr_data = buf,
			.vdr_result_len = &vdx.vdx_result_len,
		};
		if (vmm_allow_state_writes == 0) {
			/* XXX: Play it safe for now */
			error = EPERM;
		} else {
			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
			    &req);
		}

		if (error == 0 && buf != NULL &&
		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
				error = EFAULT;
			}
		}

		/*
		 * Copy out the transfer request so that the value of
		 * vdx_result_len can be made available, regardless of any
		 * error(s) which may have occurred.
		 */
		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
			error = (error != 0) ?
			    error : EFAULT;
		}

		if (buf != NULL) {
			kmem_free(buf, len);
		}
		break;
	}

	default:
		error = ENOTTY;
		break;
	}

	/* Release exclusion resources */
	switch (lock_type) {
	case LOCK_NONE:
		break;
	case LOCK_VCPU:
		vcpu_unlock_one(sc, vcpu);
		break;
	case LOCK_READ_HOLD:
		vmm_read_unlock(sc);
		break;
	case LOCK_WRITE_HOLD:
		vmm_write_unlock(sc);
		break;
	default:
		panic("unexpected lock type");
		break;
	}

	return (error);
}

/*
 * Find a VM instance by name in the global vmm_list.  Returns the matching
 * softc, or NULL if no instance carries that name.  Caller must hold vmm_mtx.
 */
static vmm_softc_t *
vmm_lookup(const char *name)
{
	list_t *vml = &vmm_list;
	vmm_softc_t *sc;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
		if (strcmp(sc->vmm_name, name) == 0) {
			break;
		}
	}

	return (sc);
}

/*
 * Acquire an HMA registration if not already held.
 */
static boolean_t
vmm_hma_acquire(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	/* First reference performs the actual registration. */
	if (vmmdev_hma_reg == NULL) {
		VERIFY3U(vmmdev_hma_ref, ==, 0);
		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
		if (vmmdev_hma_reg == NULL) {
			cmn_err(CE_WARN, "%s HMA registration failed.",
			    vmmdev_hvm_name);
			mutex_exit(&vmmdev_mtx);
			return (B_FALSE);
		}
	}

	vmmdev_hma_ref++;

	mutex_exit(&vmmdev_mtx);

	return (B_TRUE);
}

/*
 * Release the HMA registration if held and there are no remaining VMs.
 */
static void
vmm_hma_release(void)
{
	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));

	mutex_enter(&vmmdev_mtx);

	VERIFY3U(vmmdev_hma_ref, !=, 0);

	vmmdev_hma_ref--;

	/* Last reference out tears down the registration. */
	if (vmmdev_hma_ref == 0) {
		VERIFY(vmmdev_hma_reg != NULL);
		hma_unregister(vmmdev_hma_reg);
		vmmdev_hma_reg = NULL;
	}
	mutex_exit(&vmmdev_mtx);
}

/*
 * Create a new VM instance named req->name on behalf of credential 'cr',
 * allocating its minor node, kstats, and backing vm_t.  Returns 0 on success
 * or an errno (EINVAL/ENAMETOOLONG for bad names, EEXIST for duplicates,
 * ENXIO if HMA registration fails, ENOMEM on allocation failure).
 */
static int
vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
{
	vmm_softc_t *sc = NULL;
	minor_t minor;
	int error = ENOMEM;
	size_t len;
	const char *name = req->name;

	len = strnlen(name, VM_MAX_NAMELEN);
	if (len == 0) {
		return (EINVAL);
	}
	if (len >= VM_MAX_NAMELEN) {
		return (ENAMETOOLONG);
	}
	/* A '/' would escape the /dev/vmm namespace when the node is made. */
	if (strchr(name, '/') != NULL) {
		return (EINVAL);
	}

	if (!vmm_hma_acquire())
		return (ENXIO);

	mutex_enter(&vmm_mtx);

	/* Look for duplicate names */
	if (vmm_lookup(name) != NULL) {
		mutex_exit(&vmm_mtx);
		vmm_hma_release();
		return (EEXIST);
	}

	/* Allow only one instance per non-global zone.
 */
	if (!INGLOBALZONE(curproc)) {
		for (sc = list_head(&vmm_list); sc != NULL;
		    sc = list_next(&vmm_list, sc)) {
			if (sc->vmm_zone == curzone) {
				mutex_exit(&vmm_mtx);
				vmm_hma_release();
				return (EINVAL);
			}
		}
	}

	minor = id_alloc(vmm_minors);
	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
		goto fail;
	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
		goto fail;
	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
		goto fail;
	}

	error = vm_create(req->flags, &sc->vmm_vm);
	if (error == 0) {
		/* Complete VM initialization and report success. */
		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
		sc->vmm_minor = minor;
		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
		    offsetof(vmm_devmem_entry_t, vde_node));

		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
		    offsetof(vmm_hold_t, vmh_node));
		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);

		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
		    offsetof(vmm_lease_t, vml_node));
		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);

		sc->vmm_zone = crgetzone(cr);
		zone_hold(sc->vmm_zone);
		vmm_zsd_add_vm(sc);
		vmm_kstat_init(sc);

		list_insert_tail(&vmm_list, sc);
		mutex_exit(&vmm_mtx);
		return (0);
	}

	/* vm_create() failed: unwind the kstat and minor-node allocations. */
	vmm_kstat_fini(sc);
	ddi_remove_minor_node(vmmdev_dip, name);
fail:
	id_free(vmm_minors, minor);
	if (sc != NULL) {
		ddi_soft_state_free(vmm_statep, minor);
	}
	mutex_exit(&vmm_mtx);
	vmm_hma_release();

	return (error);
}

/*
 * Bhyve 'Driver' Interface
 *
 * While many devices are emulated in the bhyve userspace process, there are
 * others with performance constraints which require that they run mostly or
 * entirely in-kernel.  For those not integrated directly into bhyve, an API is
 * needed so they can query/manipulate the portions of VM state needed to
 * fulfill their purpose.
 *
 * This includes:
 * - Translating guest-physical addresses to host-virtual pointers
 * - Injecting MSIs
 * - Hooking IO port addresses
 *
 * The vmm_drv interface exists to provide that functionality to its consumers.
 * (At this time, 'viona' is the only user)
 */
int
vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
	vnode_t *vp = fp->f_vnode;
	const dev_t dev = vp->v_rdev;
	vmm_softc_t *sc;
	vmm_hold_t *hold;
	int err = 0;

	if (vp->v_type != VCHR) {
		return (ENXIO);
	}
	const major_t major = getmajor(dev);
	const minor_t minor = getminor(dev);

	mutex_enter(&vmmdev_mtx);
	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
		mutex_exit(&vmmdev_mtx);
		return (ENOENT);
	}
	/* Hand-over-hand: take vmm_mtx before dropping vmmdev_mtx. */
	mutex_enter(&vmm_mtx);
	mutex_exit(&vmmdev_mtx);

	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		err = ENOENT;
		goto out;
	}
	/* XXXJOY: check cred permissions against instance */

	/* Refuse new holds on instances which are being torn down. */
	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
		err = EBUSY;
		goto out;
	}

	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
	hold->vmh_sc = sc;
	hold->vmh_release_req = B_FALSE;

	list_insert_tail(&sc->vmm_holds, hold);
	sc->vmm_flags |= VMM_HELD;
	*holdp = hold;

out:
	mutex_exit(&vmm_mtx);
	return (err);
}

/*
 * Release a hold acquired via vmm_drv_hold().  All ioport hooks installed
 * through this hold must have been removed first.
 */
void
vmm_drv_rele(vmm_hold_t *hold)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(hold->vmh_sc !=
	    NULL);
	VERIFY(hold->vmh_ioport_hook_cnt == 0);

	mutex_enter(&vmm_mtx);
	sc = hold->vmh_sc;
	list_remove(&sc->vmm_holds, hold);
	if (list_is_empty(&sc->vmm_holds)) {
		/* Wake anyone in vmm_drv_purge() waiting for holds to drain. */
		sc->vmm_flags &= ~VMM_HELD;
		cv_broadcast(&sc->vmm_cv);
	}
	mutex_exit(&vmm_mtx);
	kmem_free(hold, sizeof (*hold));
}

/*
 * Report whether a release of this hold has been requested (e.g. because the
 * instance is being purged for destruction).
 */
boolean_t
vmm_drv_release_reqd(vmm_hold_t *hold)
{
	ASSERT(hold != NULL);

	return (hold->vmh_release_req);
}

/*
 * Establish a lease against the VM, taking the VM read lock.  The 'expiref'
 * callback is invoked when the lease must be broken; its return value
 * indicates whether the lease was broken synchronously.  Returns NULL if the
 * hold is already marked for release.
 */
vmm_lease_t *
vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
{
	vmm_softc_t *sc = hold->vmh_sc;
	vmm_lease_t *lease;

	ASSERT3P(expiref, !=, NULL);

	if (hold->vmh_release_req) {
		return (NULL);
	}

	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
	list_link_init(&lease->vml_node);
	lease->vml_expire_func = expiref;
	lease->vml_expire_arg = arg;
	lease->vml_expired = B_FALSE;
	lease->vml_break_deferred = B_FALSE;
	lease->vml_hold = hold;
	/* cache the VM pointer for one less pointer chase */
	lease->vml_vm = sc->vmm_vm;
	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));

	mutex_enter(&sc->vmm_lease_lock);
	/* New leases must wait out any active vmm_lease_block(). */
	while (sc->vmm_lease_blocker != 0) {
		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
	}
	list_insert_tail(&sc->vmm_lease_list, lease);
	vmm_read_lock(sc);
	mutex_exit(&sc->vmm_lease_lock);

	return (lease);
}

/*
 * Tear down a lease: remove it from the lease list, drop the VM read lock,
 * and free it.  Caller must hold vmm_lease_lock.
 */
static void
vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
{
	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));

	list_remove(&sc->vmm_lease_list, lease);
	vmm_read_unlock(sc);
	vmc_destroy(lease->vml_vmclient);
	kmem_free(lease, sizeof (*lease));
}

static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if
	    (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		/* First blocker: expire every outstanding lease. */
		while (lease != NULL) {
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock(). This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior. This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				vmm_lease_t *next = list_next(list, lease);
				/* Reap leases flagged by vmm_drv_lease_break() */
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}

/*
 * Undo a vmm_lease_block().  Once the blocker count drops to zero, waiters
 * (including new lease signers) are awoken.
 */
static void
vmm_lease_unblock(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
	sc->vmm_lease_blocker--;
	if (sc->vmm_lease_blocker == 0) {
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

void
vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
{
	vmm_softc_t *sc = hold->vmh_sc;

	VERIFY3P(hold, ==, lease->vml_hold);
	VERIFY(!lease->vml_break_deferred);

	mutex_enter(&sc->vmm_lease_lock);
	if (sc->vmm_lease_blocker == 0) {
		vmm_lease_break_locked(sc, lease);
	} else {
		/*
		 * Defer the lease-breaking to whichever thread is currently
		 * cleaning up all leases as part of a vmm_lease_block() call.
		 */
		lease->vml_break_deferred = B_TRUE;
		cv_broadcast(&sc->vmm_lease_cv);
	}
	mutex_exit(&sc->vmm_lease_lock);
}

boolean_t
vmm_drv_lease_expired(vmm_lease_t *lease)
{
	return (lease->vml_expired);
}

/*
 * Hold a page at the given guest-physical address for the given protection.
 * 'gpa' must be page-aligned.
 */
vmm_page_t *
vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
{
	ASSERT(lease != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
}

void
vmm_drv_page_release(vmm_page_t *vmmp)
{
	(void) vmp_release((vm_page_t *)vmmp);
}

void
vmm_drv_page_release_chain(vmm_page_t *vmmp)
{
	(void) vmp_release_chain((vm_page_t *)vmmp);
}

const void *
vmm_drv_page_readable(const vmm_page_t *vmmp)
{
	return (vmp_get_readable((const vm_page_t *)vmmp));
}

void *
vmm_drv_page_writable(const vmm_page_t *vmmp)
{
	return (vmp_get_writable((const vm_page_t *)vmmp));
}

void
vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
{
	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
}

vmm_page_t *
vmm_drv_page_next(const vmm_page_t *vmmp)
{
	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
}

/* Inject an MSI into the leased VM's LAPIC. */
int
vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
{
	ASSERT(lease != NULL);

	return (lapic_intr_msi(lease->vml_vm, addr, msg));
}

int
vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
    void *arg, void **cookie)
{
	vmm_softc_t *sc;
	int err;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);

	sc = hold->vmh_sc;
	mutex_enter(&vmm_mtx);
	/* Confirm that hook installation is not blocked */
	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
		mutex_exit(&vmm_mtx);
		return (EBUSY);
	}
	/*
	 * Optimistically record an installed
	 * hook which will prevent a block
	 * from being asserted while the mutex is dropped.
	 */
	hold->vmh_ioport_hook_cnt++;
	mutex_exit(&vmm_mtx);

	vmm_write_lock(sc);
	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
	    arg, cookie);
	vmm_write_unlock(sc);

	if (err != 0) {
		mutex_enter(&vmm_mtx);
		/* Walk back optimism about the hook installation */
		hold->vmh_ioport_hook_cnt--;
		mutex_exit(&vmm_mtx);
	}
	return (err);
}

/*
 * Remove an ioport hook installed via vmm_drv_ioport_hook().
 */
void
vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
{
	vmm_softc_t *sc;

	ASSERT(hold != NULL);
	ASSERT(cookie != NULL);
	ASSERT(hold->vmh_ioport_hook_cnt != 0);

	sc = hold->vmh_sc;
	vmm_write_lock(sc);
	vm_ioport_unhook(sc->vmm_vm, cookie);
	vmm_write_unlock(sc);

	mutex_enter(&vmm_mtx);
	hold->vmh_ioport_hook_cnt--;
	mutex_exit(&vmm_mtx);
}

/*
 * Ask all hold owners to release, break all leases, and wait for the holds to
 * drain.  Returns EINTR if a signal interrupts the wait.  Caller must hold
 * vmm_mtx.
 */
static int
vmm_drv_purge(vmm_softc_t *sc)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	if ((sc->vmm_flags & VMM_HELD) != 0) {
		vmm_hold_t *hold;

		sc->vmm_flags |= VMM_CLEANUP;
		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			hold->vmh_release_req = B_TRUE;
		}

		/*
		 * Require that all leases on the instance be broken, now that
		 * all associated holds have been marked as needing release.
		 *
		 * Dropping vmm_mtx is not strictly necessary, but if any of the
		 * lessees are slow to respond, it would be nice to leave it
		 * available for other parties.
		 */
		mutex_exit(&vmm_mtx);
		vmm_lease_block(sc);
		vmm_lease_unblock(sc);
		mutex_enter(&vmm_mtx);

		/*
		 * With all of the leases broken, we can proceed in an orderly
		 * fashion to waiting for any lingering holds to be dropped.
		 */
		while ((sc->vmm_flags & VMM_HELD) != 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				return (EINTR);
			}
		}
		sc->vmm_flags &= ~VMM_CLEANUP;
	}

	VERIFY(list_is_empty(&sc->vmm_holds));
	sc->vmm_flags |= VMM_PURGED;
	return (0);
}

/*
 * Enable or disable blocking of new ioport hooks.  Enabling fails with EBUSY
 * if any existing hold has hooks installed.
 */
static int
vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
{
	int err = 0;

	mutex_enter(&vmm_mtx);
	if (!enable_block) {
		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);

		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
		goto done;
	}

	/* If any holds have hooks installed, the block is a failure */
	if (!list_is_empty(&sc->vmm_holds)) {
		vmm_hold_t *hold;

		for (hold = list_head(&sc->vmm_holds); hold != NULL;
		    hold = list_next(&sc->vmm_holds, hold)) {
			if (hold->vmh_ioport_hook_cnt != 0) {
				err = EBUSY;
				goto done;
			}
		}
	}
	sc->vmm_flags |= VMM_BLOCK_HOOK;

done:
	mutex_exit(&vmm_mtx);
	return (err);
}

/*
 * Destroy a VM instance.  Caller must hold vmm_mtx.  If the vmm device is
 * still open, final teardown is deferred to vmm_close(); *hma_release tells
 * the caller whether to drop the HMA registration once vmm_mtx is released.
 */
static int
vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd,
    boolean_t *hma_release)
{
	dev_info_t *pdip = ddi_get_parent(vmmdev_dip);
	minor_t minor;

	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = B_FALSE;

	if (vmm_drv_purge(sc) != 0) {
		return (EINTR);
	}

	if (clean_zsd) {
		vmm_zsd_rem_vm(sc);
	}

	/* Clean up devmem entries */
	vmmdev_devmem_purge(sc);

	list_remove(&vmm_list, sc);
	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
	minor = sc->vmm_minor;
	zone_rele(sc->vmm_zone);
	if (sc->vmm_is_open) {
		/* Defer final teardown until the device is closed. */
		list_insert_tail(&vmm_destroy_list, sc);
		sc->vmm_flags |= VMM_DESTROY;
	} else {
		vmm_kstat_fini(sc);
		vm_destroy(sc->vmm_vm);
		ddi_soft_state_free(vmm_statep, minor);
		id_free(vmm_minors, minor);
		*hma_release = B_TRUE;
	}
	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);

	return (0);
}

int
vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd)
{
	boolean_t hma_release = B_FALSE;
	int err;

	mutex_enter(&vmm_mtx);
	err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release);
	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

/* ARGSUSED */
static int
vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
{
	boolean_t hma_release = B_FALSE;
	vmm_softc_t *sc;
	int err;

	/* Only root may destroy instances by name. */
	if (crgetuid(cr) != 0)
		return (EPERM);

	mutex_enter(&vmm_mtx);

	if ((sc = vmm_lookup(req->name)) == NULL) {
		mutex_exit(&vmm_mtx);
		return (ENOENT);
	}
	/*
	 * We don't check this in vmm_lookup() since that function is also used
	 * for validation during create and currently vmm names must be unique.
	 */
	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
		mutex_exit(&vmm_mtx);
		return (EPERM);
	}
	err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release);

	mutex_exit(&vmm_mtx);

	if (hma_release)
		vmm_hma_release();

	return (err);
}

#define	VCPU_NAME_BUFLEN	32

/*
 * Allocate the per-VM and per-vCPU kstats for an instance in the zone of the
 * creating credential.  Returns 0 on success or -1 on failure, with any
 * partial allocations cleaned up.
 */
static int
vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	int instance = minor;
	kstat_t *ksp;

	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);

	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		return (-1);
	}
	sc->vmm_kstat_vm = ksp;

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		char namebuf[VCPU_NAME_BUFLEN];

		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);

		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
		ksp = kstat_create_zone(VMM_MODULE_NAME,
		    instance, namebuf,
		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
		    0, zid);
		if (ksp == NULL) {
			goto fail;
		}

		sc->vmm_kstat_vcpu[i] = ksp;
	}

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
		for (uint_t i = 0; i < VM_MAXCPU; i++) {
			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
		}
	}

	return (0);

fail:
	/* Unwind any vcpu kstats created before the failure. */
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		if (sc->vmm_kstat_vcpu[i] != NULL) {
			kstat_delete(sc->vmm_kstat_vcpu[i]);
			sc->vmm_kstat_vcpu[i] = NULL;
		} else {
			break;
		}
	}
	kstat_delete(sc->vmm_kstat_vm);
	sc->vmm_kstat_vm = NULL;
	return (-1);
}

/*
 * Populate and install the kstats allocated by vmm_kstat_alloc().
 */
static void
vmm_kstat_init(vmm_softc_t *sc)
{
	kstat_t *ksp;

	ASSERT3P(sc->vmm_vm, !=, NULL);
	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);

	ksp = sc->vmm_kstat_vm;
	vmm_kstats_t *vk = ksp->ks_data;
	ksp->ks_private = sc->vmm_vm;
	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
	kstat_named_setstr(&vk->vk_name, sc->vmm_name);

	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);

		ksp = sc->vmm_kstat_vcpu[i];
		vmm_vcpu_kstats_t *vvk = ksp->ks_data;

		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
		vvk->vvk_vcpu.value.ui32 = i;
		kstat_named_init(&vvk->vvk_time_init, "time_init",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_run, "time_run",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
		    KSTAT_DATA_UINT64);
2493 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2494 KSTAT_DATA_UINT64); 2495 ksp->ks_private = sc->vmm_vm; 2496 ksp->ks_update = vmm_kstat_update_vcpu; 2497 } 2498 2499 kstat_install(sc->vmm_kstat_vm); 2500 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2501 kstat_install(sc->vmm_kstat_vcpu[i]); 2502 } 2503 } 2504 2505 static void 2506 vmm_kstat_fini(vmm_softc_t *sc) 2507 { 2508 ASSERT(sc->vmm_kstat_vm != NULL); 2509 2510 kstat_delete(sc->vmm_kstat_vm); 2511 sc->vmm_kstat_vm = NULL; 2512 2513 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2514 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2515 2516 kstat_delete(sc->vmm_kstat_vcpu[i]); 2517 sc->vmm_kstat_vcpu[i] = NULL; 2518 } 2519 } 2520 2521 static int 2522 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2523 { 2524 minor_t minor; 2525 vmm_softc_t *sc; 2526 2527 /* 2528 * Forbid running bhyve in a 32-bit process until it has been tested and 2529 * verified to be safe. 2530 */ 2531 if (curproc->p_model != DATAMODEL_LP64) { 2532 return (EFBIG); 2533 } 2534 2535 minor = getminor(*devp); 2536 if (minor == VMM_CTL_MINOR) { 2537 /* 2538 * Master control device must be opened exclusively. 
2539 */ 2540 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2541 return (EINVAL); 2542 } 2543 2544 return (0); 2545 } 2546 2547 mutex_enter(&vmm_mtx); 2548 sc = ddi_get_soft_state(vmm_statep, minor); 2549 if (sc == NULL) { 2550 mutex_exit(&vmm_mtx); 2551 return (ENXIO); 2552 } 2553 2554 sc->vmm_is_open = B_TRUE; 2555 mutex_exit(&vmm_mtx); 2556 2557 return (0); 2558 } 2559 2560 static int 2561 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2562 { 2563 minor_t minor; 2564 vmm_softc_t *sc; 2565 boolean_t hma_release = B_FALSE; 2566 2567 minor = getminor(dev); 2568 if (minor == VMM_CTL_MINOR) 2569 return (0); 2570 2571 mutex_enter(&vmm_mtx); 2572 sc = ddi_get_soft_state(vmm_statep, minor); 2573 if (sc == NULL) { 2574 mutex_exit(&vmm_mtx); 2575 return (ENXIO); 2576 } 2577 2578 VERIFY(sc->vmm_is_open); 2579 sc->vmm_is_open = B_FALSE; 2580 2581 /* 2582 * If this VM was destroyed while the vmm device was open, then 2583 * clean it up now that it is closed. 2584 */ 2585 if (sc->vmm_flags & VMM_DESTROY) { 2586 list_remove(&vmm_destroy_list, sc); 2587 vmm_kstat_fini(sc); 2588 vm_destroy(sc->vmm_vm); 2589 ddi_soft_state_free(vmm_statep, minor); 2590 id_free(vmm_minors, minor); 2591 hma_release = B_TRUE; 2592 } 2593 mutex_exit(&vmm_mtx); 2594 2595 if (hma_release) 2596 vmm_hma_release(); 2597 2598 return (0); 2599 } 2600 2601 static int 2602 vmm_is_supported(intptr_t arg) 2603 { 2604 int r; 2605 const char *msg; 2606 2607 if (vmm_is_intel()) { 2608 r = vmx_x86_supported(&msg); 2609 } else if (vmm_is_svm()) { 2610 /* 2611 * HMA already ensured that the features necessary for SVM 2612 * operation were present and online during vmm_attach(). 
2613 */ 2614 r = 0; 2615 } else { 2616 r = ENXIO; 2617 msg = "Unsupported CPU vendor"; 2618 } 2619 2620 if (r != 0 && arg != (intptr_t)NULL) { 2621 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2622 return (EFAULT); 2623 } 2624 return (r); 2625 } 2626 2627 static int 2628 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2629 { 2630 void *argp = (void *)arg; 2631 2632 switch (cmd) { 2633 case VMM_CREATE_VM: { 2634 struct vm_create_req req; 2635 2636 if ((md & FWRITE) == 0) { 2637 return (EPERM); 2638 } 2639 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2640 return (EFAULT); 2641 } 2642 return (vmmdev_do_vm_create(&req, cr)); 2643 } 2644 case VMM_DESTROY_VM: { 2645 struct vm_destroy_req req; 2646 2647 if ((md & FWRITE) == 0) { 2648 return (EPERM); 2649 } 2650 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2651 return (EFAULT); 2652 } 2653 return (vmmdev_do_vm_destroy(&req, cr)); 2654 } 2655 case VMM_VM_SUPPORTED: 2656 return (vmm_is_supported(arg)); 2657 case VMM_INTERFACE_VERSION: 2658 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 2659 return (0); 2660 case VMM_CHECK_IOMMU: 2661 if (!vmm_check_iommu()) { 2662 return (ENXIO); 2663 } 2664 return (0); 2665 case VMM_RESV_QUERY: 2666 case VMM_RESV_ADD: 2667 case VMM_RESV_REMOVE: 2668 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2669 default: 2670 break; 2671 } 2672 /* No other actions are legal on ctl device */ 2673 return (ENOTTY); 2674 } 2675 2676 static int 2677 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 2678 int *rvalp) 2679 { 2680 vmm_softc_t *sc; 2681 minor_t minor; 2682 2683 /* 2684 * Forbid running bhyve in a 32-bit process until it has been tested and 2685 * verified to be safe. 
2686 */ 2687 if (curproc->p_model != DATAMODEL_LP64) { 2688 return (EFBIG); 2689 } 2690 2691 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 2692 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 2693 return (ENOTSUP); 2694 } 2695 2696 minor = getminor(dev); 2697 2698 if (minor == VMM_CTL_MINOR) { 2699 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 2700 } 2701 2702 sc = ddi_get_soft_state(vmm_statep, minor); 2703 ASSERT(sc); 2704 2705 if (sc->vmm_flags & VMM_DESTROY) 2706 return (ENXIO); 2707 2708 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 2709 } 2710 2711 static int 2712 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 2713 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 2714 { 2715 vmm_softc_t *sc; 2716 const minor_t minor = getminor(dev); 2717 int err; 2718 2719 if (minor == VMM_CTL_MINOR) { 2720 return (ENODEV); 2721 } 2722 if (off < 0 || (off + len) <= 0) { 2723 return (EINVAL); 2724 } 2725 if ((prot & PROT_USER) == 0) { 2726 return (EACCES); 2727 } 2728 2729 sc = ddi_get_soft_state(vmm_statep, minor); 2730 ASSERT(sc); 2731 2732 if (sc->vmm_flags & VMM_DESTROY) 2733 return (ENXIO); 2734 2735 /* Grab read lock on the VM to prevent any changes to the memory map */ 2736 vmm_read_lock(sc); 2737 2738 if (off >= VM_DEVMEM_START) { 2739 int segid; 2740 off_t segoff; 2741 2742 /* Mapping a devmem "device" */ 2743 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 2744 err = ENODEV; 2745 } else { 2746 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 2747 addrp, prot, maxprot, flags); 2748 } 2749 } else { 2750 /* Mapping a part of the guest physical space */ 2751 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 2752 maxprot, flags); 2753 } 2754 2755 vmm_read_unlock(sc); 2756 return (err); 2757 } 2758 2759 static sdev_plugin_validate_t 2760 vmm_sdev_validate(sdev_ctx_t ctx) 2761 { 2762 const char *name = sdev_ctx_name(ctx); 2763 vmm_softc_t 
*sc; 2764 sdev_plugin_validate_t ret; 2765 minor_t minor; 2766 2767 if (sdev_ctx_vtype(ctx) != VCHR) 2768 return (SDEV_VTOR_INVALID); 2769 2770 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 2771 2772 mutex_enter(&vmm_mtx); 2773 if ((sc = vmm_lookup(name)) == NULL) 2774 ret = SDEV_VTOR_INVALID; 2775 else if (sc->vmm_minor != minor) 2776 ret = SDEV_VTOR_STALE; 2777 else 2778 ret = SDEV_VTOR_VALID; 2779 mutex_exit(&vmm_mtx); 2780 2781 return (ret); 2782 } 2783 2784 static int 2785 vmm_sdev_filldir(sdev_ctx_t ctx) 2786 { 2787 vmm_softc_t *sc; 2788 int ret; 2789 2790 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 2791 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 2792 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 2793 return (EINVAL); 2794 } 2795 2796 mutex_enter(&vmm_mtx); 2797 ASSERT(vmmdev_dip != NULL); 2798 for (sc = list_head(&vmm_list); sc != NULL; 2799 sc = list_next(&vmm_list, sc)) { 2800 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 2801 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 2802 S_IFCHR | 0600, 2803 makedevice(ddi_driver_major(vmmdev_dip), 2804 sc->vmm_minor)); 2805 } else { 2806 continue; 2807 } 2808 if (ret != 0 && ret != EEXIST) 2809 goto out; 2810 } 2811 2812 ret = 0; 2813 2814 out: 2815 mutex_exit(&vmm_mtx); 2816 return (ret); 2817 } 2818 2819 /* ARGSUSED */ 2820 static void 2821 vmm_sdev_inactive(sdev_ctx_t ctx) 2822 { 2823 } 2824 2825 static sdev_plugin_ops_t vmm_sdev_ops = { 2826 .spo_version = SDEV_PLUGIN_VERSION, 2827 .spo_flags = SDEV_PLUGIN_SUBDIR, 2828 .spo_validate = vmm_sdev_validate, 2829 .spo_filldir = vmm_sdev_filldir, 2830 .spo_inactive = vmm_sdev_inactive 2831 }; 2832 2833 /* ARGSUSED */ 2834 static int 2835 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 2836 { 2837 int error; 2838 2839 switch (cmd) { 2840 case DDI_INFO_DEVT2DEVINFO: 2841 *result = (void *)vmmdev_dip; 2842 error = DDI_SUCCESS; 2843 break; 2844 case DDI_INFO_DEVT2INSTANCE: 2845 *result = (void *)0; 2846 error = DDI_SUCCESS; 
2847 break; 2848 default: 2849 error = DDI_FAILURE; 2850 break; 2851 } 2852 return (error); 2853 } 2854 2855 static int 2856 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2857 { 2858 sdev_plugin_hdl_t sph; 2859 hma_reg_t *reg = NULL; 2860 boolean_t vmm_loaded = B_FALSE; 2861 2862 if (cmd != DDI_ATTACH) { 2863 return (DDI_FAILURE); 2864 } 2865 2866 mutex_enter(&vmmdev_mtx); 2867 /* Ensure we are not already attached. */ 2868 if (vmmdev_dip != NULL) { 2869 mutex_exit(&vmmdev_mtx); 2870 return (DDI_FAILURE); 2871 } 2872 2873 vmm_sol_glue_init(); 2874 2875 /* 2876 * Perform temporary HMA registration to determine if the system 2877 * is capable. 2878 */ 2879 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 2880 goto fail; 2881 } else if (vmm_mod_load() != 0) { 2882 goto fail; 2883 } 2884 vmm_loaded = B_TRUE; 2885 hma_unregister(reg); 2886 reg = NULL; 2887 2888 /* Create control node. Other nodes will be created on demand. */ 2889 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 2890 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 2891 goto fail; 2892 } 2893 2894 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 2895 if (sph == (sdev_plugin_hdl_t)NULL) { 2896 ddi_remove_minor_node(dip, NULL); 2897 goto fail; 2898 } 2899 2900 ddi_report_dev(dip); 2901 vmmdev_sdev_hdl = sph; 2902 vmmdev_dip = dip; 2903 mutex_exit(&vmmdev_mtx); 2904 return (DDI_SUCCESS); 2905 2906 fail: 2907 if (vmm_loaded) { 2908 VERIFY0(vmm_mod_unload()); 2909 } 2910 if (reg != NULL) { 2911 hma_unregister(reg); 2912 } 2913 vmm_sol_glue_cleanup(); 2914 mutex_exit(&vmmdev_mtx); 2915 return (DDI_FAILURE); 2916 } 2917 2918 static int 2919 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2920 { 2921 if (cmd != DDI_DETACH) { 2922 return (DDI_FAILURE); 2923 } 2924 2925 /* 2926 * Ensure that all resources have been cleaned up. 2927 * 2928 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 2929 * vmmdev_mtx is already held. 
We can't wait for vmmdev_mtx with our 2930 * devinfo locked as iommu_cleanup() tries to recursively lock each 2931 * devinfo, including our own, while holding vmmdev_mtx. 2932 */ 2933 if (mutex_tryenter(&vmmdev_mtx) == 0) 2934 return (DDI_FAILURE); 2935 2936 mutex_enter(&vmm_mtx); 2937 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { 2938 mutex_exit(&vmm_mtx); 2939 mutex_exit(&vmmdev_mtx); 2940 return (DDI_FAILURE); 2941 } 2942 mutex_exit(&vmm_mtx); 2943 2944 if (!vmmr_is_empty()) { 2945 mutex_exit(&vmmdev_mtx); 2946 return (DDI_FAILURE); 2947 } 2948 2949 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 2950 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 2951 mutex_exit(&vmmdev_mtx); 2952 return (DDI_FAILURE); 2953 } 2954 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 2955 2956 /* Remove the control node. */ 2957 ddi_remove_minor_node(dip, "ctl"); 2958 vmmdev_dip = NULL; 2959 2960 VERIFY0(vmm_mod_unload()); 2961 VERIFY3U(vmmdev_hma_reg, ==, NULL); 2962 vmm_sol_glue_cleanup(); 2963 2964 mutex_exit(&vmmdev_mtx); 2965 2966 return (DDI_SUCCESS); 2967 } 2968 2969 static struct cb_ops vmm_cb_ops = { 2970 vmm_open, 2971 vmm_close, 2972 nodev, /* strategy */ 2973 nodev, /* print */ 2974 nodev, /* dump */ 2975 nodev, /* read */ 2976 nodev, /* write */ 2977 vmm_ioctl, 2978 nodev, /* devmap */ 2979 nodev, /* mmap */ 2980 vmm_segmap, 2981 nochpoll, /* poll */ 2982 ddi_prop_op, 2983 NULL, 2984 D_NEW | D_MP | D_DEVMAP 2985 }; 2986 2987 static struct dev_ops vmm_ops = { 2988 DEVO_REV, 2989 0, 2990 vmm_info, 2991 nulldev, /* identify */ 2992 nulldev, /* probe */ 2993 vmm_attach, 2994 vmm_detach, 2995 nodev, /* reset */ 2996 &vmm_cb_ops, 2997 (struct bus_ops *)NULL 2998 }; 2999 3000 static struct modldrv modldrv = { 3001 &mod_driverops, 3002 "bhyve vmm", 3003 &vmm_ops 3004 }; 3005 3006 static struct modlinkage modlinkage = { 3007 MODREV_1, 3008 &modldrv, 3009 NULL 3010 }; 3011 3012 int 3013 _init(void) 3014 { 3015 int error; 3016 3017 sysinit(); 3018 3019 
mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3020 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3021 list_create(&vmm_list, sizeof (vmm_softc_t), 3022 offsetof(vmm_softc_t, vmm_node)); 3023 list_create(&vmm_destroy_list, sizeof (vmm_softc_t), 3024 offsetof(vmm_softc_t, vmm_node)); 3025 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3026 3027 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3028 if (error) { 3029 return (error); 3030 } 3031 3032 vmm_zsd_init(); 3033 vmmr_init(); 3034 3035 error = mod_install(&modlinkage); 3036 if (error) { 3037 ddi_soft_state_fini(&vmm_statep); 3038 vmm_zsd_fini(); 3039 vmmr_fini(); 3040 } 3041 3042 return (error); 3043 } 3044 3045 int 3046 _fini(void) 3047 { 3048 int error; 3049 3050 error = mod_remove(&modlinkage); 3051 if (error) { 3052 return (error); 3053 } 3054 3055 vmm_zsd_fini(); 3056 vmmr_fini(); 3057 3058 ddi_soft_state_fini(&vmm_statep); 3059 3060 return (0); 3061 } 3062 3063 int 3064 _info(struct modinfo *modinfop) 3065 { 3066 return (mod_info(&modlinkage, modinfop)); 3067 } 3068