/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/ioccom.h>
#include <sys/stat.h>
#include <sys/vmsystm.h>
#include <sys/ddi.h>
#include <sys/mkdev.h>
#include <sys/sunddi.h>
#include <sys/fs/dv_node.h>
#include <sys/cpuset.h>
#include <sys/id_space.h>
#include <sys/fs/sdev_plugin.h>
#include <sys/smt.h>
#include <sys/kstat.h>

#include <sys/kernel.h>
#include <sys/hma.h>
#include <sys/x86_archext.h>
#include <x86/apicreg.h>

#include <sys/vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
#include <sys/vmm_drv.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_reservoir.h>

#include <vm/seg_dev.h>

#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vrtc.h"
#include "io/vhpet.h"
#include "io/vpmtmr.h"
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_util.h"

/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
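 *
 * As an illustrative sketch only (not a real code path in this file), an
 * operation needing both locks is expected to follow that ordering:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	(operate on driver-wide state and the instance list)
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * Taking them in the opposite order risks deadlock against threads which
 * honor the documented ordering.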
70 */ 71 72 static kmutex_t vmmdev_mtx; 73 static dev_info_t *vmmdev_dip; 74 static hma_reg_t *vmmdev_hma_reg; 75 static uint_t vmmdev_hma_ref; 76 static sdev_plugin_hdl_t vmmdev_sdev_hdl; 77 78 static kmutex_t vmm_mtx; 79 static list_t vmm_list; 80 static list_t vmm_destroy_list; 81 static id_space_t *vmm_minors; 82 static void *vmm_statep; 83 84 /* temporary safety switch */ 85 int vmm_allow_state_writes; 86 87 static const char *vmmdev_hvm_name = "bhyve"; 88 89 /* For sdev plugin (/dev) */ 90 #define VMM_SDEV_ROOT "/dev/vmm" 91 92 /* From uts/intel/io/vmm/intel/vmx.c */ 93 extern int vmx_x86_supported(const char **); 94 95 /* Holds and hooks from drivers external to vmm */ 96 struct vmm_hold { 97 list_node_t vmh_node; 98 vmm_softc_t *vmh_sc; 99 boolean_t vmh_release_req; 100 uint_t vmh_ioport_hook_cnt; 101 }; 102 103 struct vmm_lease { 104 list_node_t vml_node; 105 struct vm *vml_vm; 106 vm_client_t *vml_vmclient; 107 boolean_t vml_expired; 108 boolean_t vml_break_deferred; 109 boolean_t (*vml_expire_func)(void *); 110 void *vml_expire_arg; 111 struct vmm_hold *vml_hold; 112 }; 113 114 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); 115 static void vmm_lease_block(vmm_softc_t *); 116 static void vmm_lease_unblock(vmm_softc_t *); 117 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *); 118 static void vmm_kstat_init(vmm_softc_t *); 119 static void vmm_kstat_fini(vmm_softc_t *); 120 121 /* 122 * The 'devmem' hack: 123 * 124 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments 125 * in the vm which appear with their own name related to the vm under /dev. 126 * Since this would be a hassle from an sdev perspective and would require a 127 * new cdev interface (or complicate the existing one), we choose to implement 128 * this in a different manner. Direct access to the underlying vm memory 129 * segments is exposed by placing them in a range of offsets beyond the normal 130 * guest memory space. Userspace can query the appropriate offset to mmap() 131 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl. 132 */ 133 134 static vmm_devmem_entry_t * 135 vmmdev_devmem_find(vmm_softc_t *sc, int segid) 136 { 137 vmm_devmem_entry_t *ent = NULL; 138 list_t *dl = &sc->vmm_devmem_list; 139 140 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) { 141 if (ent->vde_segid == segid) { 142 return (ent); 143 } 144 } 145 return (NULL); 146 } 147 148 static int 149 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 150 { 151 int error; 152 bool sysmem; 153 154 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, 155 NULL); 156 if (error || mseg->len == 0) 157 return (error); 158 159 if (!sysmem) { 160 vmm_devmem_entry_t *de; 161 162 de = vmmdev_devmem_find(sc, mseg->segid); 163 if (de != NULL) { 164 (void) strlcpy(mseg->name, de->vde_name, 165 sizeof (mseg->name)); 166 } 167 } else { 168 bzero(mseg->name, sizeof (mseg->name)); 169 } 170 171 return (error); 172 } 173 174 static int 175 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) 176 { 177 off_t map_offset; 178 vmm_devmem_entry_t *entry; 179 180 if (list_is_empty(&sc->vmm_devmem_list)) { 181 map_offset = VM_DEVMEM_START; 182 } else { 183 entry = list_tail(&sc->vmm_devmem_list); 184 map_offset = entry->vde_off + entry->vde_len; 185 if (map_offset < entry->vde_off) { 186 /* Do not tolerate overflow */ 187 return (ERANGE); 188 } 189 /* 190 * XXXJOY: We could choose to search the list for duplicate 191 * names and toss an error. 
Since we're using the offset 192 * method for now, it does not make much of a difference. 193 */ 194 } 195 196 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); 197 entry->vde_segid = mseg->segid; 198 entry->vde_len = mseg->len; 199 entry->vde_off = map_offset; 200 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); 201 list_insert_tail(&sc->vmm_devmem_list, entry); 202 203 return (0); 204 } 205 206 static boolean_t 207 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp, 208 off_t *map_offp) 209 { 210 list_t *dl = &sc->vmm_devmem_list; 211 vmm_devmem_entry_t *de = NULL; 212 const off_t map_end = off + len; 213 214 VERIFY(off >= VM_DEVMEM_START); 215 216 if (map_end < off) { 217 /* No match on overflow */ 218 return (B_FALSE); 219 } 220 221 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { 222 const off_t item_end = de->vde_off + de->vde_len; 223 224 if (de->vde_off <= off && item_end >= map_end) { 225 *segidp = de->vde_segid; 226 *map_offp = off - de->vde_off; 227 return (B_TRUE); 228 } 229 } 230 return (B_FALSE); 231 } 232 233 static void 234 vmmdev_devmem_purge(vmm_softc_t *sc) 235 { 236 vmm_devmem_entry_t *entry; 237 238 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { 239 kmem_free(entry, sizeof (*entry)); 240 } 241 } 242 243 static int 244 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) 245 { 246 int error; 247 bool sysmem = true; 248 249 if (VM_MEMSEG_NAME(mseg)) { 250 sysmem = false; 251 } 252 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); 253 254 if (error == 0) { 255 /* 256 * Rather than create a whole fresh device from which userspace 257 * can mmap this segment, instead make it available at an 258 * offset above where the main guest memory resides. 259 */ 260 error = vmmdev_devmem_create(sc, mseg, mseg->name); 261 if (error != 0) { 262 vm_free_memseg(sc->vmm_vm, mseg->segid); 263 } 264 } 265 return (error); 266 } 267 268 /* 269 * Resource Locking and Exclusion 270 * 271 * Much of bhyve depends on key portions of VM state, such as the guest memory 272 * map, to remain unchanged while the guest is running. As ported from 273 * FreeBSD, the initial strategy for this resource exclusion hinged on gating 274 * access to the instance vCPUs. Threads acting on a single vCPU, like those 275 * performing the work of actually running the guest in VMX/SVM, would lock 276 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide 277 * state, all of the vCPUs would be first locked, ensuring that the 278 * operation(s) could complete without any other threads stumbling into 279 * intermediate states. 280 * 281 * This approach is largely effective for bhyve. Common operations, such as 282 * running the vCPUs, steer clear of lock contention. The model begins to 283 * break down for operations which do not occur in the context of a specific 284 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker 285 * thread in the bhyve process. In order to properly protect those vCPU-less 286 * operations from encountering invalid states, additional locking is required. 287 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. 288 * It does mean that class of operations will be serialized on locking the 289 * specific vCPU and that instances sized at VM_MAXCPU will potentially see 290 * undue contention on the VM_MAXCPU-1 vCPU. 291 * 292 * In order to address the shortcomings of this model, the concept of a 293 * read/write lock has been added to bhyve. 
Operations which change 294 * fundamental aspects of a VM (such as the memory map) must acquire the write 295 * lock, which also implies locking all of the vCPUs and waiting for all read 296 * lock holders to release. While it increases the cost and waiting time for 297 * those few operations, it allows most hot-path operations on the VM (which 298 * depend on its configuration remaining stable) to occur with minimal locking. 299 * 300 * Consumers of the Driver API (see below) are a special case when it comes to 301 * this locking, since they may hold a read lock via the drv_lease mechanism 302 * for an extended period of time. Rather than forcing those consumers to 303 * continuously poll for a write lock attempt, the lease system forces them to 304 * provide a release callback to trigger their clean-up (and potential later 305 * reacquisition) of the read lock. 306 */ 307 308 static void 309 vcpu_lock_one(vmm_softc_t *sc, int vcpu) 310 { 311 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 312 313 /* 314 * Since this state transition is utilizing from_idle=true, it should 315 * not fail, but rather block until it can be successful. 316 */ 317 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); 318 } 319 320 static void 321 vcpu_unlock_one(vmm_softc_t *sc, int vcpu) 322 { 323 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); 324 325 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); 326 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false)); 327 } 328 329 static void 330 vmm_read_lock(vmm_softc_t *sc) 331 { 332 rw_enter(&sc->vmm_rwlock, RW_READER); 333 } 334 335 static void 336 vmm_read_unlock(vmm_softc_t *sc) 337 { 338 rw_exit(&sc->vmm_rwlock); 339 } 340 341 static void 342 vmm_write_lock(vmm_softc_t *sc) 343 { 344 int maxcpus; 345 346 /* First lock all the vCPUs */ 347 maxcpus = vm_get_maxcpus(sc->vmm_vm); 348 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 349 vcpu_lock_one(sc, vcpu); 350 } 351 352 /* 353 * Block vmm_drv leases from being acquired or held while the VM write 354 * lock is held. 355 */ 356 vmm_lease_block(sc); 357 358 rw_enter(&sc->vmm_rwlock, RW_WRITER); 359 /* 360 * For now, the 'maxcpus' value for an instance is fixed at the 361 * compile-time constant of VM_MAXCPU at creation. If this changes in 362 * the future, allowing for dynamic vCPU resource sizing, acquisition 363 * of the write lock will need to be wary of such changes. 364 */ 365 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); 366 } 367 368 static void 369 vmm_write_unlock(vmm_softc_t *sc) 370 { 371 int maxcpus; 372 373 /* Allow vmm_drv leases to be acquired once write lock is dropped */ 374 vmm_lease_unblock(sc); 375 376 /* 377 * The VM write lock _must_ be released from the same thread it was 378 * acquired in, unlike the read lock. 379 */ 380 VERIFY(rw_write_held(&sc->vmm_rwlock)); 381 rw_exit(&sc->vmm_rwlock); 382 383 /* Unlock all the vCPUs */ 384 maxcpus = vm_get_maxcpus(sc->vmm_vm); 385 for (int vcpu = 0; vcpu < maxcpus; vcpu++) { 386 vcpu_unlock_one(sc, vcpu); 387 } 388 } 389 390 static int 391 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, 392 cred_t *credp, int *rvalp) 393 { 394 int error = 0, vcpu = -1; 395 void *datap = (void *)arg; 396 enum vm_lock_type { 397 LOCK_NONE = 0, 398 LOCK_VCPU, 399 LOCK_READ_HOLD, 400 LOCK_WRITE_HOLD 401 } lock_type = LOCK_NONE; 402 403 /* Acquire any exclusion resources needed for the operation. 
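	 *
	 * The vCPU-scoped commands below rely on the convention that the
	 * target vCPU ID is the leading int of the request structure, which
	 * is why it can be copied in here ahead of the full ioctl payload.
	 * A hedged userspace-side sketch of that convention (field names as
	 * assumed from sys/vmm_dev.h; 'vmfd' and 'vcpuid' are assumed to
	 * already exist):
	 *
	 *	struct vm_register vmreg = { 0 };
	 *
	 *	vmreg.cpuid = vcpuid;
	 *	vmreg.regnum = VM_REG_GUEST_RIP;
	 *	if (ioctl(vmfd, VM_GET_REGISTER, &vmreg) == 0)
	 *		(void) printf("rip = %lx\n",
	 *		    (unsigned long)vmreg.regval);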
*/ 404 switch (cmd) { 405 case VM_RUN: 406 case VM_GET_REGISTER: 407 case VM_SET_REGISTER: 408 case VM_GET_SEGMENT_DESCRIPTOR: 409 case VM_SET_SEGMENT_DESCRIPTOR: 410 case VM_GET_REGISTER_SET: 411 case VM_SET_REGISTER_SET: 412 case VM_INJECT_EXCEPTION: 413 case VM_GET_CAPABILITY: 414 case VM_SET_CAPABILITY: 415 case VM_PPTDEV_MSI: 416 case VM_PPTDEV_MSIX: 417 case VM_SET_X2APIC_STATE: 418 case VM_GLA2GPA: 419 case VM_GLA2GPA_NOFAULT: 420 case VM_ACTIVATE_CPU: 421 case VM_SET_INTINFO: 422 case VM_GET_INTINFO: 423 case VM_RESTART_INSTRUCTION: 424 case VM_SET_KERNEMU_DEV: 425 case VM_GET_KERNEMU_DEV: 426 case VM_RESET_CPU: 427 case VM_GET_RUN_STATE: 428 case VM_SET_RUN_STATE: 429 case VM_GET_FPU: 430 case VM_SET_FPU: 431 /* 432 * Copy in the ID of the vCPU chosen for this operation. 433 * Since a nefarious caller could update their struct between 434 * this locking and when the rest of the ioctl data is copied 435 * in, it is _critical_ that this local 'vcpu' variable be used 436 * rather than the in-struct one when performing the ioctl. 437 */ 438 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 439 return (EFAULT); 440 } 441 if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { 442 return (EINVAL); 443 } 444 vcpu_lock_one(sc, vcpu); 445 lock_type = LOCK_VCPU; 446 break; 447 448 case VM_REINIT: 449 case VM_BIND_PPTDEV: 450 case VM_UNBIND_PPTDEV: 451 case VM_MAP_PPTDEV_MMIO: 452 case VM_UNMAP_PPTDEV_MMIO: 453 case VM_ALLOC_MEMSEG: 454 case VM_MMAP_MEMSEG: 455 case VM_MUNMAP_MEMSEG: 456 case VM_WRLOCK_CYCLE: 457 case VM_PMTMR_LOCATE: 458 vmm_write_lock(sc); 459 lock_type = LOCK_WRITE_HOLD; 460 break; 461 462 case VM_GET_MEMSEG: 463 case VM_MMAP_GETNEXT: 464 case VM_LAPIC_IRQ: 465 case VM_INJECT_NMI: 466 case VM_IOAPIC_ASSERT_IRQ: 467 case VM_IOAPIC_DEASSERT_IRQ: 468 case VM_IOAPIC_PULSE_IRQ: 469 case VM_LAPIC_MSI: 470 case VM_LAPIC_LOCAL_IRQ: 471 case VM_GET_X2APIC_STATE: 472 case VM_RTC_READ: 473 case VM_RTC_WRITE: 474 case VM_RTC_SETTIME: 475 case VM_RTC_GETTIME: 476 case VM_PPTDEV_DISABLE_MSIX: 477 case VM_DEVMEM_GETOFFSET: 478 case VM_TRACK_DIRTY_PAGES: 479 vmm_read_lock(sc); 480 lock_type = LOCK_READ_HOLD; 481 break; 482 483 case VM_DATA_READ: 484 case VM_DATA_WRITE: 485 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 486 return (EFAULT); 487 } 488 if (vcpu == -1) { 489 /* Access data for VM-wide devices */ 490 vmm_write_lock(sc); 491 lock_type = LOCK_WRITE_HOLD; 492 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) { 493 /* Access data associated with a specific vCPU */ 494 vcpu_lock_one(sc, vcpu); 495 lock_type = LOCK_VCPU; 496 } else { 497 return (EINVAL); 498 } 499 break; 500 501 case VM_GET_GPA_PMAP: 502 case VM_IOAPIC_PINCOUNT: 503 case VM_SUSPEND: 504 case VM_DESC_FPU_AREA: 505 default: 506 break; 507 } 508 509 /* Execute the primary logic for the ioctl. */ 510 switch (cmd) { 511 case VM_RUN: { 512 struct vm_entry entry; 513 514 if (ddi_copyin(datap, &entry, sizeof (entry), md)) { 515 error = EFAULT; 516 break; 517 } 518 519 if (!(curthread->t_schedflag & TS_VCPU)) 520 smt_mark_as_vcpu(); 521 522 error = vm_run(sc->vmm_vm, vcpu, &entry); 523 524 /* 525 * Unexpected states in vm_run() are expressed through positive 526 * errno-oriented return values. VM states which expect further 527 * processing in userspace (necessary context via exitinfo) are 528 * expressed through negative return values. For the time being 529 * a return value of 0 is not expected from vm_run(). 
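		 *
		 * A minimal userspace run loop built on this convention might
		 * look like the sketch below: a successful ioctl() means an
		 * exit was copied out through exit_data, while a failing
		 * ioctl() surfaces the positive errno.  (Hedged: 'vmfd',
		 * 'vcpuid', and handle_exit() are assumed, and the entry
		 * fields are as recalled from sys/vmm_dev.h.)
		 *
		 *	struct vm_exit vexit = { 0 };
		 *	struct vm_entry entry = { 0 };
		 *
		 *	entry.cpuid = vcpuid;
		 *	entry.cmd = VEC_DEFAULT;
		 *	entry.exit_data = &vexit;
		 *	for (;;) {
		 *		if (ioctl(vmfd, VM_RUN, &entry) != 0)
		 *			break;
		 *		handle_exit(&vexit, &entry);
		 *	}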
530 */ 531 ASSERT(error != 0); 532 if (error < 0) { 533 const struct vm_exit *vme; 534 void *outp = entry.exit_data; 535 536 error = 0; 537 vme = vm_exitinfo(sc->vmm_vm, vcpu); 538 if (ddi_copyout(vme, outp, sizeof (*vme), md)) { 539 error = EFAULT; 540 } 541 } 542 break; 543 } 544 case VM_SUSPEND: { 545 struct vm_suspend vmsuspend; 546 547 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { 548 error = EFAULT; 549 break; 550 } 551 error = vm_suspend(sc->vmm_vm, vmsuspend.how); 552 break; 553 } 554 case VM_REINIT: { 555 struct vm_reinit reinit; 556 557 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) { 558 error = EFAULT; 559 break; 560 } 561 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { 562 /* 563 * The VM instance should be free of driver-attached 564 * hooks during the reinitialization process. 565 */ 566 break; 567 } 568 error = vm_reinit(sc->vmm_vm, reinit.flags); 569 (void) vmm_drv_block_hook(sc, B_FALSE); 570 break; 571 } 572 case VM_STAT_DESC: { 573 struct vm_stat_desc statdesc; 574 575 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { 576 error = EFAULT; 577 break; 578 } 579 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, 580 sizeof (statdesc.desc)); 581 if (error == 0 && 582 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { 583 error = EFAULT; 584 break; 585 } 586 break; 587 } 588 case VM_STATS_IOC: { 589 struct vm_stats vmstats; 590 591 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { 592 error = EFAULT; 593 break; 594 } 595 hrt2tv(gethrtime(), &vmstats.tv); 596 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index, 597 nitems(vmstats.statbuf), 598 &vmstats.num_entries, vmstats.statbuf); 599 if (error == 0 && 600 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { 601 error = EFAULT; 602 break; 603 } 604 break; 605 } 606 607 case VM_PPTDEV_MSI: { 608 struct vm_pptdev_msi pptmsi; 609 610 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { 611 error = EFAULT; 612 break; 613 } 614 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd, 615 pptmsi.addr, pptmsi.msg, pptmsi.numvec); 616 break; 617 } 618 case VM_PPTDEV_MSIX: { 619 struct vm_pptdev_msix pptmsix; 620 621 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { 622 error = EFAULT; 623 break; 624 } 625 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd, 626 pptmsix.idx, pptmsix.addr, pptmsix.msg, 627 pptmsix.vector_control); 628 break; 629 } 630 case VM_PPTDEV_DISABLE_MSIX: { 631 struct vm_pptdev pptdev; 632 633 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 634 error = EFAULT; 635 break; 636 } 637 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); 638 break; 639 } 640 case VM_MAP_PPTDEV_MMIO: { 641 struct vm_pptdev_mmio pptmmio; 642 643 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 644 error = EFAULT; 645 break; 646 } 647 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 648 pptmmio.len, pptmmio.hpa); 649 break; 650 } 651 case VM_UNMAP_PPTDEV_MMIO: { 652 struct vm_pptdev_mmio pptmmio; 653 654 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { 655 error = EFAULT; 656 break; 657 } 658 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa, 659 pptmmio.len); 660 break; 661 } 662 case VM_BIND_PPTDEV: { 663 struct vm_pptdev pptdev; 664 665 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { 666 error = EFAULT; 667 break; 668 } 669 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd); 670 break; 671 } 672 case VM_UNBIND_PPTDEV: { 673 struct vm_pptdev pptdev; 674 675 if (ddi_copyin(datap, &pptdev, 
sizeof (pptdev), md)) { 676 error = EFAULT; 677 break; 678 } 679 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd); 680 break; 681 } 682 case VM_GET_PPTDEV_LIMITS: { 683 struct vm_pptdev_limits pptlimits; 684 685 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) { 686 error = EFAULT; 687 break; 688 } 689 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd, 690 &pptlimits.msi_limit, &pptlimits.msix_limit); 691 if (error == 0 && 692 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) { 693 error = EFAULT; 694 break; 695 } 696 break; 697 } 698 case VM_INJECT_EXCEPTION: { 699 struct vm_exception vmexc; 700 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { 701 error = EFAULT; 702 break; 703 } 704 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, 705 vmexc.error_code_valid != 0, vmexc.error_code, 706 vmexc.restart_instruction != 0); 707 break; 708 } 709 case VM_INJECT_NMI: { 710 struct vm_nmi vmnmi; 711 712 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { 713 error = EFAULT; 714 break; 715 } 716 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); 717 break; 718 } 719 case VM_LAPIC_IRQ: { 720 struct vm_lapic_irq vmirq; 721 722 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 723 error = EFAULT; 724 break; 725 } 726 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); 727 break; 728 } 729 case VM_LAPIC_LOCAL_IRQ: { 730 struct vm_lapic_irq vmirq; 731 732 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { 733 error = EFAULT; 734 break; 735 } 736 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, 737 vmirq.vector); 738 break; 739 } 740 case VM_LAPIC_MSI: { 741 struct vm_lapic_msi vmmsi; 742 743 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { 744 error = EFAULT; 745 break; 746 } 747 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); 748 break; 749 } 750 751 case VM_IOAPIC_ASSERT_IRQ: { 752 struct vm_ioapic_irq ioapic_irq; 753 754 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 755 error = EFAULT; 756 break; 757 } 758 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); 759 break; 760 } 761 case VM_IOAPIC_DEASSERT_IRQ: { 762 struct vm_ioapic_irq ioapic_irq; 763 764 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 765 error = EFAULT; 766 break; 767 } 768 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); 769 break; 770 } 771 case VM_IOAPIC_PULSE_IRQ: { 772 struct vm_ioapic_irq ioapic_irq; 773 774 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { 775 error = EFAULT; 776 break; 777 } 778 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); 779 break; 780 } 781 case VM_IOAPIC_PINCOUNT: { 782 int pincount; 783 784 pincount = vioapic_pincount(sc->vmm_vm); 785 if (ddi_copyout(&pincount, datap, sizeof (int), md)) { 786 error = EFAULT; 787 break; 788 } 789 break; 790 } 791 case VM_DESC_FPU_AREA: { 792 struct vm_fpu_desc desc; 793 void *buf = NULL; 794 795 if (ddi_copyin(datap, &desc, sizeof (desc), md)) { 796 error = EFAULT; 797 break; 798 } 799 if (desc.vfd_num_entries > 64) { 800 error = EINVAL; 801 break; 802 } 803 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * 804 desc.vfd_num_entries; 805 if (buf_sz != 0) { 806 buf = kmem_zalloc(buf_sz, KM_SLEEP); 807 } 808 809 /* 810 * For now, we are depending on vm_fpu_desc_entry and 811 * hma_xsave_state_desc_t having the same format. 
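		 *
		 * The copyout below also permits a two-pass pattern from
		 * userspace: a first VM_DESC_FPU_AREA call with
		 * vfd_num_entries set to zero reports the entry count and
		 * vfd_req_size, after which a second call can supply a
		 * suitably sized vfd_entry_data buffer.  A hedged sketch of
		 * that usage ('vmfd' is assumed):
		 *
		 *	struct vm_fpu_desc desc = { 0 };
		 *
		 *	(void) ioctl(vmfd, VM_DESC_FPU_AREA, &desc);
		 *	desc.vfd_entry_data = calloc(desc.vfd_num_entries,
		 *	    sizeof (struct vm_fpu_desc_entry));
		 *	(void) ioctl(vmfd, VM_DESC_FPU_AREA, &desc);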
812 */ 813 CTASSERT(sizeof (struct vm_fpu_desc_entry) == 814 sizeof (hma_xsave_state_desc_t)); 815 816 size_t req_size; 817 const uint_t max_entries = hma_fpu_describe_xsave_state( 818 (hma_xsave_state_desc_t *)buf, 819 desc.vfd_num_entries, 820 &req_size); 821 822 desc.vfd_req_size = req_size; 823 desc.vfd_num_entries = max_entries; 824 if (buf_sz != 0) { 825 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { 826 error = EFAULT; 827 } 828 kmem_free(buf, buf_sz); 829 } 830 831 if (error == 0) { 832 if (ddi_copyout(&desc, datap, sizeof (desc), md)) { 833 error = EFAULT; 834 } 835 } 836 break; 837 } 838 839 case VM_ISA_ASSERT_IRQ: { 840 struct vm_isa_irq isa_irq; 841 842 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 843 error = EFAULT; 844 break; 845 } 846 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); 847 if (error == 0 && isa_irq.ioapic_irq != -1) { 848 error = vioapic_assert_irq(sc->vmm_vm, 849 isa_irq.ioapic_irq); 850 } 851 break; 852 } 853 case VM_ISA_DEASSERT_IRQ: { 854 struct vm_isa_irq isa_irq; 855 856 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 857 error = EFAULT; 858 break; 859 } 860 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); 861 if (error == 0 && isa_irq.ioapic_irq != -1) { 862 error = vioapic_deassert_irq(sc->vmm_vm, 863 isa_irq.ioapic_irq); 864 } 865 break; 866 } 867 case VM_ISA_PULSE_IRQ: { 868 struct vm_isa_irq isa_irq; 869 870 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { 871 error = EFAULT; 872 break; 873 } 874 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); 875 if (error == 0 && isa_irq.ioapic_irq != -1) { 876 error = vioapic_pulse_irq(sc->vmm_vm, 877 isa_irq.ioapic_irq); 878 } 879 break; 880 } 881 case VM_ISA_SET_IRQ_TRIGGER: { 882 struct vm_isa_irq_trigger isa_irq_trigger; 883 884 if (ddi_copyin(datap, &isa_irq_trigger, 885 sizeof (isa_irq_trigger), md)) { 886 error = EFAULT; 887 break; 888 } 889 error = vatpic_set_irq_trigger(sc->vmm_vm, 890 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); 891 break; 892 } 893 894 case VM_MMAP_GETNEXT: { 895 struct vm_memmap mm; 896 897 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 898 error = EFAULT; 899 break; 900 } 901 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, 902 &mm.segoff, &mm.len, &mm.prot, &mm.flags); 903 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { 904 error = EFAULT; 905 break; 906 } 907 break; 908 } 909 case VM_MMAP_MEMSEG: { 910 struct vm_memmap mm; 911 912 if (ddi_copyin(datap, &mm, sizeof (mm), md)) { 913 error = EFAULT; 914 break; 915 } 916 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, 917 mm.len, mm.prot, mm.flags); 918 break; 919 } 920 case VM_MUNMAP_MEMSEG: { 921 struct vm_munmap mu; 922 923 if (ddi_copyin(datap, &mu, sizeof (mu), md)) { 924 error = EFAULT; 925 break; 926 } 927 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len); 928 break; 929 } 930 case VM_ALLOC_MEMSEG: { 931 struct vm_memseg vmseg; 932 933 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 934 error = EFAULT; 935 break; 936 } 937 error = vmmdev_alloc_memseg(sc, &vmseg); 938 break; 939 } 940 case VM_GET_MEMSEG: { 941 struct vm_memseg vmseg; 942 943 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { 944 error = EFAULT; 945 break; 946 } 947 error = vmmdev_get_memseg(sc, &vmseg); 948 if (error == 0 && 949 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { 950 error = EFAULT; 951 break; 952 } 953 break; 954 } 955 case VM_GET_REGISTER: { 956 struct vm_register vmreg; 957 958 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), 
	    md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    &vmreg.regval);
		if (error == 0 &&
		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_SET_REGISTER: {
		struct vm_register vmreg;

		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
		    vmreg.regval);
		break;
	}
	case VM_SET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		break;
	}
	case VM_GET_SEGMENT_DESCRIPTOR: {
		struct vm_seg_desc vmsegd;

		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
		    &vmsegd.desc);
		if (error == 0 &&
		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
			error = EFAULT;
			break;
		}
		break;
	}
	case VM_GET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			if (regnums[i] < 0) {
				error = EINVAL;
				break;
			}
			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
			    &regvals[i]);
		}
		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
		}
		break;
	}
	case VM_SET_REGISTER_SET: {
		struct vm_register_set vrs;
		int regnums[VM_REG_LAST];
		uint64_t regvals[VM_REG_LAST];

		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
			error = EFAULT;
			break;
		}
		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
			error = EINVAL;
			break;
		}
		if (ddi_copyin(vrs.regnums, regnums,
		    sizeof (int) * vrs.count, md)) {
			error = EFAULT;
			break;
		}
		if (ddi_copyin(vrs.regvals, regvals,
		    sizeof (uint64_t) * vrs.count, md)) {
			error = EFAULT;
			break;
		}

		error = 0;
		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
			/*
			 * Setting registers in a set is not atomic, since a
			 * failure in the middle of the set will cause a
			 * bail-out and inconsistent register state. Callers
			 * should be wary of this.
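			 *
			 * A hedged userspace sketch (names as assumed from
			 * sys/vmm_dev.h): on a failed VM_SET_REGISTER_SET, a
			 * caller should re-read via VM_GET_REGISTER_SET
			 * before trusting any of the submitted values.
			 *
			 *	vrs.cpuid = vcpuid;
			 *	vrs.count = nregs;
			 *	vrs.regnums = regnums;
			 *	vrs.regvals = regvals;
			 *	if (ioctl(vmfd, VM_SET_REGISTER_SET,
			 *	    &vrs) != 0)
			 *		(void) ioctl(vmfd,
			 *		    VM_GET_REGISTER_SET, &vrs);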
1074 */ 1075 if (regnums[i] < 0) { 1076 error = EINVAL; 1077 break; 1078 } 1079 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], 1080 regvals[i]); 1081 } 1082 break; 1083 } 1084 case VM_RESET_CPU: { 1085 struct vm_vcpu_reset vvr; 1086 1087 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { 1088 error = EFAULT; 1089 break; 1090 } 1091 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { 1092 error = EINVAL; 1093 } 1094 1095 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); 1096 break; 1097 } 1098 case VM_GET_RUN_STATE: { 1099 struct vm_run_state vrs; 1100 1101 bzero(&vrs, sizeof (vrs)); 1102 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, 1103 &vrs.sipi_vector); 1104 if (error == 0) { 1105 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { 1106 error = EFAULT; 1107 break; 1108 } 1109 } 1110 break; 1111 } 1112 case VM_SET_RUN_STATE: { 1113 struct vm_run_state vrs; 1114 1115 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { 1116 error = EFAULT; 1117 break; 1118 } 1119 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, 1120 vrs.sipi_vector); 1121 break; 1122 } 1123 case VM_GET_FPU: { 1124 struct vm_fpu_state req; 1125 const size_t max_len = (PAGESIZE * 2); 1126 void *kbuf; 1127 1128 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1129 error = EFAULT; 1130 break; 1131 } 1132 if (req.len > max_len || req.len == 0) { 1133 error = EINVAL; 1134 break; 1135 } 1136 kbuf = kmem_zalloc(req.len, KM_SLEEP); 1137 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1138 if (error == 0) { 1139 if (ddi_copyout(kbuf, req.buf, req.len, md)) { 1140 error = EFAULT; 1141 } 1142 } 1143 kmem_free(kbuf, req.len); 1144 break; 1145 } 1146 case VM_SET_FPU: { 1147 struct vm_fpu_state req; 1148 const size_t max_len = (PAGESIZE * 2); 1149 void *kbuf; 1150 1151 if (ddi_copyin(datap, &req, sizeof (req), md)) { 1152 error = EFAULT; 1153 break; 1154 } 1155 if (req.len > max_len || req.len == 0) { 1156 error = EINVAL; 1157 break; 1158 } 1159 kbuf = kmem_alloc(req.len, KM_SLEEP); 1160 if (ddi_copyin(req.buf, kbuf, req.len, md)) { 1161 error = EFAULT; 1162 } else { 1163 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); 1164 } 1165 kmem_free(kbuf, req.len); 1166 break; 1167 } 1168 1169 case VM_SET_KERNEMU_DEV: 1170 case VM_GET_KERNEMU_DEV: { 1171 struct vm_readwrite_kernemu_device kemu; 1172 size_t size = 0; 1173 1174 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) { 1175 error = EFAULT; 1176 break; 1177 } 1178 1179 if (kemu.access_width > 3) { 1180 error = EINVAL; 1181 break; 1182 } 1183 size = (1 << kemu.access_width); 1184 ASSERT(size >= 1 && size <= 8); 1185 1186 if (cmd == VM_SET_KERNEMU_DEV) { 1187 error = vm_service_mmio_write(sc->vmm_vm, vcpu, 1188 kemu.gpa, kemu.value, size); 1189 } else { 1190 error = vm_service_mmio_read(sc->vmm_vm, vcpu, 1191 kemu.gpa, &kemu.value, size); 1192 } 1193 1194 if (error == 0) { 1195 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) { 1196 error = EFAULT; 1197 break; 1198 } 1199 } 1200 break; 1201 } 1202 1203 case VM_GET_CAPABILITY: { 1204 struct vm_capability vmcap; 1205 1206 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1207 error = EFAULT; 1208 break; 1209 } 1210 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, 1211 &vmcap.capval); 1212 if (error == 0 && 1213 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { 1214 error = EFAULT; 1215 break; 1216 } 1217 break; 1218 } 1219 case VM_SET_CAPABILITY: { 1220 struct vm_capability vmcap; 1221 1222 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { 1223 error = EFAULT; 1224 break; 1225 } 1226 
error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, 1227 vmcap.capval); 1228 break; 1229 } 1230 case VM_SET_X2APIC_STATE: { 1231 struct vm_x2apic x2apic; 1232 1233 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1234 error = EFAULT; 1235 break; 1236 } 1237 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); 1238 break; 1239 } 1240 case VM_GET_X2APIC_STATE: { 1241 struct vm_x2apic x2apic; 1242 1243 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { 1244 error = EFAULT; 1245 break; 1246 } 1247 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, 1248 &x2apic.state); 1249 if (error == 0 && 1250 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { 1251 error = EFAULT; 1252 break; 1253 } 1254 break; 1255 } 1256 case VM_GET_GPA_PMAP: { 1257 /* 1258 * Until there is a necessity to leak EPT/RVI PTE values to 1259 * userspace, this will remain unimplemented 1260 */ 1261 error = EINVAL; 1262 break; 1263 } 1264 case VM_GET_HPET_CAPABILITIES: { 1265 struct vm_hpet_cap hpetcap; 1266 1267 error = vhpet_getcap(&hpetcap); 1268 if (error == 0 && 1269 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { 1270 error = EFAULT; 1271 break; 1272 } 1273 break; 1274 } 1275 case VM_GLA2GPA: { 1276 struct vm_gla2gpa gg; 1277 1278 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1279 error = EFAULT; 1280 break; 1281 } 1282 gg.vcpuid = vcpu; 1283 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, 1284 gg.prot, &gg.gpa, &gg.fault); 1285 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1286 error = EFAULT; 1287 break; 1288 } 1289 break; 1290 } 1291 case VM_GLA2GPA_NOFAULT: { 1292 struct vm_gla2gpa gg; 1293 1294 if (ddi_copyin(datap, &gg, sizeof (gg), md)) { 1295 error = EFAULT; 1296 break; 1297 } 1298 gg.vcpuid = vcpu; 1299 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, 1300 gg.gla, gg.prot, &gg.gpa, &gg.fault); 1301 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { 1302 error = EFAULT; 1303 break; 1304 } 1305 break; 1306 } 1307 1308 case VM_ACTIVATE_CPU: 1309 error = vm_activate_cpu(sc->vmm_vm, vcpu); 1310 break; 1311 1312 case VM_SUSPEND_CPU: 1313 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1314 error = EFAULT; 1315 } else { 1316 error = vm_suspend_cpu(sc->vmm_vm, vcpu); 1317 } 1318 break; 1319 1320 case VM_RESUME_CPU: 1321 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { 1322 error = EFAULT; 1323 } else { 1324 error = vm_resume_cpu(sc->vmm_vm, vcpu); 1325 } 1326 break; 1327 1328 case VM_GET_CPUS: { 1329 struct vm_cpuset vm_cpuset; 1330 cpuset_t tempset; 1331 void *srcp = &tempset; 1332 int size; 1333 1334 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { 1335 error = EFAULT; 1336 break; 1337 } 1338 1339 /* Be more generous about sizing since our cpuset_t is large. */ 1340 size = vm_cpuset.cpusetsize; 1341 if (size <= 0 || size > sizeof (cpuset_t)) { 1342 error = ERANGE; 1343 } 1344 /* 1345 * If they want a ulong_t or less, make sure they receive the 1346 * low bits with all the useful information. 
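		 *
		 * For example, a caller interested only in the first 64 vCPUs
		 * can pass a uint64_t-sized buffer.  A hedged sketch ('vmfd'
		 * is assumed; field names as recalled from sys/vmm_dev.h):
		 *
		 *	uint64_t mask = 0;
		 *	struct vm_cpuset vc = { 0 };
		 *
		 *	vc.which = VM_ACTIVE_CPUS;
		 *	vc.cpusetsize = sizeof (mask);
		 *	vc.cpus = (void *)&mask;
		 *	(void) ioctl(vmfd, VM_GET_CPUS, &vc);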
1347 */ 1348 if (size <= sizeof (tempset.cpub[0])) { 1349 srcp = &tempset.cpub[0]; 1350 } 1351 1352 if (vm_cpuset.which == VM_ACTIVE_CPUS) { 1353 tempset = vm_active_cpus(sc->vmm_vm); 1354 } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { 1355 tempset = vm_suspended_cpus(sc->vmm_vm); 1356 } else if (vm_cpuset.which == VM_DEBUG_CPUS) { 1357 tempset = vm_debug_cpus(sc->vmm_vm); 1358 } else { 1359 error = EINVAL; 1360 } 1361 1362 ASSERT(size > 0 && size <= sizeof (tempset)); 1363 if (error == 0 && 1364 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { 1365 error = EFAULT; 1366 break; 1367 } 1368 break; 1369 } 1370 case VM_SET_INTINFO: { 1371 struct vm_intinfo vmii; 1372 1373 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { 1374 error = EFAULT; 1375 break; 1376 } 1377 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); 1378 break; 1379 } 1380 case VM_GET_INTINFO: { 1381 struct vm_intinfo vmii; 1382 1383 vmii.vcpuid = vcpu; 1384 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, 1385 &vmii.info2); 1386 if (error == 0 && 1387 ddi_copyout(&vmii, datap, sizeof (vmii), md)) { 1388 error = EFAULT; 1389 break; 1390 } 1391 break; 1392 } 1393 case VM_RTC_WRITE: { 1394 struct vm_rtc_data rtcdata; 1395 1396 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1397 error = EFAULT; 1398 break; 1399 } 1400 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, 1401 rtcdata.value); 1402 break; 1403 } 1404 case VM_RTC_READ: { 1405 struct vm_rtc_data rtcdata; 1406 1407 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { 1408 error = EFAULT; 1409 break; 1410 } 1411 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, 1412 &rtcdata.value); 1413 if (error == 0 && 1414 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { 1415 error = EFAULT; 1416 break; 1417 } 1418 break; 1419 } 1420 case VM_RTC_SETTIME: { 1421 struct vm_rtc_time rtctime; 1422 1423 if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { 1424 error = EFAULT; 1425 break; 1426 } 1427 error = vrtc_set_time(sc->vmm_vm, rtctime.secs); 1428 break; 1429 } 1430 case VM_RTC_GETTIME: { 1431 struct vm_rtc_time rtctime; 1432 1433 rtctime.secs = vrtc_get_time(sc->vmm_vm); 1434 if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { 1435 error = EFAULT; 1436 break; 1437 } 1438 break; 1439 } 1440 1441 case VM_PMTMR_LOCATE: { 1442 uint16_t port = arg; 1443 error = vpmtmr_set_location(sc->vmm_vm, port); 1444 break; 1445 } 1446 1447 case VM_RESTART_INSTRUCTION: 1448 error = vm_restart_instruction(sc->vmm_vm, vcpu); 1449 break; 1450 1451 case VM_SET_TOPOLOGY: { 1452 struct vm_cpu_topology topo; 1453 1454 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { 1455 error = EFAULT; 1456 break; 1457 } 1458 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, 1459 topo.threads, topo.maxcpus); 1460 break; 1461 } 1462 case VM_GET_TOPOLOGY: { 1463 struct vm_cpu_topology topo; 1464 1465 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, 1466 &topo.threads, &topo.maxcpus); 1467 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { 1468 error = EFAULT; 1469 break; 1470 } 1471 break; 1472 } 1473 case VM_DEVMEM_GETOFFSET: { 1474 struct vm_devmem_offset vdo; 1475 vmm_devmem_entry_t *de; 1476 1477 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { 1478 error = EFAULT; 1479 break; 1480 } 1481 1482 de = vmmdev_devmem_find(sc, vdo.segid); 1483 if (de != NULL) { 1484 vdo.offset = de->vde_off; 1485 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { 1486 error = EFAULT; 1487 } 1488 } else { 1489 error = ENOENT; 1490 } 1491 break; 1492 } 1493 case 
VM_TRACK_DIRTY_PAGES: { 1494 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE; 1495 struct vmm_dirty_tracker tracker; 1496 uint8_t *bitmap; 1497 size_t len; 1498 1499 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) { 1500 error = EFAULT; 1501 break; 1502 } 1503 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) { 1504 error = EINVAL; 1505 break; 1506 } 1507 if (tracker.vdt_len == 0) { 1508 break; 1509 } 1510 if ((tracker.vdt_len & PAGEOFFSET) != 0) { 1511 error = EINVAL; 1512 break; 1513 } 1514 if (tracker.vdt_len > max_track_region_len) { 1515 error = EINVAL; 1516 break; 1517 } 1518 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8; 1519 bitmap = kmem_zalloc(len, KM_SLEEP); 1520 vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa, 1521 tracker.vdt_len, bitmap); 1522 if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) { 1523 error = EFAULT; 1524 } 1525 kmem_free(bitmap, len); 1526 1527 break; 1528 } 1529 case VM_WRLOCK_CYCLE: { 1530 /* 1531 * Present a test mechanism to acquire/release the write lock 1532 * on the VM without any other effects. 1533 */ 1534 break; 1535 } 1536 case VM_DATA_READ: { 1537 struct vm_data_xfer vdx; 1538 1539 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1540 error = EFAULT; 1541 break; 1542 } 1543 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1544 error = EINVAL; 1545 break; 1546 } 1547 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1548 error = EFBIG; 1549 break; 1550 } 1551 1552 const size_t len = vdx.vdx_len; 1553 void *buf = NULL; 1554 if (len != 0) { 1555 buf = kmem_alloc(len, KM_SLEEP); 1556 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 && 1557 ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1558 kmem_free(buf, len); 1559 error = EFAULT; 1560 break; 1561 } else { 1562 bzero(buf, len); 1563 } 1564 } 1565 1566 vdx.vdx_result_len = 0; 1567 vmm_data_req_t req = { 1568 .vdr_class = vdx.vdx_class, 1569 .vdr_version = vdx.vdx_version, 1570 .vdr_flags = vdx.vdx_flags, 1571 .vdr_len = len, 1572 .vdr_data = buf, 1573 .vdr_result_len = &vdx.vdx_result_len, 1574 }; 1575 error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req); 1576 1577 if (error == 0 && buf != NULL) { 1578 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1579 error = EFAULT; 1580 } 1581 } 1582 1583 /* 1584 * Copy out the transfer request so that the value of 1585 * vdx_result_len can be made available, regardless of any 1586 * error(s) which may have occurred. 1587 */ 1588 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1589 error = (error != 0) ? 
error : EFAULT; 1590 } 1591 1592 if (buf != NULL) { 1593 kmem_free(buf, len); 1594 } 1595 break; 1596 } 1597 case VM_DATA_WRITE: { 1598 struct vm_data_xfer vdx; 1599 1600 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) { 1601 error = EFAULT; 1602 break; 1603 } 1604 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) { 1605 error = EINVAL; 1606 break; 1607 } 1608 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) { 1609 error = EFBIG; 1610 break; 1611 } 1612 1613 const size_t len = vdx.vdx_len; 1614 void *buf = NULL; 1615 if (len != 0) { 1616 buf = kmem_alloc(len, KM_SLEEP); 1617 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) { 1618 kmem_free(buf, len); 1619 error = EFAULT; 1620 break; 1621 } 1622 } 1623 1624 vdx.vdx_result_len = 0; 1625 vmm_data_req_t req = { 1626 .vdr_class = vdx.vdx_class, 1627 .vdr_version = vdx.vdx_version, 1628 .vdr_flags = vdx.vdx_flags, 1629 .vdr_len = len, 1630 .vdr_data = buf, 1631 .vdr_result_len = &vdx.vdx_result_len, 1632 }; 1633 if (vmm_allow_state_writes == 0) { 1634 /* XXX: Play it safe for now */ 1635 error = EPERM; 1636 } else { 1637 error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid, 1638 &req); 1639 } 1640 1641 if (error == 0 && buf != NULL && 1642 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) { 1643 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) { 1644 error = EFAULT; 1645 } 1646 } 1647 1648 /* 1649 * Copy out the transfer request so that the value of 1650 * vdx_result_len can be made available, regardless of any 1651 * error(s) which may have occurred. 1652 */ 1653 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) { 1654 error = (error != 0) ? error : EFAULT; 1655 } 1656 1657 if (buf != NULL) { 1658 kmem_free(buf, len); 1659 } 1660 break; 1661 } 1662 1663 default: 1664 error = ENOTTY; 1665 break; 1666 } 1667 1668 /* Release exclusion resources */ 1669 switch (lock_type) { 1670 case LOCK_NONE: 1671 break; 1672 case LOCK_VCPU: 1673 vcpu_unlock_one(sc, vcpu); 1674 break; 1675 case LOCK_READ_HOLD: 1676 vmm_read_unlock(sc); 1677 break; 1678 case LOCK_WRITE_HOLD: 1679 vmm_write_unlock(sc); 1680 break; 1681 default: 1682 panic("unexpected lock type"); 1683 break; 1684 } 1685 1686 return (error); 1687 } 1688 1689 static vmm_softc_t * 1690 vmm_lookup(const char *name) 1691 { 1692 list_t *vml = &vmm_list; 1693 vmm_softc_t *sc; 1694 1695 ASSERT(MUTEX_HELD(&vmm_mtx)); 1696 1697 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) { 1698 if (strcmp(sc->vmm_name, name) == 0) { 1699 break; 1700 } 1701 } 1702 1703 return (sc); 1704 } 1705 1706 /* 1707 * Acquire an HMA registration if not already held. 1708 */ 1709 static boolean_t 1710 vmm_hma_acquire(void) 1711 { 1712 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1713 1714 mutex_enter(&vmmdev_mtx); 1715 1716 if (vmmdev_hma_reg == NULL) { 1717 VERIFY3U(vmmdev_hma_ref, ==, 0); 1718 vmmdev_hma_reg = hma_register(vmmdev_hvm_name); 1719 if (vmmdev_hma_reg == NULL) { 1720 cmn_err(CE_WARN, "%s HMA registration failed.", 1721 vmmdev_hvm_name); 1722 mutex_exit(&vmmdev_mtx); 1723 return (B_FALSE); 1724 } 1725 } 1726 1727 vmmdev_hma_ref++; 1728 1729 mutex_exit(&vmmdev_mtx); 1730 1731 return (B_TRUE); 1732 } 1733 1734 /* 1735 * Release the HMA registration if held and there are no remaining VMs. 
1736 */ 1737 static void 1738 vmm_hma_release(void) 1739 { 1740 ASSERT(MUTEX_NOT_HELD(&vmm_mtx)); 1741 1742 mutex_enter(&vmmdev_mtx); 1743 1744 VERIFY3U(vmmdev_hma_ref, !=, 0); 1745 1746 vmmdev_hma_ref--; 1747 1748 if (vmmdev_hma_ref == 0) { 1749 VERIFY(vmmdev_hma_reg != NULL); 1750 hma_unregister(vmmdev_hma_reg); 1751 vmmdev_hma_reg = NULL; 1752 } 1753 mutex_exit(&vmmdev_mtx); 1754 } 1755 1756 static int 1757 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr) 1758 { 1759 vmm_softc_t *sc = NULL; 1760 minor_t minor; 1761 int error = ENOMEM; 1762 size_t len; 1763 const char *name = req->name; 1764 1765 len = strnlen(name, VM_MAX_NAMELEN); 1766 if (len == 0) { 1767 return (EINVAL); 1768 } 1769 if (len >= VM_MAX_NAMELEN) { 1770 return (ENAMETOOLONG); 1771 } 1772 if (strchr(name, '/') != NULL) { 1773 return (EINVAL); 1774 } 1775 1776 if (!vmm_hma_acquire()) 1777 return (ENXIO); 1778 1779 mutex_enter(&vmm_mtx); 1780 1781 /* Look for duplicate names */ 1782 if (vmm_lookup(name) != NULL) { 1783 mutex_exit(&vmm_mtx); 1784 vmm_hma_release(); 1785 return (EEXIST); 1786 } 1787 1788 /* Allow only one instance per non-global zone. */ 1789 if (!INGLOBALZONE(curproc)) { 1790 for (sc = list_head(&vmm_list); sc != NULL; 1791 sc = list_next(&vmm_list, sc)) { 1792 if (sc->vmm_zone == curzone) { 1793 mutex_exit(&vmm_mtx); 1794 vmm_hma_release(); 1795 return (EINVAL); 1796 } 1797 } 1798 } 1799 1800 minor = id_alloc(vmm_minors); 1801 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) { 1802 goto fail; 1803 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1804 ddi_soft_state_free(vmm_statep, minor); 1805 goto fail; 1806 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor, 1807 DDI_PSEUDO, 0) != DDI_SUCCESS) { 1808 goto fail; 1809 } 1810 1811 if (vmm_kstat_alloc(sc, minor, cr) != 0) { 1812 goto fail; 1813 } 1814 1815 error = vm_create(req->flags, &sc->vmm_vm); 1816 if (error == 0) { 1817 /* Complete VM intialization and report success. */ 1818 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); 1819 sc->vmm_minor = minor; 1820 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), 1821 offsetof(vmm_devmem_entry_t, vde_node)); 1822 1823 list_create(&sc->vmm_holds, sizeof (vmm_hold_t), 1824 offsetof(vmm_hold_t, vmh_node)); 1825 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); 1826 1827 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); 1828 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), 1829 offsetof(vmm_lease_t, vml_node)); 1830 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); 1831 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); 1832 1833 sc->vmm_zone = crgetzone(cr); 1834 zone_hold(sc->vmm_zone); 1835 vmm_zsd_add_vm(sc); 1836 vmm_kstat_init(sc); 1837 1838 list_insert_tail(&vmm_list, sc); 1839 mutex_exit(&vmm_mtx); 1840 return (0); 1841 } 1842 1843 vmm_kstat_fini(sc); 1844 ddi_remove_minor_node(vmmdev_dip, name); 1845 fail: 1846 id_free(vmm_minors, minor); 1847 if (sc != NULL) { 1848 ddi_soft_state_free(vmm_statep, minor); 1849 } 1850 mutex_exit(&vmm_mtx); 1851 vmm_hma_release(); 1852 1853 return (error); 1854 } 1855 1856 /* 1857 * Bhyve 'Driver' Interface 1858 * 1859 * While many devices are emulated in the bhyve userspace process, there are 1860 * others with performance constraints which require that they run mostly or 1861 * entirely in-kernel. For those not integrated directly into bhyve, an API is 1862 * needed so they can query/manipulate the portions of VM state needed to 1863 * fulfill their purpose. 
1864 * 1865 * This includes: 1866 * - Translating guest-physical addresses to host-virtual pointers 1867 * - Injecting MSIs 1868 * - Hooking IO port addresses 1869 * 1870 * The vmm_drv interface exists to provide that functionality to its consumers. 1871 * (At this time, 'viona' is the only user) 1872 */ 1873 int 1874 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) 1875 { 1876 vnode_t *vp = fp->f_vnode; 1877 const dev_t dev = vp->v_rdev; 1878 vmm_softc_t *sc; 1879 vmm_hold_t *hold; 1880 int err = 0; 1881 1882 if (vp->v_type != VCHR) { 1883 return (ENXIO); 1884 } 1885 const major_t major = getmajor(dev); 1886 const minor_t minor = getminor(dev); 1887 1888 mutex_enter(&vmmdev_mtx); 1889 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { 1890 mutex_exit(&vmmdev_mtx); 1891 return (ENOENT); 1892 } 1893 mutex_enter(&vmm_mtx); 1894 mutex_exit(&vmmdev_mtx); 1895 1896 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { 1897 err = ENOENT; 1898 goto out; 1899 } 1900 /* XXXJOY: check cred permissions against instance */ 1901 1902 if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { 1903 err = EBUSY; 1904 goto out; 1905 } 1906 1907 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); 1908 hold->vmh_sc = sc; 1909 hold->vmh_release_req = B_FALSE; 1910 1911 list_insert_tail(&sc->vmm_holds, hold); 1912 sc->vmm_flags |= VMM_HELD; 1913 *holdp = hold; 1914 1915 out: 1916 mutex_exit(&vmm_mtx); 1917 return (err); 1918 } 1919 1920 void 1921 vmm_drv_rele(vmm_hold_t *hold) 1922 { 1923 vmm_softc_t *sc; 1924 1925 ASSERT(hold != NULL); 1926 ASSERT(hold->vmh_sc != NULL); 1927 VERIFY(hold->vmh_ioport_hook_cnt == 0); 1928 1929 mutex_enter(&vmm_mtx); 1930 sc = hold->vmh_sc; 1931 list_remove(&sc->vmm_holds, hold); 1932 if (list_is_empty(&sc->vmm_holds)) { 1933 sc->vmm_flags &= ~VMM_HELD; 1934 cv_broadcast(&sc->vmm_cv); 1935 } 1936 mutex_exit(&vmm_mtx); 1937 kmem_free(hold, sizeof (*hold)); 1938 } 1939 1940 boolean_t 1941 vmm_drv_release_reqd(vmm_hold_t *hold) 1942 { 1943 ASSERT(hold != NULL); 1944 1945 return (hold->vmh_release_req); 1946 } 1947 1948 vmm_lease_t * 1949 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) 1950 { 1951 vmm_softc_t *sc = hold->vmh_sc; 1952 vmm_lease_t *lease; 1953 1954 ASSERT3P(expiref, !=, NULL); 1955 1956 if (hold->vmh_release_req) { 1957 return (NULL); 1958 } 1959 1960 lease = kmem_alloc(sizeof (*lease), KM_SLEEP); 1961 list_link_init(&lease->vml_node); 1962 lease->vml_expire_func = expiref; 1963 lease->vml_expire_arg = arg; 1964 lease->vml_expired = B_FALSE; 1965 lease->vml_break_deferred = B_FALSE; 1966 lease->vml_hold = hold; 1967 /* cache the VM pointer for one less pointer chase */ 1968 lease->vml_vm = sc->vmm_vm; 1969 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm)); 1970 1971 mutex_enter(&sc->vmm_lease_lock); 1972 while (sc->vmm_lease_blocker != 0) { 1973 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 1974 } 1975 list_insert_tail(&sc->vmm_lease_list, lease); 1976 vmm_read_lock(sc); 1977 mutex_exit(&sc->vmm_lease_lock); 1978 1979 return (lease); 1980 } 1981 1982 static void 1983 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) 1984 { 1985 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); 1986 1987 list_remove(&sc->vmm_lease_list, lease); 1988 vmm_read_unlock(sc); 1989 vmc_destroy(lease->vml_vmclient); 1990 kmem_free(lease, sizeof (*lease)); 1991 } 1992 1993 static void 1994 vmm_lease_block(vmm_softc_t *sc) 1995 { 1996 mutex_enter(&sc->vmm_lease_lock); 1997 VERIFY3U(sc->vmm_lease_blocker, !=, 
UINT_MAX); 1998 sc->vmm_lease_blocker++; 1999 if (sc->vmm_lease_blocker == 1) { 2000 list_t *list = &sc->vmm_lease_list; 2001 vmm_lease_t *lease = list_head(list); 2002 2003 while (lease != NULL) { 2004 void *arg = lease->vml_expire_arg; 2005 boolean_t (*expiref)(void *) = lease->vml_expire_func; 2006 boolean_t sync_break = B_FALSE; 2007 2008 /* 2009 * Since the lease expiration notification may 2010 * need to take locks which would deadlock with 2011 * vmm_lease_lock, drop it across the call. 2012 * 2013 * We are the only one allowed to manipulate 2014 * vmm_lease_list right now, so it is safe to 2015 * continue iterating through it after 2016 * reacquiring the lock. 2017 */ 2018 lease->vml_expired = B_TRUE; 2019 mutex_exit(&sc->vmm_lease_lock); 2020 sync_break = expiref(arg); 2021 mutex_enter(&sc->vmm_lease_lock); 2022 2023 if (sync_break) { 2024 vmm_lease_t *next; 2025 2026 /* 2027 * These leases which are synchronously broken 2028 * result in vmm_read_unlock() calls from a 2029 * different thread than the corresponding 2030 * vmm_read_lock(). This is acceptable, given 2031 * that the rwlock underpinning the whole 2032 * mechanism tolerates the behavior. This 2033 * flexibility is _only_ afforded to VM read 2034 * lock (RW_READER) holders. 2035 */ 2036 next = list_next(list, lease); 2037 vmm_lease_break_locked(sc, lease); 2038 lease = next; 2039 } else { 2040 lease = list_next(list, lease); 2041 } 2042 } 2043 2044 /* Process leases which were not broken synchronously. */ 2045 while (!list_is_empty(list)) { 2046 /* 2047 * Although the nested loops are quadratic, the number 2048 * of leases is small. 2049 */ 2050 lease = list_head(list); 2051 while (lease != NULL) { 2052 vmm_lease_t *next = list_next(list, lease); 2053 if (lease->vml_break_deferred) { 2054 vmm_lease_break_locked(sc, lease); 2055 } 2056 lease = next; 2057 } 2058 if (list_is_empty(list)) { 2059 break; 2060 } 2061 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2062 } 2063 /* Wake anyone else waiting for the lease list to be empty */ 2064 cv_broadcast(&sc->vmm_lease_cv); 2065 } else { 2066 list_t *list = &sc->vmm_lease_list; 2067 2068 /* 2069 * Some other thread beat us to the duty of lease cleanup. 2070 * Wait until that is complete. 2071 */ 2072 while (!list_is_empty(list)) { 2073 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); 2074 } 2075 } 2076 mutex_exit(&sc->vmm_lease_lock); 2077 } 2078 2079 static void 2080 vmm_lease_unblock(vmm_softc_t *sc) 2081 { 2082 mutex_enter(&sc->vmm_lease_lock); 2083 VERIFY3U(sc->vmm_lease_blocker, !=, 0); 2084 sc->vmm_lease_blocker--; 2085 if (sc->vmm_lease_blocker == 0) { 2086 cv_broadcast(&sc->vmm_lease_cv); 2087 } 2088 mutex_exit(&sc->vmm_lease_lock); 2089 } 2090 2091 void 2092 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) 2093 { 2094 vmm_softc_t *sc = hold->vmh_sc; 2095 2096 VERIFY3P(hold, ==, lease->vml_hold); 2097 VERIFY(!lease->vml_break_deferred); 2098 2099 mutex_enter(&sc->vmm_lease_lock); 2100 if (sc->vmm_lease_blocker == 0) { 2101 vmm_lease_break_locked(sc, lease); 2102 } else { 2103 /* 2104 * Defer the lease-breaking to whichever thread is currently 2105 * cleaning up all leases as part of a vmm_lease_block() call. 
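		 *
		 * From the consumer's side, the expire callback registered
		 * via vmm_drv_lease_sign() either returns B_TRUE to have the
		 * lease broken synchronously, or returns B_FALSE and later
		 * calls vmm_drv_lease_break() itself, landing in this
		 * deferred path.  A hedged sketch of such a callback
		 * (my_dev_t and its flag are hypothetical):
		 *
		 *	static boolean_t
		 *	my_lease_expired(void *arg)
		 *	{
		 *		my_dev_t *dev = arg;
		 *
		 *		dev->md_lease_expired = B_TRUE;
		 *		return (B_FALSE);
		 *	}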
2106 */ 2107 lease->vml_break_deferred = B_TRUE; 2108 cv_broadcast(&sc->vmm_lease_cv); 2109 } 2110 mutex_exit(&sc->vmm_lease_lock); 2111 } 2112 2113 boolean_t 2114 vmm_drv_lease_expired(vmm_lease_t *lease) 2115 { 2116 return (lease->vml_expired); 2117 } 2118 2119 vmm_page_t * 2120 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot) 2121 { 2122 ASSERT(lease != NULL); 2123 ASSERT0(gpa & PAGEOFFSET); 2124 2125 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot)); 2126 } 2127 2128 void 2129 vmm_drv_page_release(vmm_page_t *vmmp) 2130 { 2131 (void) vmp_release((vm_page_t *)vmmp); 2132 } 2133 2134 void 2135 vmm_drv_page_release_chain(vmm_page_t *vmmp) 2136 { 2137 (void) vmp_release_chain((vm_page_t *)vmmp); 2138 } 2139 2140 const void * 2141 vmm_drv_page_readable(const vmm_page_t *vmmp) 2142 { 2143 return (vmp_get_readable((const vm_page_t *)vmmp)); 2144 } 2145 2146 void * 2147 vmm_drv_page_writable(const vmm_page_t *vmmp) 2148 { 2149 return (vmp_get_writable((const vm_page_t *)vmmp)); 2150 } 2151 2152 void 2153 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain) 2154 { 2155 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain); 2156 } 2157 2158 vmm_page_t * 2159 vmm_drv_page_next(const vmm_page_t *vmmp) 2160 { 2161 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp)); 2162 } 2163 2164 int 2165 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) 2166 { 2167 ASSERT(lease != NULL); 2168 2169 return (lapic_intr_msi(lease->vml_vm, addr, msg)); 2170 } 2171 2172 int 2173 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func, 2174 void *arg, void **cookie) 2175 { 2176 vmm_softc_t *sc; 2177 int err; 2178 2179 ASSERT(hold != NULL); 2180 ASSERT(cookie != NULL); 2181 2182 sc = hold->vmh_sc; 2183 mutex_enter(&vmm_mtx); 2184 /* Confirm that hook installation is not blocked */ 2185 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { 2186 mutex_exit(&vmm_mtx); 2187 return (EBUSY); 2188 } 2189 /* 2190 * Optimistically record an installed hook which will prevent a block 2191 * from being asserted while the mutex is dropped. 
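	 *
	 * A hedged sketch of the consumer-side pairing (my_iop_handler and
	 * its argument are hypothetical):
	 *
	 *	void *cookie;
	 *
	 *	if (vmm_drv_ioport_hook(hold, ioport, my_iop_handler, arg,
	 *	    &cookie) == 0) {
	 *		(perform emulation work)
	 *		vmm_drv_ioport_unhook(hold, &cookie);
	 *	}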
2192 */ 2193 hold->vmh_ioport_hook_cnt++; 2194 mutex_exit(&vmm_mtx); 2195 2196 vmm_write_lock(sc); 2197 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func, 2198 arg, cookie); 2199 vmm_write_unlock(sc); 2200 2201 if (err != 0) { 2202 mutex_enter(&vmm_mtx); 2203 /* Walk back optimism about the hook installation */ 2204 hold->vmh_ioport_hook_cnt--; 2205 mutex_exit(&vmm_mtx); 2206 } 2207 return (err); 2208 } 2209 2210 void 2211 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) 2212 { 2213 vmm_softc_t *sc; 2214 2215 ASSERT(hold != NULL); 2216 ASSERT(cookie != NULL); 2217 ASSERT(hold->vmh_ioport_hook_cnt != 0); 2218 2219 sc = hold->vmh_sc; 2220 vmm_write_lock(sc); 2221 vm_ioport_unhook(sc->vmm_vm, cookie); 2222 vmm_write_unlock(sc); 2223 2224 mutex_enter(&vmm_mtx); 2225 hold->vmh_ioport_hook_cnt--; 2226 mutex_exit(&vmm_mtx); 2227 } 2228 2229 static int 2230 vmm_drv_purge(vmm_softc_t *sc) 2231 { 2232 ASSERT(MUTEX_HELD(&vmm_mtx)); 2233 2234 if ((sc->vmm_flags & VMM_HELD) != 0) { 2235 vmm_hold_t *hold; 2236 2237 sc->vmm_flags |= VMM_CLEANUP; 2238 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2239 hold = list_next(&sc->vmm_holds, hold)) { 2240 hold->vmh_release_req = B_TRUE; 2241 } 2242 while ((sc->vmm_flags & VMM_HELD) != 0) { 2243 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { 2244 return (EINTR); 2245 } 2246 } 2247 sc->vmm_flags &= ~VMM_CLEANUP; 2248 } 2249 2250 VERIFY(list_is_empty(&sc->vmm_holds)); 2251 sc->vmm_flags |= VMM_PURGED; 2252 return (0); 2253 } 2254 2255 static int 2256 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) 2257 { 2258 int err = 0; 2259 2260 mutex_enter(&vmm_mtx); 2261 if (!enable_block) { 2262 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); 2263 2264 sc->vmm_flags &= ~VMM_BLOCK_HOOK; 2265 goto done; 2266 } 2267 2268 /* If any holds have hooks installed, the block is a failure */ 2269 if (!list_is_empty(&sc->vmm_holds)) { 2270 vmm_hold_t *hold; 2271 2272 for (hold = list_head(&sc->vmm_holds); hold != NULL; 2273 hold = list_next(&sc->vmm_holds, hold)) { 2274 if (hold->vmh_ioport_hook_cnt != 0) { 2275 err = EBUSY; 2276 goto done; 2277 } 2278 } 2279 } 2280 sc->vmm_flags |= VMM_BLOCK_HOOK; 2281 2282 done: 2283 mutex_exit(&vmm_mtx); 2284 return (err); 2285 } 2286 2287 static int 2288 vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd, 2289 boolean_t *hma_release) 2290 { 2291 dev_info_t *pdip = ddi_get_parent(vmmdev_dip); 2292 minor_t minor; 2293 2294 ASSERT(MUTEX_HELD(&vmm_mtx)); 2295 2296 *hma_release = B_FALSE; 2297 2298 if (vmm_drv_purge(sc) != 0) { 2299 return (EINTR); 2300 } 2301 2302 if (clean_zsd) { 2303 vmm_zsd_rem_vm(sc); 2304 } 2305 2306 /* Clean up devmem entries */ 2307 vmmdev_devmem_purge(sc); 2308 2309 list_remove(&vmm_list, sc); 2310 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); 2311 minor = sc->vmm_minor; 2312 zone_rele(sc->vmm_zone); 2313 if (sc->vmm_is_open) { 2314 list_insert_tail(&vmm_destroy_list, sc); 2315 sc->vmm_flags |= VMM_DESTROY; 2316 } else { 2317 vmm_kstat_fini(sc); 2318 vm_destroy(sc->vmm_vm); 2319 ddi_soft_state_free(vmm_statep, minor); 2320 id_free(vmm_minors, minor); 2321 *hma_release = B_TRUE; 2322 } 2323 (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); 2324 2325 return (0); 2326 } 2327 2328 int 2329 vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) 2330 { 2331 boolean_t hma_release = B_FALSE; 2332 int err; 2333 2334 mutex_enter(&vmm_mtx); 2335 err = vmm_do_vm_destroy_locked(sc, clean_zsd, &hma_release); 2336 mutex_exit(&vmm_mtx); 2337 2338 if (hma_release) 2339 vmm_hma_release(); 2340 2341 
return (err); 2342 } 2343 2344 /* ARGSUSED */ 2345 static int 2346 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr) 2347 { 2348 boolean_t hma_release = B_FALSE; 2349 vmm_softc_t *sc; 2350 int err; 2351 2352 if (crgetuid(cr) != 0) 2353 return (EPERM); 2354 2355 mutex_enter(&vmm_mtx); 2356 2357 if ((sc = vmm_lookup(req->name)) == NULL) { 2358 mutex_exit(&vmm_mtx); 2359 return (ENOENT); 2360 } 2361 /* 2362 * We don't check this in vmm_lookup() since that function is also used 2363 * for validation during create and currently vmm names must be unique. 2364 */ 2365 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { 2366 mutex_exit(&vmm_mtx); 2367 return (EPERM); 2368 } 2369 err = vmm_do_vm_destroy_locked(sc, B_TRUE, &hma_release); 2370 2371 mutex_exit(&vmm_mtx); 2372 2373 if (hma_release) 2374 vmm_hma_release(); 2375 2376 return (err); 2377 } 2378 2379 #define VCPU_NAME_BUFLEN 32 2380 2381 static int 2382 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr) 2383 { 2384 zoneid_t zid = crgetzoneid(cr); 2385 int instance = minor; 2386 kstat_t *ksp; 2387 2388 ASSERT3P(sc->vmm_kstat_vm, ==, NULL); 2389 2390 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm", 2391 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2392 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid); 2393 2394 if (ksp == NULL) { 2395 return (-1); 2396 } 2397 sc->vmm_kstat_vm = ksp; 2398 2399 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2400 char namebuf[VCPU_NAME_BUFLEN]; 2401 2402 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL); 2403 2404 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i); 2405 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf, 2406 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED, 2407 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t), 2408 0, zid); 2409 if (ksp == NULL) { 2410 goto fail; 2411 } 2412 2413 sc->vmm_kstat_vcpu[i] = ksp; 2414 } 2415 2416 /* 2417 * If this instance is associated with a non-global zone, make its 2418 * kstats visible from the GZ. 
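 * An observer in the global zone can then read the per-vCPU counters
 * of any instance, for example (assuming VMM_MODULE_NAME of "vmm" and
 * a VM which was assigned minor number 5):
 *
 *	kstat -m vmm -i 5 -n vcpu0
 *
 * while the owning zone continues to see its own instance as usual.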
2419 */ 2420 if (zid != GLOBAL_ZONEID) { 2421 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID); 2422 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2423 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID); 2424 } 2425 } 2426 2427 return (0); 2428 2429 fail: 2430 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2431 if (sc->vmm_kstat_vcpu[i] != NULL) { 2432 kstat_delete(sc->vmm_kstat_vcpu[i]); 2433 sc->vmm_kstat_vcpu[i] = NULL; 2434 } else { 2435 break; 2436 } 2437 } 2438 kstat_delete(sc->vmm_kstat_vm); 2439 sc->vmm_kstat_vm = NULL; 2440 return (-1); 2441 } 2442 2443 static void 2444 vmm_kstat_init(vmm_softc_t *sc) 2445 { 2446 kstat_t *ksp; 2447 2448 ASSERT3P(sc->vmm_vm, !=, NULL); 2449 ASSERT3P(sc->vmm_kstat_vm, !=, NULL); 2450 2451 ksp = sc->vmm_kstat_vm; 2452 vmm_kstats_t *vk = ksp->ks_data; 2453 ksp->ks_private = sc->vmm_vm; 2454 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING); 2455 kstat_named_setstr(&vk->vk_name, sc->vmm_name); 2456 2457 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2458 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2459 2460 ksp = sc->vmm_kstat_vcpu[i]; 2461 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 2462 2463 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32); 2464 vvk->vvk_vcpu.value.ui32 = i; 2465 kstat_named_init(&vvk->vvk_time_init, "time_init", 2466 KSTAT_DATA_UINT64); 2467 kstat_named_init(&vvk->vvk_time_run, "time_run", 2468 KSTAT_DATA_UINT64); 2469 kstat_named_init(&vvk->vvk_time_idle, "time_idle", 2470 KSTAT_DATA_UINT64); 2471 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern", 2472 KSTAT_DATA_UINT64); 2473 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user", 2474 KSTAT_DATA_UINT64); 2475 kstat_named_init(&vvk->vvk_time_sched, "time_sched", 2476 KSTAT_DATA_UINT64); 2477 ksp->ks_private = sc->vmm_vm; 2478 ksp->ks_update = vmm_kstat_update_vcpu; 2479 } 2480 2481 kstat_install(sc->vmm_kstat_vm); 2482 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2483 kstat_install(sc->vmm_kstat_vcpu[i]); 2484 } 2485 } 2486 2487 static void 2488 vmm_kstat_fini(vmm_softc_t *sc) 2489 { 2490 ASSERT(sc->vmm_kstat_vm != NULL); 2491 2492 kstat_delete(sc->vmm_kstat_vm); 2493 sc->vmm_kstat_vm = NULL; 2494 2495 for (uint_t i = 0; i < VM_MAXCPU; i++) { 2496 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL); 2497 2498 kstat_delete(sc->vmm_kstat_vcpu[i]); 2499 sc->vmm_kstat_vcpu[i] = NULL; 2500 } 2501 } 2502 2503 static int 2504 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) 2505 { 2506 minor_t minor; 2507 vmm_softc_t *sc; 2508 2509 /* 2510 * Forbid running bhyve in a 32-bit process until it has been tested and 2511 * verified to be safe. 2512 */ 2513 if (curproc->p_model != DATAMODEL_LP64) { 2514 return (EFBIG); 2515 } 2516 2517 minor = getminor(*devp); 2518 if (minor == VMM_CTL_MINOR) { 2519 /* 2520 * Master control device must be opened exclusively. 
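 * A userspace consumer is expected to open the control node (nominally
 * /dev/vmmctl) with O_EXCL before issuing control ioctls, along the
 * lines of:
 *
 *	int ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	int vers = ioctl(ctlfd, VMM_INTERFACE_VERSION, 0);
 *
 * Anything other than an exclusive character open is rejected with
 * EINVAL below.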
2521 */ 2522 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) { 2523 return (EINVAL); 2524 } 2525 2526 return (0); 2527 } 2528 2529 mutex_enter(&vmm_mtx); 2530 sc = ddi_get_soft_state(vmm_statep, minor); 2531 if (sc == NULL) { 2532 mutex_exit(&vmm_mtx); 2533 return (ENXIO); 2534 } 2535 2536 sc->vmm_is_open = B_TRUE; 2537 mutex_exit(&vmm_mtx); 2538 2539 return (0); 2540 } 2541 2542 static int 2543 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) 2544 { 2545 minor_t minor; 2546 vmm_softc_t *sc; 2547 boolean_t hma_release = B_FALSE; 2548 2549 minor = getminor(dev); 2550 if (minor == VMM_CTL_MINOR) 2551 return (0); 2552 2553 mutex_enter(&vmm_mtx); 2554 sc = ddi_get_soft_state(vmm_statep, minor); 2555 if (sc == NULL) { 2556 mutex_exit(&vmm_mtx); 2557 return (ENXIO); 2558 } 2559 2560 VERIFY(sc->vmm_is_open); 2561 sc->vmm_is_open = B_FALSE; 2562 2563 /* 2564 * If this VM was destroyed while the vmm device was open, then 2565 * clean it up now that it is closed. 2566 */ 2567 if (sc->vmm_flags & VMM_DESTROY) { 2568 list_remove(&vmm_destroy_list, sc); 2569 vmm_kstat_fini(sc); 2570 vm_destroy(sc->vmm_vm); 2571 ddi_soft_state_free(vmm_statep, minor); 2572 id_free(vmm_minors, minor); 2573 hma_release = B_TRUE; 2574 } 2575 mutex_exit(&vmm_mtx); 2576 2577 if (hma_release) 2578 vmm_hma_release(); 2579 2580 return (0); 2581 } 2582 2583 static int 2584 vmm_is_supported(intptr_t arg) 2585 { 2586 int r; 2587 const char *msg; 2588 2589 if (vmm_is_intel()) { 2590 r = vmx_x86_supported(&msg); 2591 } else if (vmm_is_svm()) { 2592 /* 2593 * HMA already ensured that the features necessary for SVM 2594 * operation were present and online during vmm_attach(). 2595 */ 2596 r = 0; 2597 } else { 2598 r = ENXIO; 2599 msg = "Unsupported CPU vendor"; 2600 } 2601 2602 if (r != 0 && arg != (intptr_t)NULL) { 2603 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) 2604 return (EFAULT); 2605 } 2606 return (r); 2607 } 2608 2609 static int 2610 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp) 2611 { 2612 void *argp = (void *)arg; 2613 2614 switch (cmd) { 2615 case VMM_CREATE_VM: { 2616 struct vm_create_req req; 2617 2618 if ((md & FWRITE) == 0) { 2619 return (EPERM); 2620 } 2621 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2622 return (EFAULT); 2623 } 2624 return (vmmdev_do_vm_create(&req, cr)); 2625 } 2626 case VMM_DESTROY_VM: { 2627 struct vm_destroy_req req; 2628 2629 if ((md & FWRITE) == 0) { 2630 return (EPERM); 2631 } 2632 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) { 2633 return (EFAULT); 2634 } 2635 return (vmmdev_do_vm_destroy(&req, cr)); 2636 } 2637 case VMM_VM_SUPPORTED: 2638 return (vmm_is_supported(arg)); 2639 case VMM_INTERFACE_VERSION: 2640 *rvalp = VMM_CURRENT_INTERFACE_VERSION; 2641 return (0); 2642 case VMM_CHECK_IOMMU: 2643 if (!vmm_check_iommu()) { 2644 return (ENXIO); 2645 } 2646 return (0); 2647 case VMM_RESV_QUERY: 2648 case VMM_RESV_ADD: 2649 case VMM_RESV_REMOVE: 2650 return (vmmr_ioctl(cmd, arg, md, cr, rvalp)); 2651 default: 2652 break; 2653 } 2654 /* No other actions are legal on ctl device */ 2655 return (ENOTTY); 2656 } 2657 2658 static int 2659 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 2660 int *rvalp) 2661 { 2662 vmm_softc_t *sc; 2663 minor_t minor; 2664 2665 /* 2666 * Forbid running bhyve in a 32-bit process until it has been tested and 2667 * verified to be safe. 
2668 */ 2669 if (curproc->p_model != DATAMODEL_LP64) { 2670 return (EFBIG); 2671 } 2672 2673 /* The structs in bhyve ioctls assume a 64-bit datamodel */ 2674 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) { 2675 return (ENOTSUP); 2676 } 2677 2678 minor = getminor(dev); 2679 2680 if (minor == VMM_CTL_MINOR) { 2681 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp)); 2682 } 2683 2684 sc = ddi_get_soft_state(vmm_statep, minor); 2685 ASSERT(sc); 2686 2687 if (sc->vmm_flags & VMM_DESTROY) 2688 return (ENXIO); 2689 2690 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); 2691 } 2692 2693 static int 2694 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, 2695 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) 2696 { 2697 vmm_softc_t *sc; 2698 const minor_t minor = getminor(dev); 2699 int err; 2700 2701 if (minor == VMM_CTL_MINOR) { 2702 return (ENODEV); 2703 } 2704 if (off < 0 || (off + len) <= 0) { 2705 return (EINVAL); 2706 } 2707 if ((prot & PROT_USER) == 0) { 2708 return (EACCES); 2709 } 2710 2711 sc = ddi_get_soft_state(vmm_statep, minor); 2712 ASSERT(sc); 2713 2714 if (sc->vmm_flags & VMM_DESTROY) 2715 return (ENXIO); 2716 2717 /* Grab read lock on the VM to prevent any changes to the memory map */ 2718 vmm_read_lock(sc); 2719 2720 if (off >= VM_DEVMEM_START) { 2721 int segid; 2722 off_t segoff; 2723 2724 /* Mapping a devmem "device" */ 2725 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) { 2726 err = ENODEV; 2727 } else { 2728 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as, 2729 addrp, prot, maxprot, flags); 2730 } 2731 } else { 2732 /* Mapping a part of the guest physical space */ 2733 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot, 2734 maxprot, flags); 2735 } 2736 2737 vmm_read_unlock(sc); 2738 return (err); 2739 } 2740 2741 static sdev_plugin_validate_t 2742 vmm_sdev_validate(sdev_ctx_t ctx) 2743 { 2744 const char *name = sdev_ctx_name(ctx); 2745 vmm_softc_t *sc; 2746 sdev_plugin_validate_t ret; 2747 minor_t minor; 2748 2749 if (sdev_ctx_vtype(ctx) != VCHR) 2750 return (SDEV_VTOR_INVALID); 2751 2752 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); 2753 2754 mutex_enter(&vmm_mtx); 2755 if ((sc = vmm_lookup(name)) == NULL) 2756 ret = SDEV_VTOR_INVALID; 2757 else if (sc->vmm_minor != minor) 2758 ret = SDEV_VTOR_STALE; 2759 else 2760 ret = SDEV_VTOR_VALID; 2761 mutex_exit(&vmm_mtx); 2762 2763 return (ret); 2764 } 2765 2766 static int 2767 vmm_sdev_filldir(sdev_ctx_t ctx) 2768 { 2769 vmm_softc_t *sc; 2770 int ret; 2771 2772 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { 2773 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, 2774 sdev_ctx_path(ctx), VMM_SDEV_ROOT); 2775 return (EINVAL); 2776 } 2777 2778 mutex_enter(&vmm_mtx); 2779 ASSERT(vmmdev_dip != NULL); 2780 for (sc = list_head(&vmm_list); sc != NULL; 2781 sc = list_next(&vmm_list, sc)) { 2782 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { 2783 ret = sdev_plugin_mknod(ctx, sc->vmm_name, 2784 S_IFCHR | 0600, 2785 makedevice(ddi_driver_major(vmmdev_dip), 2786 sc->vmm_minor)); 2787 } else { 2788 continue; 2789 } 2790 if (ret != 0 && ret != EEXIST) 2791 goto out; 2792 } 2793 2794 ret = 0; 2795 2796 out: 2797 mutex_exit(&vmm_mtx); 2798 return (ret); 2799 } 2800 2801 /* ARGSUSED */ 2802 static void 2803 vmm_sdev_inactive(sdev_ctx_t ctx) 2804 { 2805 } 2806 2807 static sdev_plugin_ops_t vmm_sdev_ops = { 2808 .spo_version = SDEV_PLUGIN_VERSION, 2809 .spo_flags = SDEV_PLUGIN_SUBDIR, 2810 .spo_validate = vmm_sdev_validate, 
2811 .spo_filldir = vmm_sdev_filldir, 2812 .spo_inactive = vmm_sdev_inactive 2813 }; 2814 2815 /* ARGSUSED */ 2816 static int 2817 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 2818 { 2819 int error; 2820 2821 switch (cmd) { 2822 case DDI_INFO_DEVT2DEVINFO: 2823 *result = (void *)vmmdev_dip; 2824 error = DDI_SUCCESS; 2825 break; 2826 case DDI_INFO_DEVT2INSTANCE: 2827 *result = (void *)0; 2828 error = DDI_SUCCESS; 2829 break; 2830 default: 2831 error = DDI_FAILURE; 2832 break; 2833 } 2834 return (error); 2835 } 2836 2837 static int 2838 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2839 { 2840 sdev_plugin_hdl_t sph; 2841 hma_reg_t *reg = NULL; 2842 boolean_t vmm_loaded = B_FALSE; 2843 2844 if (cmd != DDI_ATTACH) { 2845 return (DDI_FAILURE); 2846 } 2847 2848 mutex_enter(&vmmdev_mtx); 2849 /* Ensure we are not already attached. */ 2850 if (vmmdev_dip != NULL) { 2851 mutex_exit(&vmmdev_mtx); 2852 return (DDI_FAILURE); 2853 } 2854 2855 vmm_sol_glue_init(); 2856 2857 /* 2858 * Perform temporary HMA registration to determine if the system 2859 * is capable. 2860 */ 2861 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { 2862 goto fail; 2863 } else if (vmm_mod_load() != 0) { 2864 goto fail; 2865 } 2866 vmm_loaded = B_TRUE; 2867 hma_unregister(reg); 2868 reg = NULL; 2869 2870 /* Create control node. Other nodes will be created on demand. */ 2871 if (ddi_create_minor_node(dip, "ctl", S_IFCHR, 2872 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { 2873 goto fail; 2874 } 2875 2876 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL); 2877 if (sph == (sdev_plugin_hdl_t)NULL) { 2878 ddi_remove_minor_node(dip, NULL); 2879 goto fail; 2880 } 2881 2882 ddi_report_dev(dip); 2883 vmmdev_sdev_hdl = sph; 2884 vmmdev_dip = dip; 2885 mutex_exit(&vmmdev_mtx); 2886 return (DDI_SUCCESS); 2887 2888 fail: 2889 if (vmm_loaded) { 2890 VERIFY0(vmm_mod_unload()); 2891 } 2892 if (reg != NULL) { 2893 hma_unregister(reg); 2894 } 2895 vmm_sol_glue_cleanup(); 2896 mutex_exit(&vmmdev_mtx); 2897 return (DDI_FAILURE); 2898 } 2899 2900 static int 2901 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2902 { 2903 if (cmd != DDI_DETACH) { 2904 return (DDI_FAILURE); 2905 } 2906 2907 /* 2908 * Ensure that all resources have been cleaned up. 2909 * 2910 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if 2911 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our 2912 * devinfo locked as iommu_cleanup() tries to recursively lock each 2913 * devinfo, including our own, while holding vmmdev_mtx. 2914 */ 2915 if (mutex_tryenter(&vmmdev_mtx) == 0) 2916 return (DDI_FAILURE); 2917 2918 mutex_enter(&vmm_mtx); 2919 if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { 2920 mutex_exit(&vmm_mtx); 2921 mutex_exit(&vmmdev_mtx); 2922 return (DDI_FAILURE); 2923 } 2924 mutex_exit(&vmm_mtx); 2925 2926 if (!vmmr_is_empty()) { 2927 mutex_exit(&vmmdev_mtx); 2928 return (DDI_FAILURE); 2929 } 2930 2931 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); 2932 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { 2933 mutex_exit(&vmmdev_mtx); 2934 return (DDI_FAILURE); 2935 } 2936 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; 2937 2938 /* Remove the control node. 
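 * Per-VM minor nodes are already gone at this point: each was removed
 * in vmm_do_vm_destroy_locked(), and the empty-list checks above
 * guarantee that no instances remain.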
*/ 2939 ddi_remove_minor_node(dip, "ctl"); 2940 vmmdev_dip = NULL; 2941 2942 VERIFY0(vmm_mod_unload()); 2943 VERIFY3U(vmmdev_hma_reg, ==, NULL); 2944 vmm_sol_glue_cleanup(); 2945 2946 mutex_exit(&vmmdev_mtx); 2947 2948 return (DDI_SUCCESS); 2949 } 2950 2951 static struct cb_ops vmm_cb_ops = { 2952 vmm_open, 2953 vmm_close, 2954 nodev, /* strategy */ 2955 nodev, /* print */ 2956 nodev, /* dump */ 2957 nodev, /* read */ 2958 nodev, /* write */ 2959 vmm_ioctl, 2960 nodev, /* devmap */ 2961 nodev, /* mmap */ 2962 vmm_segmap, 2963 nochpoll, /* poll */ 2964 ddi_prop_op, 2965 NULL, 2966 D_NEW | D_MP | D_DEVMAP 2967 }; 2968 2969 static struct dev_ops vmm_ops = { 2970 DEVO_REV, 2971 0, 2972 vmm_info, 2973 nulldev, /* identify */ 2974 nulldev, /* probe */ 2975 vmm_attach, 2976 vmm_detach, 2977 nodev, /* reset */ 2978 &vmm_cb_ops, 2979 (struct bus_ops *)NULL 2980 }; 2981 2982 static struct modldrv modldrv = { 2983 &mod_driverops, 2984 "bhyve vmm", 2985 &vmm_ops 2986 }; 2987 2988 static struct modlinkage modlinkage = { 2989 MODREV_1, 2990 &modldrv, 2991 NULL 2992 }; 2993 2994 int 2995 _init(void) 2996 { 2997 int error; 2998 2999 sysinit(); 3000 3001 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); 3002 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); 3003 list_create(&vmm_list, sizeof (vmm_softc_t), 3004 offsetof(vmm_softc_t, vmm_node)); 3005 list_create(&vmm_destroy_list, sizeof (vmm_softc_t), 3006 offsetof(vmm_softc_t, vmm_node)); 3007 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); 3008 3009 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); 3010 if (error) { 3011 return (error); 3012 } 3013 3014 vmm_zsd_init(); 3015 vmmr_init(); 3016 3017 error = mod_install(&modlinkage); 3018 if (error) { 3019 ddi_soft_state_fini(&vmm_statep); 3020 vmm_zsd_fini(); 3021 vmmr_fini(); 3022 } 3023 3024 return (error); 3025 } 3026 3027 int 3028 _fini(void) 3029 { 3030 int error; 3031 3032 error = mod_remove(&modlinkage); 3033 if (error) { 3034 return (error); 3035 } 3036 3037 vmm_zsd_fini(); 3038 vmmr_fini(); 3039 3040 ddi_soft_state_fini(&vmm_statep); 3041 3042 return (0); 3043 } 3044 3045 int 3046 _info(struct modinfo *modinfop) 3047 { 3048 return (mod_info(&modlinkage, modinfop)); 3049 } 3050