1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2015 Pluribus Networks Inc.
15 * Copyright 2019 Joyent, Inc.
16 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17 * Copyright 2023 Oxide Computer Company
18 */
19
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49
50 #include <vm/seg_dev.h>
51
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61
62 /*
63 * Locking details:
64 *
65 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66 * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data
67 * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire
68 * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to
69 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70 */
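/*
 * As an illustrative sketch only (no such helper exists in this driver), a
 * hypothetical caller needing both locks would follow that ordering:
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	... examine or update vmmdev_* and vmm_* data ...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */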
71
72 static kmutex_t vmmdev_mtx;
73 static dev_info_t *vmmdev_dip;
74 static hma_reg_t *vmmdev_hma_reg;
75 static uint_t vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77
78 static kmutex_t vmm_mtx;
79 static list_t vmm_list;
80 static id_space_t *vmm_minors;
81 static void *vmm_statep;
82
83 /*
84 * Until device emulation in bhyve had been adequately scrutinized and tested,
85 * there was (justified) concern that unusual or corrupt device state payloads
86 * could crash the host when loaded via the vmm-data interface.
87 *
88 * Now that those concerns have been mitigated, this protection is loosened to
89 * default-allow, but the switch is left in place, in case there is a need to
90 * once again clamp down on vmm-data writes.
91 */
92 int vmm_allow_state_writes = 1;
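/*
 * Should that need arise, the switch can be flipped back at runtime from the
 * global zone; something along the lines of the following mdb(1) invocation
 * (shown only as a hypothetical example) would disallow writes again:
 *
 *	echo 'vmm_allow_state_writes/W 0' | mdb -kw
 */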
93
94 static const char *vmmdev_hvm_name = "bhyve";
95
96 /* For sdev plugin (/dev) */
97 #define VMM_SDEV_ROOT "/dev/vmm"
98
99 /* From uts/intel/io/vmm/intel/vmx.c */
100 extern int vmx_x86_supported(const char **);
101
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 list_node_t vmh_node;
105 vmm_softc_t *vmh_sc;
106 boolean_t vmh_release_req;
107 uint_t vmh_ioport_hook_cnt;
108 };
109
110 struct vmm_lease {
111 list_node_t vml_node;
112 struct vm *vml_vm;
113 vm_client_t *vml_vmclient;
114 boolean_t vml_expired;
115 boolean_t vml_break_deferred;
116 boolean_t (*vml_expire_func)(void *);
117 void *vml_expire_arg;
118 struct vmm_hold *vml_hold;
119 };
120
121 /* Options for vmm_destroy_locked */
122 typedef enum vmm_destroy_opts {
123 VDO_DEFAULT = 0,
124 /*
125 * Indicate that the zone-specific data (ZSD) associated with this VM
126 * should not be cleaned up as part of the destroy. Skipping ZSD
127 * clean-up is necessary when the VM is being destroyed as part of
128 * zone destruction, since said ZSD is already being cleaned up.
129 */
130 VDO_NO_CLEAN_ZSD = (1 << 0),
131 /*
132 * Attempt to wait for VM destruction to complete. This is opt-in,
133 * since there are many normal conditions which could lead to
134 * destruction being stalled pending other clean-up.
135 */
136 VDO_ATTEMPT_WAIT = (1 << 1),
137 } vmm_destroy_opts_t;
138
139 static void vmm_hma_release(void);
140 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
141 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
142 static void vmm_lease_block(vmm_softc_t *);
143 static void vmm_lease_unblock(vmm_softc_t *);
144 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
145 static void vmm_kstat_init(vmm_softc_t *);
146 static void vmm_kstat_fini(vmm_softc_t *);
147
148 /*
149 * The 'devmem' hack:
150 *
151 * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
152 * in the vm which appear with their own name related to the vm under /dev.
153 * Since this would be a hassle from an sdev perspective and would require a
154 * new cdev interface (or complicate the existing one), we choose to implement
155 * this in a different manner. Direct access to the underlying vm memory
156 * segments is exposed by placing them in a range of offsets beyond the normal
157 * guest memory space. Userspace can query the appropriate offset to mmap()
158 * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
159 */
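/*
 * A rough sketch of the resulting userspace flow (hypothetical 'vmfd',
 * 'segid', and 'seg_len' values; error handling elided):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *		...
 *	}
 */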
160
161 static vmm_devmem_entry_t *
162 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
163 {
164 vmm_devmem_entry_t *ent = NULL;
165 list_t *dl = &sc->vmm_devmem_list;
166
167 for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
168 if (ent->vde_segid == segid) {
169 return (ent);
170 }
171 }
172 return (NULL);
173 }
174
175 static int
176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
177 {
178 int error;
179 bool sysmem;
180
181 error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
182 NULL);
183 if (error || mseg->len == 0)
184 return (error);
185
186 if (!sysmem) {
187 vmm_devmem_entry_t *de;
188
189 de = vmmdev_devmem_find(sc, mseg->segid);
190 if (de != NULL) {
191 (void) strlcpy(mseg->name, de->vde_name,
192 sizeof (mseg->name));
193 }
194 } else {
195 bzero(mseg->name, sizeof (mseg->name));
196 }
197
198 return (error);
199 }
200
201 static int
202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
203 {
204 off_t map_offset;
205 vmm_devmem_entry_t *entry;
206
207 if (list_is_empty(&sc->vmm_devmem_list)) {
208 map_offset = VM_DEVMEM_START;
209 } else {
210 entry = list_tail(&sc->vmm_devmem_list);
211 map_offset = entry->vde_off + entry->vde_len;
212 if (map_offset < entry->vde_off) {
213 /* Do not tolerate overflow */
214 return (ERANGE);
215 }
216 /*
217 * XXXJOY: We could choose to search the list for duplicate
218 * names and toss an error. Since we're using the offset
219 * method for now, it does not make much of a difference.
220 */
221 }
222
223 entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
224 entry->vde_segid = mseg->segid;
225 entry->vde_len = mseg->len;
226 entry->vde_off = map_offset;
227 (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
228 list_insert_tail(&sc->vmm_devmem_list, entry);
229
230 return (0);
231 }
232
233 static boolean_t
234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
235 off_t *map_offp)
236 {
237 list_t *dl = &sc->vmm_devmem_list;
238 vmm_devmem_entry_t *de = NULL;
239 const off_t map_end = off + len;
240
241 VERIFY(off >= VM_DEVMEM_START);
242
243 if (map_end < off) {
244 /* No match on overflow */
245 return (B_FALSE);
246 }
247
248 for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
249 const off_t item_end = de->vde_off + de->vde_len;
250
251 if (de->vde_off <= off && item_end >= map_end) {
252 *segidp = de->vde_segid;
253 *map_offp = off - de->vde_off;
254 return (B_TRUE);
255 }
256 }
257 return (B_FALSE);
258 }
259
260 /*
261 * When an instance is being destroyed, the devmem list of named memory objects
262 * can be torn down, as no new mappings are allowed.
263 */
264 static void
265 vmmdev_devmem_purge(vmm_softc_t *sc)
266 {
267 vmm_devmem_entry_t *entry;
268
269 while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
270 kmem_free(entry, sizeof (*entry));
271 }
272 }
273
274 static int
275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
276 {
277 int error;
278 bool sysmem = true;
279
280 if (VM_MEMSEG_NAME(mseg)) {
281 sysmem = false;
282 }
283 error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
284
285 if (error == 0) {
286 /*
287 * Rather than create a whole fresh device from which userspace
288 * can mmap this segment, instead make it available at an
289 * offset above where the main guest memory resides.
290 */
291 error = vmmdev_devmem_create(sc, mseg, mseg->name);
292 if (error != 0) {
293 vm_free_memseg(sc->vmm_vm, mseg->segid);
294 }
295 }
296 return (error);
297 }
298
299 /*
300 * Resource Locking and Exclusion
301 *
302 * Much of bhyve depends on key portions of VM state, such as the guest memory
303 * map, to remain unchanged while the guest is running. As ported from
304 * FreeBSD, the initial strategy for this resource exclusion hinged on gating
305 * access to the instance vCPUs. Threads acting on a single vCPU, like those
306 * performing the work of actually running the guest in VMX/SVM, would lock
307 * only that vCPU during ioctl() entry. For ioctls which would change VM-wide
308 * state, all of the vCPUs would be first locked, ensuring that the
309 * operation(s) could complete without any other threads stumbling into
310 * intermediate states.
311 *
312 * This approach is largely effective for bhyve. Common operations, such as
313 * running the vCPUs, steer clear of lock contention. The model begins to
314 * break down for operations which do not occur in the context of a specific
315 * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker
316 * thread in the bhyve process. In order to properly protect those vCPU-less
317 * operations from encountering invalid states, additional locking is required.
318 * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
319 * It does mean that class of operations will be serialized on locking the
320 * specific vCPU and that instances sized at VM_MAXCPU will potentially see
321 * undue contention on the VM_MAXCPU-1 vCPU.
322 *
323 * In order to address the shortcomings of this model, the concept of a
324 * read/write lock has been added to bhyve. Operations which change
325 * fundamental aspects of a VM (such as the memory map) must acquire the write
326 * lock, which also implies locking all of the vCPUs and waiting for all read
327 * lock holders to release. While it increases the cost and waiting time for
328 * those few operations, it allows most hot-path operations on the VM (which
329 * depend on its configuration remaining stable) to occur with minimal locking.
330 *
331 * Consumers of the Driver API (see below) are a special case when it comes to
332 * this locking, since they may hold a read lock via the drv_lease mechanism
333 * for an extended period of time. Rather than forcing those consumers to
334 * continuously poll for a write lock attempt, the lease system forces them to
335 * provide a release callback to trigger their clean-up (and potential later
336 * reacquisition) of the read lock.
337 */
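/*
 * In simplified terms (an illustrative summary of the logic found in
 * vmmdev_do_ioctl() below, not additional machinery):
 *
 *	- vCPU-scoped ioctls:	vcpu_lock_one(sc, vcpu) ... vcpu_unlock_one()
 *	- Read-only VM state:	vmm_read_lock(sc) ... vmm_read_unlock(sc)
 *	- VM-wide changes:	vmm_write_lock(sc) ... vmm_write_unlock(sc)
 */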
338
339 static void
340 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
341 {
342 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
343
344 /*
345 * Since this state transition is performed with from_idle=true, it
346 * should not fail, but rather block until it can succeed.
347 */
348 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
349 }
350
351 static void
352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
353 {
354 ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
355
356 VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
357 VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
358 }
359
360 static void
361 vmm_read_lock(vmm_softc_t *sc)
362 {
363 rw_enter(&sc->vmm_rwlock, RW_READER);
364 }
365
366 static void
367 vmm_read_unlock(vmm_softc_t *sc)
368 {
369 rw_exit(&sc->vmm_rwlock);
370 }
371
372 static void
373 vmm_write_lock(vmm_softc_t *sc)
374 {
375 int maxcpus;
376
377 /* First lock all the vCPUs */
378 maxcpus = vm_get_maxcpus(sc->vmm_vm);
379 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
380 vcpu_lock_one(sc, vcpu);
381 }
382
383 /*
384 * Block vmm_drv leases from being acquired or held while the VM write
385 * lock is held.
386 */
387 vmm_lease_block(sc);
388
389 rw_enter(&sc->vmm_rwlock, RW_WRITER);
390 /*
391 * For now, the 'maxcpus' value for an instance is fixed at the
392 * compile-time constant of VM_MAXCPU at creation. If this changes in
393 * the future, allowing for dynamic vCPU resource sizing, acquisition
394 * of the write lock will need to be wary of such changes.
395 */
396 VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
397 }
398
399 static void
400 vmm_write_unlock(vmm_softc_t *sc)
401 {
402 int maxcpus;
403
404 /* Allow vmm_drv leases to be acquired once write lock is dropped */
405 vmm_lease_unblock(sc);
406
407 /*
408 * The VM write lock _must_ be released from the same thread it was
409 * acquired in, unlike the read lock.
410 */
411 VERIFY(rw_write_held(&sc->vmm_rwlock));
412 rw_exit(&sc->vmm_rwlock);
413
414 /* Unlock all the vCPUs */
415 maxcpus = vm_get_maxcpus(sc->vmm_vm);
416 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
417 vcpu_unlock_one(sc, vcpu);
418 }
419 }
420
421 static int
422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
423 cred_t *credp, int *rvalp)
424 {
425 int error = 0, vcpu = -1;
426 void *datap = (void *)arg;
427 enum vm_lock_type {
428 LOCK_NONE = 0,
429 LOCK_VCPU,
430 LOCK_READ_HOLD,
431 LOCK_WRITE_HOLD
432 } lock_type = LOCK_NONE;
433
434 /* Acquire any exclusion resources needed for the operation. */
435 switch (cmd) {
436 case VM_RUN:
437 case VM_GET_REGISTER:
438 case VM_SET_REGISTER:
439 case VM_GET_SEGMENT_DESCRIPTOR:
440 case VM_SET_SEGMENT_DESCRIPTOR:
441 case VM_GET_REGISTER_SET:
442 case VM_SET_REGISTER_SET:
443 case VM_INJECT_EXCEPTION:
444 case VM_GET_CAPABILITY:
445 case VM_SET_CAPABILITY:
446 case VM_PPTDEV_MSI:
447 case VM_PPTDEV_MSIX:
448 case VM_SET_X2APIC_STATE:
449 case VM_GLA2GPA:
450 case VM_GLA2GPA_NOFAULT:
451 case VM_ACTIVATE_CPU:
452 case VM_SET_INTINFO:
453 case VM_GET_INTINFO:
454 case VM_RESTART_INSTRUCTION:
455 case VM_SET_KERNEMU_DEV:
456 case VM_GET_KERNEMU_DEV:
457 case VM_RESET_CPU:
458 case VM_GET_RUN_STATE:
459 case VM_SET_RUN_STATE:
460 case VM_GET_FPU:
461 case VM_SET_FPU:
462 case VM_GET_CPUID:
463 case VM_SET_CPUID:
464 case VM_LEGACY_CPUID:
465 /*
466 * Copy in the ID of the vCPU chosen for this operation.
467 * Since a nefarious caller could update their struct between
468 * this locking and when the rest of the ioctl data is copied
469 * in, it is _critical_ that this local 'vcpu' variable be used
470 * rather than the in-struct one when performing the ioctl.
471 */
472 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
473 return (EFAULT);
474 }
475 if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
476 return (EINVAL);
477 }
478 vcpu_lock_one(sc, vcpu);
479 lock_type = LOCK_VCPU;
480 break;
481
482 case VM_REINIT:
483 case VM_BIND_PPTDEV:
484 case VM_UNBIND_PPTDEV:
485 case VM_MAP_PPTDEV_MMIO:
486 case VM_UNMAP_PPTDEV_MMIO:
487 case VM_ALLOC_MEMSEG:
488 case VM_MMAP_MEMSEG:
489 case VM_MUNMAP_MEMSEG:
490 case VM_WRLOCK_CYCLE:
491 case VM_PMTMR_LOCATE:
492 case VM_PAUSE:
493 case VM_RESUME:
494 vmm_write_lock(sc);
495 lock_type = LOCK_WRITE_HOLD;
496 break;
497
498 case VM_GET_MEMSEG:
499 case VM_MMAP_GETNEXT:
500 case VM_LAPIC_IRQ:
501 case VM_INJECT_NMI:
502 case VM_IOAPIC_ASSERT_IRQ:
503 case VM_IOAPIC_DEASSERT_IRQ:
504 case VM_IOAPIC_PULSE_IRQ:
505 case VM_LAPIC_MSI:
506 case VM_LAPIC_LOCAL_IRQ:
507 case VM_GET_X2APIC_STATE:
508 case VM_RTC_READ:
509 case VM_RTC_WRITE:
510 case VM_RTC_SETTIME:
511 case VM_RTC_GETTIME:
512 case VM_PPTDEV_DISABLE_MSIX:
513 case VM_DEVMEM_GETOFFSET:
514 case VM_TRACK_DIRTY_PAGES:
515 case VM_NPT_OPERATION:
516 vmm_read_lock(sc);
517 lock_type = LOCK_READ_HOLD;
518 break;
519
520 case VM_DATA_READ:
521 case VM_DATA_WRITE:
522 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
523 return (EFAULT);
524 }
525 if (vcpu == -1) {
526 /* Access data for VM-wide devices */
527 vmm_write_lock(sc);
528 lock_type = LOCK_WRITE_HOLD;
529 } else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
530 /* Access data associated with a specific vCPU */
531 vcpu_lock_one(sc, vcpu);
532 lock_type = LOCK_VCPU;
533 } else {
534 return (EINVAL);
535 }
536 break;
537
538 case VM_GET_GPA_PMAP:
539 case VM_IOAPIC_PINCOUNT:
540 case VM_SUSPEND:
541 case VM_DESC_FPU_AREA:
542 case VM_SET_AUTODESTRUCT:
543 case VM_DESTROY_SELF:
544 case VM_DESTROY_PENDING:
545 case VM_VCPU_BARRIER:
546 default:
547 break;
548 }
549
550 /* Execute the primary logic for the ioctl. */
551 switch (cmd) {
552 case VM_RUN: {
553 struct vm_entry entry;
554
555 if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
556 error = EFAULT;
557 break;
558 }
559
560 if (!(curthread->t_schedflag & TS_VCPU))
561 smt_mark_as_vcpu();
562
563 error = vm_run(sc->vmm_vm, vcpu, &entry);
564
565 /*
566 * Unexpected states in vm_run() are expressed through positive
567 * errno-oriented return values. VM states which expect further
568 * processing in userspace (necessary context via exitinfo) are
569 * expressed through negative return values. For the time being
570 * a return value of 0 is not expected from vm_run().
571 */
572 ASSERT(error != 0);
573 if (error < 0) {
574 const struct vm_exit *vme;
575 void *outp = entry.exit_data;
576
577 error = 0;
578 vme = vm_exitinfo(sc->vmm_vm, vcpu);
579 if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
580 error = EFAULT;
581 }
582 }
583 break;
584 }
585 case VM_SUSPEND: {
586 struct vm_suspend vmsuspend;
587
588 if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
589 error = EFAULT;
590 break;
591 }
592 error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source);
593 break;
594 }
595 case VM_REINIT: {
596 struct vm_reinit reinit;
597
598 if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
599 error = EFAULT;
600 break;
601 }
602 if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
603 /*
604 * The VM instance should be free of driver-attached
605 * hooks during the reinitialization process.
606 */
607 break;
608 }
609 error = vm_reinit(sc->vmm_vm, reinit.flags);
610 (void) vmm_drv_block_hook(sc, B_FALSE);
611 break;
612 }
613 case VM_STAT_DESC: {
614 struct vm_stat_desc statdesc;
615
616 if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
617 error = EFAULT;
618 break;
619 }
620 error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
621 sizeof (statdesc.desc));
622 if (error == 0 &&
623 ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
624 error = EFAULT;
625 break;
626 }
627 break;
628 }
629 case VM_STATS_IOC: {
630 struct vm_stats vmstats;
631
632 if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
633 error = EFAULT;
634 break;
635 }
636 hrt2tv(gethrtime(), &vmstats.tv);
637 error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
638 nitems(vmstats.statbuf),
639 &vmstats.num_entries, vmstats.statbuf);
640 if (error == 0 &&
641 ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
642 error = EFAULT;
643 break;
644 }
645 break;
646 }
647
648 case VM_PPTDEV_MSI: {
649 struct vm_pptdev_msi pptmsi;
650
651 if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
652 error = EFAULT;
653 break;
654 }
655 error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
656 pptmsi.addr, pptmsi.msg, pptmsi.numvec);
657 break;
658 }
659 case VM_PPTDEV_MSIX: {
660 struct vm_pptdev_msix pptmsix;
661
662 if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
663 error = EFAULT;
664 break;
665 }
666 error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
667 pptmsix.idx, pptmsix.addr, pptmsix.msg,
668 pptmsix.vector_control);
669 break;
670 }
671 case VM_PPTDEV_DISABLE_MSIX: {
672 struct vm_pptdev pptdev;
673
674 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
675 error = EFAULT;
676 break;
677 }
678 error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
679 break;
680 }
681 case VM_MAP_PPTDEV_MMIO: {
682 struct vm_pptdev_mmio pptmmio;
683
684 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
685 error = EFAULT;
686 break;
687 }
688 error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
689 pptmmio.len, pptmmio.hpa);
690 break;
691 }
692 case VM_UNMAP_PPTDEV_MMIO: {
693 struct vm_pptdev_mmio pptmmio;
694
695 if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
696 error = EFAULT;
697 break;
698 }
699 error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
700 pptmmio.len);
701 break;
702 }
703 case VM_BIND_PPTDEV: {
704 struct vm_pptdev pptdev;
705
706 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
707 error = EFAULT;
708 break;
709 }
710 error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
711 break;
712 }
713 case VM_UNBIND_PPTDEV: {
714 struct vm_pptdev pptdev;
715
716 if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
717 error = EFAULT;
718 break;
719 }
720 error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
721 break;
722 }
723 case VM_GET_PPTDEV_LIMITS: {
724 struct vm_pptdev_limits pptlimits;
725
726 if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
727 error = EFAULT;
728 break;
729 }
730 error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
731 &pptlimits.msi_limit, &pptlimits.msix_limit);
732 if (error == 0 &&
733 ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
734 error = EFAULT;
735 break;
736 }
737 break;
738 }
739 case VM_INJECT_EXCEPTION: {
740 struct vm_exception vmexc;
741 if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
742 error = EFAULT;
743 break;
744 }
745 error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
746 vmexc.error_code_valid != 0, vmexc.error_code,
747 vmexc.restart_instruction != 0);
748 break;
749 }
750 case VM_INJECT_NMI: {
751 struct vm_nmi vmnmi;
752
753 if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
754 error = EFAULT;
755 break;
756 }
757 error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
758 break;
759 }
760 case VM_LAPIC_IRQ: {
761 struct vm_lapic_irq vmirq;
762
763 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
764 error = EFAULT;
765 break;
766 }
767 error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
768 break;
769 }
770 case VM_LAPIC_LOCAL_IRQ: {
771 struct vm_lapic_irq vmirq;
772
773 if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
774 error = EFAULT;
775 break;
776 }
777 error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
778 vmirq.vector);
779 break;
780 }
781 case VM_LAPIC_MSI: {
782 struct vm_lapic_msi vmmsi;
783
784 if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
785 error = EFAULT;
786 break;
787 }
788 error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
789 break;
790 }
791
792 case VM_IOAPIC_ASSERT_IRQ: {
793 struct vm_ioapic_irq ioapic_irq;
794
795 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
796 error = EFAULT;
797 break;
798 }
799 error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
800 break;
801 }
802 case VM_IOAPIC_DEASSERT_IRQ: {
803 struct vm_ioapic_irq ioapic_irq;
804
805 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
806 error = EFAULT;
807 break;
808 }
809 error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
810 break;
811 }
812 case VM_IOAPIC_PULSE_IRQ: {
813 struct vm_ioapic_irq ioapic_irq;
814
815 if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
816 error = EFAULT;
817 break;
818 }
819 error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
820 break;
821 }
822 case VM_IOAPIC_PINCOUNT: {
823 int pincount;
824
825 pincount = vioapic_pincount(sc->vmm_vm);
826 if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
827 error = EFAULT;
828 break;
829 }
830 break;
831 }
832 case VM_DESC_FPU_AREA: {
833 struct vm_fpu_desc desc;
834 void *buf = NULL;
835
836 if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
837 error = EFAULT;
838 break;
839 }
840 if (desc.vfd_num_entries > 64) {
841 error = EINVAL;
842 break;
843 }
844 const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
845 desc.vfd_num_entries;
846 if (buf_sz != 0) {
847 buf = kmem_zalloc(buf_sz, KM_SLEEP);
848 }
849
850 /*
851 * For now, we are depending on vm_fpu_desc_entry and
852 * hma_xsave_state_desc_t having the same format.
853 */
854 CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
855 sizeof (hma_xsave_state_desc_t));
856
857 size_t req_size;
858 const uint_t max_entries = hma_fpu_describe_xsave_state(
859 (hma_xsave_state_desc_t *)buf,
860 desc.vfd_num_entries,
861 &req_size);
862
863 desc.vfd_req_size = req_size;
864 desc.vfd_num_entries = max_entries;
865 if (buf_sz != 0) {
866 if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
867 error = EFAULT;
868 }
869 kmem_free(buf, buf_sz);
870 }
871
872 if (error == 0) {
873 if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
874 error = EFAULT;
875 }
876 }
877 break;
878 }
879 case VM_SET_AUTODESTRUCT: {
880 /*
881 * Since this has to do with controlling the lifetime of the
882 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
883 * than the vcpu-centric or rwlock exclusion mechanisms.
884 */
885 mutex_enter(&vmm_mtx);
886 if (arg != 0) {
887 sc->vmm_flags |= VMM_AUTODESTROY;
888 } else {
889 sc->vmm_flags &= ~VMM_AUTODESTROY;
890 }
891 mutex_exit(&vmm_mtx);
892 break;
893 }
894 case VM_DESTROY_SELF: {
895 bool hma_release = false;
896
897 /*
898 * Just like VMM_DESTROY_VM, but on the instance file descriptor
899 * itself, rather than having to perform a racy name lookup as
900 * part of the destroy process.
901 *
902 * Since vmm_destroy_locked() performs vCPU lock acquisition in
903 * order to kick the vCPUs out of guest context as part of any
904 * destruction, we do not need to worry about doing so ourselves
905 * via the `lock_type` logic here.
906 */
907 mutex_enter(&vmm_mtx);
908 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
909 mutex_exit(&vmm_mtx);
910 if (hma_release) {
911 vmm_hma_release();
912 }
913 break;
914 }
915 case VM_DESTROY_PENDING: {
916 /*
917 * If we have made it this far, then destruction of the instance
918 * has not been initiated.
919 */
920 *rvalp = 0;
921 break;
922 }
923
924 case VM_ISA_ASSERT_IRQ: {
925 struct vm_isa_irq isa_irq;
926
927 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
928 error = EFAULT;
929 break;
930 }
931 error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
932 if (error == 0 && isa_irq.ioapic_irq != -1) {
933 error = vioapic_assert_irq(sc->vmm_vm,
934 isa_irq.ioapic_irq);
935 }
936 break;
937 }
938 case VM_ISA_DEASSERT_IRQ: {
939 struct vm_isa_irq isa_irq;
940
941 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
942 error = EFAULT;
943 break;
944 }
945 error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
946 if (error == 0 && isa_irq.ioapic_irq != -1) {
947 error = vioapic_deassert_irq(sc->vmm_vm,
948 isa_irq.ioapic_irq);
949 }
950 break;
951 }
952 case VM_ISA_PULSE_IRQ: {
953 struct vm_isa_irq isa_irq;
954
955 if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
956 error = EFAULT;
957 break;
958 }
959 error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
960 if (error == 0 && isa_irq.ioapic_irq != -1) {
961 error = vioapic_pulse_irq(sc->vmm_vm,
962 isa_irq.ioapic_irq);
963 }
964 break;
965 }
966 case VM_ISA_SET_IRQ_TRIGGER: {
967 struct vm_isa_irq_trigger isa_irq_trigger;
968
969 if (ddi_copyin(datap, &isa_irq_trigger,
970 sizeof (isa_irq_trigger), md)) {
971 error = EFAULT;
972 break;
973 }
974 error = vatpic_set_irq_trigger(sc->vmm_vm,
975 isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
976 break;
977 }
978
979 case VM_MMAP_GETNEXT: {
980 struct vm_memmap mm;
981
982 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
983 error = EFAULT;
984 break;
985 }
986 error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
987 &mm.segoff, &mm.len, &mm.prot, &mm.flags);
988 if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
989 error = EFAULT;
990 break;
991 }
992 break;
993 }
994 case VM_MMAP_MEMSEG: {
995 struct vm_memmap mm;
996
997 if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
998 error = EFAULT;
999 break;
1000 }
1001 error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
1002 mm.len, mm.prot, mm.flags);
1003 break;
1004 }
1005 case VM_MUNMAP_MEMSEG: {
1006 struct vm_munmap mu;
1007
1008 if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1009 error = EFAULT;
1010 break;
1011 }
1012 error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1013 break;
1014 }
1015 case VM_ALLOC_MEMSEG: {
1016 struct vm_memseg vmseg;
1017
1018 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1019 error = EFAULT;
1020 break;
1021 }
1022 error = vmmdev_alloc_memseg(sc, &vmseg);
1023 break;
1024 }
1025 case VM_GET_MEMSEG: {
1026 struct vm_memseg vmseg;
1027
1028 if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1029 error = EFAULT;
1030 break;
1031 }
1032 error = vmmdev_get_memseg(sc, &vmseg);
1033 if (error == 0 &&
1034 ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1035 error = EFAULT;
1036 break;
1037 }
1038 break;
1039 }
1040 case VM_GET_REGISTER: {
1041 struct vm_register vmreg;
1042
1043 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1044 error = EFAULT;
1045 break;
1046 }
1047 error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1048 &vmreg.regval);
1049 if (error == 0 &&
1050 ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1051 error = EFAULT;
1052 break;
1053 }
1054 break;
1055 }
1056 case VM_SET_REGISTER: {
1057 struct vm_register vmreg;
1058
1059 if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1060 error = EFAULT;
1061 break;
1062 }
1063 error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1064 vmreg.regval);
1065 break;
1066 }
1067 case VM_SET_SEGMENT_DESCRIPTOR: {
1068 struct vm_seg_desc vmsegd;
1069
1070 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1071 error = EFAULT;
1072 break;
1073 }
1074 error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1075 &vmsegd.desc);
1076 break;
1077 }
1078 case VM_GET_SEGMENT_DESCRIPTOR: {
1079 struct vm_seg_desc vmsegd;
1080
1081 if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1082 error = EFAULT;
1083 break;
1084 }
1085 error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1086 &vmsegd.desc);
1087 if (error == 0 &&
1088 ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1089 error = EFAULT;
1090 break;
1091 }
1092 break;
1093 }
1094 case VM_GET_REGISTER_SET: {
1095 struct vm_register_set vrs;
1096 int regnums[VM_REG_LAST];
1097 uint64_t regvals[VM_REG_LAST];
1098
1099 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1100 error = EFAULT;
1101 break;
1102 }
1103 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1104 error = EINVAL;
1105 break;
1106 }
1107 if (ddi_copyin(vrs.regnums, regnums,
1108 sizeof (int) * vrs.count, md)) {
1109 error = EFAULT;
1110 break;
1111 }
1112
1113 error = 0;
1114 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1115 if (regnums[i] < 0) {
1116 error = EINVAL;
1117 break;
1118 }
1119 error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1120 &regvals[i]);
1121 }
1122 if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1123 sizeof (uint64_t) * vrs.count, md)) {
1124 error = EFAULT;
1125 }
1126 break;
1127 }
1128 case VM_SET_REGISTER_SET: {
1129 struct vm_register_set vrs;
1130 int regnums[VM_REG_LAST];
1131 uint64_t regvals[VM_REG_LAST];
1132
1133 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1134 error = EFAULT;
1135 break;
1136 }
1137 if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1138 error = EINVAL;
1139 break;
1140 }
1141 if (ddi_copyin(vrs.regnums, regnums,
1142 sizeof (int) * vrs.count, md)) {
1143 error = EFAULT;
1144 break;
1145 }
1146 if (ddi_copyin(vrs.regvals, regvals,
1147 sizeof (uint64_t) * vrs.count, md)) {
1148 error = EFAULT;
1149 break;
1150 }
1151
1152 error = 0;
1153 for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1154 /*
1155 * Setting registers in a set is not atomic, since a
1156 * failure in the middle of the set will cause a
1157 * bail-out and inconsistent register state. Callers
1158 * should be wary of this.
1159 */
1160 if (regnums[i] < 0) {
1161 error = EINVAL;
1162 break;
1163 }
1164 error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1165 regvals[i]);
1166 }
1167 break;
1168 }
1169 case VM_RESET_CPU: {
1170 struct vm_vcpu_reset vvr;
1171
1172 if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1173 error = EFAULT;
1174 break;
1175 }
1176 if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1177 error = EINVAL;
break;
1178 }
1179
1180 error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1181 break;
1182 }
1183 case VM_GET_RUN_STATE: {
1184 struct vm_run_state vrs;
1185
1186 bzero(&vrs, sizeof (vrs));
1187 error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1188 &vrs.sipi_vector);
1189 if (error == 0) {
1190 if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1191 error = EFAULT;
1192 break;
1193 }
1194 }
1195 break;
1196 }
1197 case VM_SET_RUN_STATE: {
1198 struct vm_run_state vrs;
1199
1200 if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1201 error = EFAULT;
1202 break;
1203 }
1204 error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1205 vrs.sipi_vector);
1206 break;
1207 }
1208 case VM_GET_FPU: {
1209 struct vm_fpu_state req;
1210 const size_t max_len = (PAGESIZE * 2);
1211 void *kbuf;
1212
1213 if (ddi_copyin(datap, &req, sizeof (req), md)) {
1214 error = EFAULT;
1215 break;
1216 }
1217 if (req.len > max_len || req.len == 0) {
1218 error = EINVAL;
1219 break;
1220 }
1221 kbuf = kmem_zalloc(req.len, KM_SLEEP);
1222 error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1223 if (error == 0) {
1224 if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1225 error = EFAULT;
1226 }
1227 }
1228 kmem_free(kbuf, req.len);
1229 break;
1230 }
1231 case VM_SET_FPU: {
1232 struct vm_fpu_state req;
1233 const size_t max_len = (PAGESIZE * 2);
1234 void *kbuf;
1235
1236 if (ddi_copyin(datap, &req, sizeof (req), md)) {
1237 error = EFAULT;
1238 break;
1239 }
1240 if (req.len > max_len || req.len == 0) {
1241 error = EINVAL;
1242 break;
1243 }
1244 kbuf = kmem_alloc(req.len, KM_SLEEP);
1245 if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1246 error = EFAULT;
1247 } else {
1248 error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1249 }
1250 kmem_free(kbuf, req.len);
1251 break;
1252 }
1253 case VM_GET_CPUID: {
1254 struct vm_vcpu_cpuid_config cfg;
1255 struct vcpu_cpuid_entry *entries = NULL;
1256
1257 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1258 error = EFAULT;
1259 break;
1260 }
1261 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1262 error = EINVAL;
1263 break;
1264 }
1265
1266 const size_t entries_size =
1267 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1268 if (entries_size != 0) {
1269 entries = kmem_zalloc(entries_size, KM_SLEEP);
1270 }
1271
1272 vcpu_cpuid_config_t vm_cfg = {
1273 .vcc_nent = cfg.vvcc_nent,
1274 .vcc_entries = entries,
1275 };
1276 error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1277
1278 /*
1279 * Only attempt to copy out the resultant entries if we were
1280 * able to query them from the instance. The flags and number
1281 * of entries are emitted regardless.
1282 */
1283 cfg.vvcc_flags = vm_cfg.vcc_flags;
1284 cfg.vvcc_nent = vm_cfg.vcc_nent;
1285 if (entries != NULL) {
1286 if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1287 entries_size, md) != 0) {
1288 error = EFAULT;
1289 }
1290
1291 kmem_free(entries, entries_size);
1292 }
1293
1294 if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1295 error = EFAULT;
1296 }
1297 break;
1298 }
1299 case VM_SET_CPUID: {
1300 struct vm_vcpu_cpuid_config cfg;
1301 struct vcpu_cpuid_entry *entries = NULL;
1302 size_t entries_size = 0;
1303
1304 if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1305 error = EFAULT;
1306 break;
1307 }
1308 if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1309 error = EFBIG;
1310 break;
1311 }
1312 if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1313 /*
1314 * If we are being instructed to use "legacy" handling,
1315 * then no entries should be provided, since the static
1316 * in-kernel masking will be used.
1317 */
1318 if (cfg.vvcc_nent != 0) {
1319 error = EINVAL;
1320 break;
1321 }
1322 } else if (cfg.vvcc_nent != 0) {
1323 entries_size =
1324 cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1325 entries = kmem_alloc(entries_size, KM_SLEEP);
1326
1327 if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1328 md) != 0) {
1329 error = EFAULT;
1330 kmem_free(entries, entries_size);
1331 break;
1332 }
1333 }
1334
1335 vcpu_cpuid_config_t vm_cfg = {
1336 .vcc_flags = cfg.vvcc_flags,
1337 .vcc_nent = cfg.vvcc_nent,
1338 .vcc_entries = entries,
1339 };
1340 error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1341
1342 if (entries != NULL) {
1343 kmem_free(entries, entries_size);
1344 }
1345 break;
1346 }
1347 case VM_LEGACY_CPUID: {
1348 struct vm_legacy_cpuid vlc;
1349 if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1350 error = EFAULT;
1351 break;
1352 }
1353 vlc.vlc_vcpuid = vcpu;
1354
1355 legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1356 &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1357
1358 if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1359 error = EFAULT;
1360 break;
1361 }
1362 break;
1363 }
1364
1365 case VM_SET_KERNEMU_DEV:
1366 case VM_GET_KERNEMU_DEV: {
1367 struct vm_readwrite_kernemu_device kemu;
1368 size_t size = 0;
1369
1370 if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1371 error = EFAULT;
1372 break;
1373 }
1374
1375 if (kemu.access_width > 3) {
1376 error = EINVAL;
1377 break;
1378 }
1379 size = (1 << kemu.access_width);
1380 ASSERT(size >= 1 && size <= 8);
1381
1382 if (cmd == VM_SET_KERNEMU_DEV) {
1383 error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1384 kemu.gpa, kemu.value, size);
1385 } else {
1386 error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1387 kemu.gpa, &kemu.value, size);
1388 }
1389
1390 if (error == 0) {
1391 if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1392 error = EFAULT;
1393 break;
1394 }
1395 }
1396 break;
1397 }
1398
1399 case VM_GET_CAPABILITY: {
1400 struct vm_capability vmcap;
1401
1402 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1403 error = EFAULT;
1404 break;
1405 }
1406 error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1407 &vmcap.capval);
1408 if (error == 0 &&
1409 ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1410 error = EFAULT;
1411 break;
1412 }
1413 break;
1414 }
1415 case VM_SET_CAPABILITY: {
1416 struct vm_capability vmcap;
1417
1418 if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1419 error = EFAULT;
1420 break;
1421 }
1422 error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1423 vmcap.capval);
1424 break;
1425 }
1426 case VM_SET_X2APIC_STATE: {
1427 struct vm_x2apic x2apic;
1428
1429 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1430 error = EFAULT;
1431 break;
1432 }
1433 error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1434 break;
1435 }
1436 case VM_GET_X2APIC_STATE: {
1437 struct vm_x2apic x2apic;
1438
1439 if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1440 error = EFAULT;
1441 break;
1442 }
1443 error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1444 &x2apic.state);
1445 if (error == 0 &&
1446 ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1447 error = EFAULT;
1448 break;
1449 }
1450 break;
1451 }
1452 case VM_GET_GPA_PMAP: {
1453 /*
1454 * Until there is a necessity to leak EPT/RVI PTE values to
1455 * userspace, this will remain unimplemented.
1456 */
1457 error = EINVAL;
1458 break;
1459 }
1460 case VM_GET_HPET_CAPABILITIES: {
1461 struct vm_hpet_cap hpetcap;
1462
1463 error = vhpet_getcap(&hpetcap);
1464 if (error == 0 &&
1465 ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1466 error = EFAULT;
1467 break;
1468 }
1469 break;
1470 }
1471 case VM_GLA2GPA: {
1472 struct vm_gla2gpa gg;
1473
1474 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1475 error = EFAULT;
1476 break;
1477 }
1478 gg.vcpuid = vcpu;
1479 error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1480 gg.prot, &gg.gpa, &gg.fault);
1481 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1482 error = EFAULT;
1483 break;
1484 }
1485 break;
1486 }
1487 case VM_GLA2GPA_NOFAULT: {
1488 struct vm_gla2gpa gg;
1489
1490 if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1491 error = EFAULT;
1492 break;
1493 }
1494 gg.vcpuid = vcpu;
1495 error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1496 gg.gla, gg.prot, &gg.gpa, &gg.fault);
1497 if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1498 error = EFAULT;
1499 break;
1500 }
1501 break;
1502 }
1503
1504 case VM_ACTIVATE_CPU:
1505 error = vm_activate_cpu(sc->vmm_vm, vcpu);
1506 break;
1507
1508 case VM_SUSPEND_CPU:
1509 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1510 error = EFAULT;
1511 } else {
1512 error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1513 }
1514 break;
1515
1516 case VM_RESUME_CPU:
1517 if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1518 error = EFAULT;
1519 } else {
1520 error = vm_resume_cpu(sc->vmm_vm, vcpu);
1521 }
1522 break;
1523
1524 case VM_VCPU_BARRIER:
1525 vcpu = arg;
1526 error = vm_vcpu_barrier(sc->vmm_vm, vcpu);
1527 break;
1528
1529 case VM_GET_CPUS: {
1530 struct vm_cpuset vm_cpuset;
1531 cpuset_t tempset;
1532 void *srcp = &tempset;
1533 int size;
1534
1535 if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1536 error = EFAULT;
1537 break;
1538 }
1539
1540 /* Be more generous about sizing since our cpuset_t is large. */
1541 size = vm_cpuset.cpusetsize;
1542 if (size <= 0 || size > sizeof (cpuset_t)) {
1543 error = ERANGE;
1544 }
1545 /*
1546 * If they want a ulong_t or less, make sure they receive the
1547 * low bits with all the useful information.
1548 */
1549 if (size <= sizeof (tempset.cpub[0])) {
1550 srcp = &tempset.cpub[0];
1551 }
1552
1553 if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1554 tempset = vm_active_cpus(sc->vmm_vm);
1555 } else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1556 tempset = vm_debug_cpus(sc->vmm_vm);
1557 } else {
1558 error = EINVAL;
1559 }
1560
1561 ASSERT(size > 0 && size <= sizeof (tempset));
1562 if (error == 0 &&
1563 ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1564 error = EFAULT;
1565 break;
1566 }
1567 break;
1568 }
1569 case VM_SET_INTINFO: {
1570 struct vm_intinfo vmii;
1571
1572 if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1573 error = EFAULT;
1574 break;
1575 }
1576 error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1577 break;
1578 }
1579 case VM_GET_INTINFO: {
1580 struct vm_intinfo vmii;
1581
1582 vmii.vcpuid = vcpu;
1583 error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1584 &vmii.info2);
1585 if (error == 0 &&
1586 ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1587 error = EFAULT;
1588 break;
1589 }
1590 break;
1591 }
1592 case VM_RTC_WRITE: {
1593 struct vm_rtc_data rtcdata;
1594
1595 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1596 error = EFAULT;
1597 break;
1598 }
1599 error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1600 rtcdata.value);
1601 break;
1602 }
1603 case VM_RTC_READ: {
1604 struct vm_rtc_data rtcdata;
1605
1606 if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1607 error = EFAULT;
1608 break;
1609 }
1610 error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1611 &rtcdata.value);
1612 if (error == 0 &&
1613 ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1614 error = EFAULT;
1615 break;
1616 }
1617 break;
1618 }
1619 case VM_RTC_SETTIME: {
1620 timespec_t ts;
1621
1622 if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1623 error = EFAULT;
1624 break;
1625 }
1626 error = vrtc_set_time(sc->vmm_vm, &ts);
1627 break;
1628 }
1629 case VM_RTC_GETTIME: {
1630 timespec_t ts;
1631
1632 vrtc_get_time(sc->vmm_vm, &ts);
1633 if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1634 error = EFAULT;
1635 break;
1636 }
1637 break;
1638 }
1639
1640 case VM_PMTMR_LOCATE: {
1641 uint16_t port = arg;
1642 error = vpmtmr_set_location(sc->vmm_vm, port);
1643 break;
1644 }
1645
1646 case VM_RESTART_INSTRUCTION:
1647 error = vm_restart_instruction(sc->vmm_vm, vcpu);
1648 break;
1649
1650 case VM_SET_TOPOLOGY: {
1651 struct vm_cpu_topology topo;
1652
1653 if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1654 error = EFAULT;
1655 break;
1656 }
1657 error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1658 topo.threads, topo.maxcpus);
1659 break;
1660 }
1661 case VM_GET_TOPOLOGY: {
1662 struct vm_cpu_topology topo;
1663
1664 vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1665 &topo.threads, &topo.maxcpus);
1666 if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1667 error = EFAULT;
1668 break;
1669 }
1670 break;
1671 }
1672 case VM_DEVMEM_GETOFFSET: {
1673 struct vm_devmem_offset vdo;
1674 vmm_devmem_entry_t *de;
1675
1676 if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1677 error = EFAULT;
1678 break;
1679 }
1680
1681 de = vmmdev_devmem_find(sc, vdo.segid);
1682 if (de != NULL) {
1683 vdo.offset = de->vde_off;
1684 if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1685 error = EFAULT;
1686 }
1687 } else {
1688 error = ENOENT;
1689 }
1690 break;
1691 }
1692 case VM_TRACK_DIRTY_PAGES: {
1693 const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1694 struct vmm_dirty_tracker tracker;
1695 uint8_t *bitmap;
1696 size_t len;
1697
1698 if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1699 error = EFAULT;
1700 break;
1701 }
1702 if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1703 error = EINVAL;
1704 break;
1705 }
1706 if (tracker.vdt_len == 0) {
1707 break;
1708 }
1709 if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1710 error = EINVAL;
1711 break;
1712 }
1713 if (tracker.vdt_len > max_track_region_len) {
1714 error = EINVAL;
1715 break;
1716 }
1717 len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1718 bitmap = kmem_zalloc(len, KM_SLEEP);
1719 error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1720 tracker.vdt_len, bitmap);
1721 if (error == 0 &&
1722 ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1723 error = EFAULT;
1724 }
1725 kmem_free(bitmap, len);
1726
1727 break;
1728 }
1729 case VM_NPT_OPERATION: {
1730 struct vm_npt_operation vno;
1731 uint8_t *bitmap = NULL;
1732 uint64_t bitmap_size = 0;
1733
1734 if (ddi_copyin(datap, &vno, sizeof (vno), md) != 0) {
1735 error = EFAULT;
1736 break;
1737 }
1738 if ((vno.vno_gpa & PAGEOFFSET) != 0 ||
1739 (vno.vno_len & PAGEOFFSET) != 0) {
1740 error = EINVAL;
1741 break;
1742 }
1743 if ((UINT64_MAX - vno.vno_len) < vno.vno_gpa) {
1744 error = EOVERFLOW;
1745 break;
1746 }
1747
1748 /*
1749 * Allocate a bitmap for the operation if it is specified as
1750 * part of the input or output.
1751 */
1752 if ((vno.vno_operation &
1753 (VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT)) != 0) {
1754 /*
1755 * Operations expecting data to be copied in or out
1756 * should not have zero length.
1757 */
1758 if (vno.vno_len == 0) {
1759 error = EINVAL;
1760 break;
1761 }
1762
1763 /*
1764 * Maximum bitmap size of 8 pages results in 1 GiB of
1765 * coverage.
1766 */
1767 const uint64_t max_bitmap_size = 8 * PAGESIZE;
1768
1769 bitmap_size = roundup(vno.vno_len / PAGESIZE, 8) / 8;
1770 if (bitmap_size > max_bitmap_size) {
1771 error = E2BIG;
1772 break;
1773 }
1774 bitmap = kmem_zalloc(bitmap_size, KM_SLEEP);
1775 }
1776
1777 if ((vno.vno_operation & VNO_FLAG_BITMAP_IN) != 0) {
1778 ASSERT(bitmap != NULL);
1779 if (ddi_copyin(vno.vno_bitmap, bitmap, bitmap_size,
1780 md) != 0) {
1781 error = EFAULT;
1782 }
1783 }
1784
1785 if (error == 0) {
1786 error = vm_npt_do_operation(sc->vmm_vm, vno.vno_gpa,
1787 vno.vno_len, vno.vno_operation, bitmap, rvalp);
1788 }
1789
1790 if ((vno.vno_operation & VNO_FLAG_BITMAP_OUT) != 0 &&
1791 error == 0) {
1792 ASSERT(bitmap != NULL);
1793 if (ddi_copyout(bitmap, vno.vno_bitmap, bitmap_size,
1794 md) != 0) {
1795 error = EFAULT;
1796 }
1797 }
1798
1799 if (bitmap != NULL) {
1800 kmem_free(bitmap, bitmap_size);
1801 }
1802
1803 break;
1804 }
1805 case VM_WRLOCK_CYCLE: {
1806 /*
1807 * Present a test mechanism to acquire/release the write lock
1808 * on the VM without any other effects.
1809 */
1810 break;
1811 }
1812 case VM_DATA_READ: {
1813 struct vm_data_xfer vdx;
1814
1815 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1816 error = EFAULT;
1817 break;
1818 }
1819 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1820 error = EINVAL;
1821 break;
1822 }
1823 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1824 error = EFBIG;
1825 break;
1826 }
1827
1828 const size_t len = vdx.vdx_len;
1829 void *buf = NULL;
1830 if (len != 0) {
1831 const void *udata = vdx.vdx_data;
1832
1833 buf = kmem_alloc(len, KM_SLEEP);
1834 if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1835 bzero(buf, len);
1836 } else if (ddi_copyin(udata, buf, len, md) != 0) {
1837 kmem_free(buf, len);
1838 error = EFAULT;
1839 break;
1840 }
1841 }
1842
1843 vdx.vdx_result_len = 0;
1844 vmm_data_req_t req = {
1845 .vdr_class = vdx.vdx_class,
1846 .vdr_version = vdx.vdx_version,
1847 .vdr_flags = vdx.vdx_flags,
1848 .vdr_len = len,
1849 .vdr_data = buf,
1850 .vdr_result_len = &vdx.vdx_result_len,
1851 .vdr_vcpuid = vdx.vdx_vcpuid,
1852 };
1853 error = vmm_data_read(sc->vmm_vm, &req);
1854
1855 if (error == 0 && buf != NULL) {
1856 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1857 error = EFAULT;
1858 }
1859 }
1860
1861 /*
1862 * Copy out the transfer request so that the value of
1863 * vdx_result_len can be made available, regardless of any
1864 * error(s) which may have occurred.
1865 */
1866 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1867 error = (error != 0) ? error : EFAULT;
1868 }
1869
1870 if (buf != NULL) {
1871 kmem_free(buf, len);
1872 }
1873 break;
1874 }
1875 case VM_DATA_WRITE: {
1876 struct vm_data_xfer vdx;
1877
1878 if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1879 error = EFAULT;
1880 break;
1881 }
1882 if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1883 error = EINVAL;
1884 break;
1885 }
1886 if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1887 error = EFBIG;
1888 break;
1889 }
1890
1891 const size_t len = vdx.vdx_len;
1892 void *buf = NULL;
1893 if (len != 0) {
1894 buf = kmem_alloc(len, KM_SLEEP);
1895 if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1896 kmem_free(buf, len);
1897 error = EFAULT;
1898 break;
1899 }
1900 }
1901
1902 vdx.vdx_result_len = 0;
1903 vmm_data_req_t req = {
1904 .vdr_class = vdx.vdx_class,
1905 .vdr_version = vdx.vdx_version,
1906 .vdr_flags = vdx.vdx_flags,
1907 .vdr_len = len,
1908 .vdr_data = buf,
1909 .vdr_result_len = &vdx.vdx_result_len,
1910 .vdr_vcpuid = vdx.vdx_vcpuid,
1911 };
1912 if (vmm_allow_state_writes != 0) {
1913 error = vmm_data_write(sc->vmm_vm, &req);
1914 } else {
1915 /*
1916 * Reject the write if someone has thrown the switch back
1917 * into the "disallow" position.
1918 */
1919 error = EPERM;
1920 }
1921
1922 if (error == 0 && buf != NULL &&
1923 (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1924 if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1925 error = EFAULT;
1926 }
1927 }
1928
1929 /*
1930 * Copy out the transfer request so that the value of
1931 * vdx_result_len can be made available, regardless of any
1932 * error(s) which may have occurred.
1933 */
1934 if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1935 error = (error != 0) ? error : EFAULT;
1936 }
1937
1938 if (buf != NULL) {
1939 kmem_free(buf, len);
1940 }
1941 break;
1942 }
1943
1944 case VM_PAUSE: {
1945 error = vm_pause_instance(sc->vmm_vm);
1946 break;
1947 }
1948 case VM_RESUME: {
1949 error = vm_resume_instance(sc->vmm_vm);
1950 break;
1951 }
1952
1953 default:
1954 error = ENOTTY;
1955 break;
1956 }
1957
1958 /* Release exclusion resources */
1959 switch (lock_type) {
1960 case LOCK_NONE:
1961 break;
1962 case LOCK_VCPU:
1963 vcpu_unlock_one(sc, vcpu);
1964 break;
1965 case LOCK_READ_HOLD:
1966 vmm_read_unlock(sc);
1967 break;
1968 case LOCK_WRITE_HOLD:
1969 vmm_write_unlock(sc);
1970 break;
1971 default:
1972 panic("unexpected lock type");
1973 break;
1974 }
1975
1976 return (error);
1977 }
1978
1979 static vmm_softc_t *
1980 vmm_lookup(const char *name)
1981 {
1982 list_t *vml = &vmm_list;
1983 vmm_softc_t *sc;
1984
1985 ASSERT(MUTEX_HELD(&vmm_mtx));
1986
1987 for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1988 if (strcmp(sc->vmm_name, name) == 0) {
1989 break;
1990 }
1991 }
1992
1993 return (sc);
1994 }
1995
1996 /*
1997 * Acquire an HMA registration if not already held.
1998 */
1999 static boolean_t
2000 vmm_hma_acquire(void)
2001 {
2002 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2003
2004 mutex_enter(&vmmdev_mtx);
2005
2006 if (vmmdev_hma_reg == NULL) {
2007 VERIFY3U(vmmdev_hma_ref, ==, 0);
2008 vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
2009 if (vmmdev_hma_reg == NULL) {
2010 cmn_err(CE_WARN, "%s HMA registration failed.",
2011 vmmdev_hvm_name);
2012 mutex_exit(&vmmdev_mtx);
2013 return (B_FALSE);
2014 }
2015 }
2016
2017 vmmdev_hma_ref++;
2018
2019 mutex_exit(&vmmdev_mtx);
2020
2021 return (B_TRUE);
2022 }
2023
2024 /*
2025 * Release the HMA registration if held and there are no remaining VMs.
2026 */
2027 static void
2028 vmm_hma_release(void)
2029 {
2030 ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
2031
2032 mutex_enter(&vmmdev_mtx);
2033
2034 VERIFY3U(vmmdev_hma_ref, !=, 0);
2035
2036 vmmdev_hma_ref--;
2037
2038 if (vmmdev_hma_ref == 0) {
2039 VERIFY(vmmdev_hma_reg != NULL);
2040 hma_unregister(vmmdev_hma_reg);
2041 vmmdev_hma_reg = NULL;
2042 }
2043 mutex_exit(&vmmdev_mtx);
2044 }
2045
2046 static int
2047 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
2048 {
2049 vmm_softc_t *sc = NULL;
2050 minor_t minor;
2051 int error = ENOMEM;
2052 size_t len;
2053 const char *name = req->name;
2054
2055 len = strnlen(name, VM_MAX_NAMELEN);
2056 if (len == 0) {
2057 return (EINVAL);
2058 }
2059 if (len >= VM_MAX_NAMELEN) {
2060 return (ENAMETOOLONG);
2061 }
2062 if (strchr(name, '/') != NULL) {
2063 return (EINVAL);
2064 }
2065
2066 if (!vmm_hma_acquire())
2067 return (ENXIO);
2068
2069 mutex_enter(&vmm_mtx);
2070
2071 /* Look for duplicate names */
2072 if (vmm_lookup(name) != NULL) {
2073 mutex_exit(&vmm_mtx);
2074 vmm_hma_release();
2075 return (EEXIST);
2076 }
2077
2078 /* Allow only one instance per non-global zone. */
2079 if (!INGLOBALZONE(curproc)) {
2080 for (sc = list_head(&vmm_list); sc != NULL;
2081 sc = list_next(&vmm_list, sc)) {
2082 if (sc->vmm_zone == curzone) {
2083 mutex_exit(&vmm_mtx);
2084 vmm_hma_release();
2085 return (EINVAL);
2086 }
2087 }
2088 }
2089
2090 minor = id_alloc(vmm_minors);
2091 if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2092 goto fail;
2093 } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2094 ddi_soft_state_free(vmm_statep, minor);
2095 goto fail;
2096 } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2097 DDI_PSEUDO, 0) != DDI_SUCCESS) {
2098 goto fail;
2099 }
2100
2101 if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2102 goto fail;
2103 }
2104
2105 error = vm_create(req->flags, &sc->vmm_vm);
2106 if (error == 0) {
2107 /* Complete VM initialization and report success. */
2108 (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2109 sc->vmm_minor = minor;
2110 list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2111 offsetof(vmm_devmem_entry_t, vde_node));
2112
2113 list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2114 offsetof(vmm_hold_t, vmh_node));
2115 cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2116
2117 mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2118 list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2119 offsetof(vmm_lease_t, vml_node));
2120 cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2121 rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2122
2123 sc->vmm_zone = crgetzone(cr);
2124 zone_hold(sc->vmm_zone);
2125 vmm_zsd_add_vm(sc);
2126 vmm_kstat_init(sc);
2127
2128 list_insert_tail(&vmm_list, sc);
2129 mutex_exit(&vmm_mtx);
2130 return (0);
2131 }
2132
2133 vmm_kstat_fini(sc);
2134 ddi_remove_minor_node(vmmdev_dip, name);
2135 fail:
2136 id_free(vmm_minors, minor);
2137 if (sc != NULL) {
2138 ddi_soft_state_free(vmm_statep, minor);
2139 }
2140 mutex_exit(&vmm_mtx);
2141 vmm_hma_release();
2142
2143 return (error);
2144 }
2145
2146 /*
2147 * Bhyve 'Driver' Interface
2148 *
2149 * While many devices are emulated in the bhyve userspace process, there are
2150 * others with performance constraints which require that they run mostly or
2151 * entirely in-kernel. For those not integrated directly into bhyve, an API is
2152 * needed so they can query/manipulate the portions of VM state needed to
2153 * fulfill their purpose.
2154 *
2155 * This includes:
2156 * - Translating guest-physical addresses to host-virtual pointers
2157 * - Injecting MSIs
2158 * - Hooking IO port addresses
2159 *
2160 * The vmm_drv interface exists to provide that functionality to its consumers.
2161 * (At this time, 'viona' is the only user)
2162 */
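
/*
 * Illustrative sketch (not compiled): how an external consumer such as viona
 * might acquire and release a hold through the functions defined below.  The
 * name example_consumer_attach and its error handling are hypothetical; only
 * the vmm_drv_hold(), vmm_drv_release_reqd(), and vmm_drv_rele() calls
 * reflect the interface itself.
 */
#if 0
static int
example_consumer_attach(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
{
	int err;

	/* Pin the VM instance backing the opened vmm device node. */
	if ((err = vmm_drv_hold(fp, cr, holdp)) != 0)
		return (err);

	/*
	 * ... set up consumer state referencing the held instance ...
	 * The hold must be dropped (after all leases and ioport hooks are
	 * gone) once the consumer is finished, or if the instance has asked
	 * to be released.
	 */
	if (vmm_drv_release_reqd(*holdp)) {
		vmm_drv_rele(*holdp);
		*holdp = NULL;
		return (EBUSY);
	}
	return (0);
}
#endif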
2163 int
2164 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2165 {
2166 vnode_t *vp = fp->f_vnode;
2167 const dev_t dev = vp->v_rdev;
2168 vmm_softc_t *sc;
2169 vmm_hold_t *hold;
2170 int err = 0;
2171
2172 if (vp->v_type != VCHR) {
2173 return (ENXIO);
2174 }
2175 const major_t major = getmajor(dev);
2176 const minor_t minor = getminor(dev);
2177
2178 mutex_enter(&vmmdev_mtx);
2179 if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2180 mutex_exit(&vmmdev_mtx);
2181 return (ENOENT);
2182 }
2183 mutex_enter(&vmm_mtx);
2184 mutex_exit(&vmmdev_mtx);
2185
2186 if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2187 err = ENOENT;
2188 goto out;
2189 }
2190 /* XXXJOY: check cred permissions against instance */
2191
2192 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2193 err = EBUSY;
2194 goto out;
2195 }
2196
2197 hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2198 hold->vmh_sc = sc;
2199 hold->vmh_release_req = B_FALSE;
2200
2201 list_insert_tail(&sc->vmm_holds, hold);
2202 sc->vmm_flags |= VMM_HELD;
2203 *holdp = hold;
2204
2205 out:
2206 mutex_exit(&vmm_mtx);
2207 return (err);
2208 }
2209
2210 void
2211 vmm_drv_rele(vmm_hold_t *hold)
2212 {
2213 vmm_softc_t *sc;
2214 bool hma_release = false;
2215
2216 ASSERT(hold != NULL);
2217 ASSERT(hold->vmh_sc != NULL);
2218 VERIFY(hold->vmh_ioport_hook_cnt == 0);
2219
2220 mutex_enter(&vmm_mtx);
2221 sc = hold->vmh_sc;
2222 list_remove(&sc->vmm_holds, hold);
2223 kmem_free(hold, sizeof (*hold));
2224
2225 if (list_is_empty(&sc->vmm_holds)) {
2226 sc->vmm_flags &= ~VMM_HELD;
2227
2228 /*
2229 * Since outstanding holds would prevent instance destruction
2230 * from completing, attempt to finish it now if it was already
2231 * set in motion.
2232 */
2233 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2234 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2235 &hma_release));
2236 }
2237 }
2238 mutex_exit(&vmm_mtx);
2239
2240 if (hma_release) {
2241 vmm_hma_release();
2242 }
2243 }
2244
2245 boolean_t
2246 vmm_drv_release_reqd(vmm_hold_t *hold)
2247 {
2248 ASSERT(hold != NULL);
2249
2250 return (hold->vmh_release_req);
2251 }
2252
2253 vmm_lease_t *
2254 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2255 {
2256 vmm_softc_t *sc = hold->vmh_sc;
2257 vmm_lease_t *lease;
2258
2259 ASSERT3P(expiref, !=, NULL);
2260
2261 if (hold->vmh_release_req) {
2262 return (NULL);
2263 }
2264
2265 lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2266 list_link_init(&lease->vml_node);
2267 lease->vml_expire_func = expiref;
2268 lease->vml_expire_arg = arg;
2269 lease->vml_expired = B_FALSE;
2270 lease->vml_break_deferred = B_FALSE;
2271 lease->vml_hold = hold;
2272 /* cache the VM pointer for one less pointer chase */
2273 lease->vml_vm = sc->vmm_vm;
2274 lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2275
2276 mutex_enter(&sc->vmm_lease_lock);
2277 while (sc->vmm_lease_blocker != 0) {
2278 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2279 }
2280 list_insert_tail(&sc->vmm_lease_list, lease);
2281 vmm_read_lock(sc);
2282 mutex_exit(&sc->vmm_lease_lock);
2283
2284 return (lease);
2285 }
2286
2287 static void
2288 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2289 {
2290 ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2291
2292 list_remove(&sc->vmm_lease_list, lease);
2293 vmm_read_unlock(sc);
2294 vmc_destroy(lease->vml_vmclient);
2295 kmem_free(lease, sizeof (*lease));
2296 }
2297
2298 static void
2299 vmm_lease_block(vmm_softc_t *sc)
2300 {
2301 mutex_enter(&sc->vmm_lease_lock);
2302 VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2303 sc->vmm_lease_blocker++;
2304 if (sc->vmm_lease_blocker == 1) {
2305 list_t *list = &sc->vmm_lease_list;
2306 vmm_lease_t *lease = list_head(list);
2307
2308 while (lease != NULL) {
2309 void *arg = lease->vml_expire_arg;
2310 boolean_t (*expiref)(void *) = lease->vml_expire_func;
2311 boolean_t sync_break = B_FALSE;
2312
2313 /*
2314 * Since the lease expiration notification may
2315 * need to take locks which would deadlock with
2316 * vmm_lease_lock, drop it across the call.
2317 *
2318 * We are the only one allowed to manipulate
2319 * vmm_lease_list right now, so it is safe to
2320 * continue iterating through it after
2321 * reacquiring the lock.
2322 */
2323 lease->vml_expired = B_TRUE;
2324 mutex_exit(&sc->vmm_lease_lock);
2325 sync_break = expiref(arg);
2326 mutex_enter(&sc->vmm_lease_lock);
2327
2328 if (sync_break) {
2329 vmm_lease_t *next;
2330
2331 /*
2332 * These leases which are synchronously broken
2333 * result in vmm_read_unlock() calls from a
2334 * different thread than the corresponding
2335 * vmm_read_lock(). This is acceptable, given
2336 * that the rwlock underpinning the whole
2337 * mechanism tolerates the behavior. This
2338 * flexibility is _only_ afforded to VM read
2339 * lock (RW_READER) holders.
2340 */
2341 next = list_next(list, lease);
2342 vmm_lease_break_locked(sc, lease);
2343 lease = next;
2344 } else {
2345 lease = list_next(list, lease);
2346 }
2347 }
2348
2349 /* Process leases which were not broken synchronously. */
2350 while (!list_is_empty(list)) {
2351 /*
2352 * Although the nested loops are quadratic, the number
2353 * of leases is small.
2354 */
2355 lease = list_head(list);
2356 while (lease != NULL) {
2357 vmm_lease_t *next = list_next(list, lease);
2358 if (lease->vml_break_deferred) {
2359 vmm_lease_break_locked(sc, lease);
2360 }
2361 lease = next;
2362 }
2363 if (list_is_empty(list)) {
2364 break;
2365 }
2366 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2367 }
2368 /* Wake anyone else waiting for the lease list to be empty */
2369 cv_broadcast(&sc->vmm_lease_cv);
2370 } else {
2371 list_t *list = &sc->vmm_lease_list;
2372
2373 /*
2374 * Some other thread beat us to the duty of lease cleanup.
2375 * Wait until that is complete.
2376 */
2377 while (!list_is_empty(list)) {
2378 cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2379 }
2380 }
2381 mutex_exit(&sc->vmm_lease_lock);
2382 }
2383
2384 static void
2385 vmm_lease_unblock(vmm_softc_t *sc)
2386 {
2387 mutex_enter(&sc->vmm_lease_lock);
2388 VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2389 sc->vmm_lease_blocker--;
2390 if (sc->vmm_lease_blocker == 0) {
2391 cv_broadcast(&sc->vmm_lease_cv);
2392 }
2393 mutex_exit(&sc->vmm_lease_lock);
2394 }
2395
2396 void
2397 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2398 {
2399 vmm_softc_t *sc = hold->vmh_sc;
2400
2401 VERIFY3P(hold, ==, lease->vml_hold);
2402 VERIFY(!lease->vml_break_deferred);
2403
2404 mutex_enter(&sc->vmm_lease_lock);
2405 if (sc->vmm_lease_blocker == 0) {
2406 vmm_lease_break_locked(sc, lease);
2407 } else {
2408 /*
2409 * Defer the lease-breaking to whichever thread is currently
2410 * cleaning up all leases as part of a vmm_lease_block() call.
2411 */
2412 lease->vml_break_deferred = B_TRUE;
2413 cv_broadcast(&sc->vmm_lease_cv);
2414 }
2415 mutex_exit(&sc->vmm_lease_lock);
2416 }
2417
2418 boolean_t
2419 vmm_drv_lease_expired(vmm_lease_t *lease)
2420 {
2421 return (lease->vml_expired);
2422 }
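
/*
 * Illustrative sketch (not compiled): the lease lifecycle as seen by a
 * vmm_drv consumer.  The names example_state_t, example_lease_expire_cb, and
 * example_lease_acquire are hypothetical.  Returning B_TRUE from the
 * expiration callback tells vmm_lease_block() that the lease may be reaped
 * synchronously; returning B_FALSE means the consumer will call
 * vmm_drv_lease_break() itself later.
 */
#if 0
typedef struct example_state {
	vmm_hold_t	*es_hold;
	vmm_lease_t	*es_lease;
} example_state_t;

static boolean_t
example_lease_expire_cb(void *arg)
{
	example_state_t *es = arg;

	/*
	 * Stop touching guest memory through this lease; it has expired.
	 * Letting the blocker reap it synchronously means we must not call
	 * vmm_drv_lease_break() on it afterwards.
	 */
	es->es_lease = NULL;
	return (B_TRUE);
}

static int
example_lease_acquire(example_state_t *es)
{
	es->es_lease = vmm_drv_lease_sign(es->es_hold,
	    example_lease_expire_cb, es);
	return (es->es_lease == NULL ? ENXIO : 0);
}
#endif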
2423
2424 vmm_page_t *
2425 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2426 {
2427 ASSERT(lease != NULL);
2428 ASSERT0(gpa & PAGEOFFSET);
2429
2430 return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2431 }
2432
2433
2434 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2435 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2436
2437 vmm_page_t *
2438 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2439 {
2440 ASSERT(lease != NULL);
2441 ASSERT0(gpa & PAGEOFFSET);
2442
2443 vmm_page_t *page =
2444 (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2445 return (page);
2446 }
2447
2448 void
2449 vmm_drv_page_release(vmm_page_t *vmmp)
2450 {
2451 (void) vmp_release((vm_page_t *)vmmp);
2452 }
2453
2454 void
2455 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2456 {
2457 (void) vmp_release_chain((vm_page_t *)vmmp);
2458 }
2459
2460 const void *
2461 vmm_drv_page_readable(const vmm_page_t *vmmp)
2462 {
2463 return (vmp_get_readable((const vm_page_t *)vmmp));
2464 }
2465
2466 void *
2467 vmm_drv_page_writable(const vmm_page_t *vmmp)
2468 {
2469 return (vmp_get_writable((const vm_page_t *)vmmp));
2470 }
2471
2472 void
2473 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2474 {
2475 return (vmp_mark_dirty((vm_page_t *)vmmp));
2476 }
2477
2478 void
2479 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2480 {
2481 vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2482 }
2483
2484 vmm_page_t *
2485 vmm_drv_page_next(const vmm_page_t *vmmp)
2486 {
2487 return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2488 }
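
/*
 * Illustrative sketch (not compiled): copying a small buffer into guest
 * memory through a lease, using the page-hold functions above.
 * example_write_guest is hypothetical; it assumes the target range does not
 * cross a page boundary and that PROT_WRITE access is permitted for the GPA
 * in question.
 */
#if 0
static int
example_write_guest(vmm_lease_t *lease, uintptr_t gpa, const void *src,
    size_t len)
{
	vmm_page_t *vmp;
	const uintptr_t off = gpa & PAGEOFFSET;

	ASSERT3U(off + len, <=, PAGESIZE);

	/* Hold the backing page; the GPA passed in must be page-aligned. */
	vmp = vmm_drv_page_hold(lease, gpa & PAGEMASK, PROT_WRITE);
	if (vmp == NULL)
		return (EFAULT);

	bcopy(src, (caddr_t)vmm_drv_page_writable(vmp) + off, len);
	vmm_drv_page_release(vmp);
	return (0);
}
#endif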
2489
2490 int
2491 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2492 {
2493 ASSERT(lease != NULL);
2494
2495 return (lapic_intr_msi(lease->vml_vm, addr, msg));
2496 }
2497
2498 int
2499 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2500 void *arg, void **cookie)
2501 {
2502 vmm_softc_t *sc;
2503 int err;
2504
2505 ASSERT(hold != NULL);
2506 ASSERT(cookie != NULL);
2507
2508 sc = hold->vmh_sc;
2509 mutex_enter(&vmm_mtx);
2510 /* Confirm that hook installation is not blocked */
2511 if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2512 mutex_exit(&vmm_mtx);
2513 return (EBUSY);
2514 }
2515 /*
2516 * Optimistically record an installed hook which will prevent a block
2517 * from being asserted while the mutex is dropped.
2518 */
2519 hold->vmh_ioport_hook_cnt++;
2520 mutex_exit(&vmm_mtx);
2521
2522 vmm_write_lock(sc);
2523 err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2524 arg, cookie);
2525 vmm_write_unlock(sc);
2526
2527 if (err != 0) {
2528 mutex_enter(&vmm_mtx);
2529 /* Walk back optimism about the hook installation */
2530 hold->vmh_ioport_hook_cnt--;
2531 mutex_exit(&vmm_mtx);
2532 }
2533 return (err);
2534 }
2535
2536 void
2537 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2538 {
2539 vmm_softc_t *sc;
2540
2541 ASSERT(hold != NULL);
2542 ASSERT(cookie != NULL);
2543 ASSERT(hold->vmh_ioport_hook_cnt != 0);
2544
2545 sc = hold->vmh_sc;
2546 vmm_write_lock(sc);
2547 vm_ioport_unhook(sc->vmm_vm, cookie);
2548 vmm_write_unlock(sc);
2549
2550 mutex_enter(&vmm_mtx);
2551 hold->vmh_ioport_hook_cnt--;
2552 mutex_exit(&vmm_mtx);
2553 }
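
/*
 * Illustrative sketch (not compiled): installing and removing an ioport hook.
 * example_hook_state_t and the caller-provided handler are hypothetical; the
 * handler must match the vmm_drv_iop_cb_t signature from vmm_drv.h.
 */
#if 0
typedef struct example_hook_state {
	vmm_hold_t	*ehs_hold;
	void		*ehs_cookie;
} example_hook_state_t;

static int
example_hook_install(example_hook_state_t *ehs, uint16_t port,
    vmm_drv_iop_cb_t handler, void *arg)
{
	/* Fails with EBUSY if hook installation is currently blocked. */
	return (vmm_drv_ioport_hook(ehs->ehs_hold, port, handler, arg,
	    &ehs->ehs_cookie));
}

static void
example_hook_remove(example_hook_state_t *ehs)
{
	vmm_drv_ioport_unhook(ehs->ehs_hold, &ehs->ehs_cookie);
}
#endif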
2554
2555 static void
2556 vmm_drv_purge(vmm_softc_t *sc)
2557 {
2558 ASSERT(MUTEX_HELD(&vmm_mtx));
2559
2560 if ((sc->vmm_flags & VMM_HELD) != 0) {
2561 vmm_hold_t *hold;
2562
2563 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2564 hold = list_next(&sc->vmm_holds, hold)) {
2565 hold->vmh_release_req = B_TRUE;
2566 }
2567
2568 /*
2569 * Require that all leases on the instance be broken, now that
2570 * all associated holds have been marked as needing release.
2571 *
2572 * Dropping vmm_mtx is not strictly necessary, but if any of the
2573 * lessees are slow to respond, it would be nice to leave it
2574 * available for other parties.
2575 */
2576 mutex_exit(&vmm_mtx);
2577 vmm_lease_block(sc);
2578 vmm_lease_unblock(sc);
2579 mutex_enter(&vmm_mtx);
2580 }
2581 }
2582
2583 static int
2584 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2585 {
2586 int err = 0;
2587
2588 mutex_enter(&vmm_mtx);
2589 if (!enable_block) {
2590 VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2591
2592 sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2593 goto done;
2594 }
2595
2596 /* If any holds have hooks installed, the block is a failure */
2597 if (!list_is_empty(&sc->vmm_holds)) {
2598 vmm_hold_t *hold;
2599
2600 for (hold = list_head(&sc->vmm_holds); hold != NULL;
2601 hold = list_next(&sc->vmm_holds, hold)) {
2602 if (hold->vmh_ioport_hook_cnt != 0) {
2603 err = EBUSY;
2604 goto done;
2605 }
2606 }
2607 }
2608 sc->vmm_flags |= VMM_BLOCK_HOOK;
2609
2610 done:
2611 mutex_exit(&vmm_mtx);
2612 return (err);
2613 }
2614
2615
2616 static void
2617 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2618 {
2619 ASSERT(MUTEX_HELD(&vmm_mtx));
2620 ASSERT0(sc->vmm_flags & VMM_DESTROY);
2621
2622 sc->vmm_flags |= VMM_DESTROY;
2623
2624 /*
2625 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2626 * of guest context, being unable to return now that the instance is
2627 * marked for destruction.
2628 */
2629 const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2630 for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2631 vcpu_lock_one(sc, vcpu);
2632 vcpu_unlock_one(sc, vcpu);
2633 }
2634
2635 vmmdev_devmem_purge(sc);
2636 if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2637 /*
2638 * The ZSD should be cleaned up now, unless destruction of the
2639 * instance was initiated by destruction of the containing zone,
2640 * in which case the ZSD has already been removed.
2641 */
2642 vmm_zsd_rem_vm(sc);
2643 }
2644 zone_rele(sc->vmm_zone);
2645
2646 vmm_drv_purge(sc);
2647 }
2648
2649 static bool
2650 vmm_destroy_ready(vmm_softc_t *sc)
2651 {
2652 ASSERT(MUTEX_HELD(&vmm_mtx));
2653
2654 if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2655 VERIFY(list_is_empty(&sc->vmm_holds));
2656 return (true);
2657 }
2658
2659 return (false);
2660 }
2661
2662 static void
2663 vmm_destroy_finish(vmm_softc_t *sc)
2664 {
2665 ASSERT(MUTEX_HELD(&vmm_mtx));
2666 ASSERT(vmm_destroy_ready(sc));
2667
2668 list_remove(&vmm_list, sc);
2669 vmm_kstat_fini(sc);
2670 vm_destroy(sc->vmm_vm);
2671 ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2672 (void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2673
2674 const minor_t minor = sc->vmm_minor;
2675 ddi_soft_state_free(vmm_statep, minor);
2676 id_free(vmm_minors, minor);
2677 }
2678
2679 /*
2680 * Initiate or attempt to finish destruction of a VMM instance.
2681 *
2682 * This is called from several contexts:
2683 * - An explicit destroy ioctl is made
2684 * - A vmm_drv consumer releases its hold (being the last on the instance)
2685 * - The vmm device is closed, and auto-destruct is enabled
2686 */
2687 static int
2688 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2689 bool *hma_release)
2690 {
2691 ASSERT(MUTEX_HELD(&vmm_mtx));
2692
2693 *hma_release = false;
2694
2695 /*
2696 * When instance destruction begins, the instance is marked so that any
2697 * further requests to operate on it will fail.
2698 */
2699 if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2700 vmm_destroy_begin(sc, opts);
2701 }
2702
2703 if (vmm_destroy_ready(sc)) {
2704
2705 /*
2706 * Notify anyone waiting for the destruction to finish. They
2707 * must be clear before we can safely tear down the softc.
2708 */
2709 if (sc->vmm_destroy_waiters != 0) {
2710 cv_broadcast(&sc->vmm_cv);
2711 while (sc->vmm_destroy_waiters != 0) {
2712 cv_wait(&sc->vmm_cv, &vmm_mtx);
2713 }
2714 }
2715
2716 /*
2717 * Finish destruction of the instance. After this point, the softc
2718 * is freed and cannot be accessed again.
2719 *
2720 * With destruction complete, the HMA hold can be released.
2721 */
2722 vmm_destroy_finish(sc);
2723 *hma_release = true;
2724 return (0);
2725 } else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2726 int err = 0;
2727
2728 sc->vmm_destroy_waiters++;
2729 while (!vmm_destroy_ready(sc) && err == 0) {
2730 if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2731 err = EINTR;
2732 }
2733 }
2734 sc->vmm_destroy_waiters--;
2735
2736 if (sc->vmm_destroy_waiters == 0) {
2737 /*
2738 * If we were the last waiter, it could be that VM
2739 * destruction is waiting on _us_ to proceed with the
2740 * final clean-up.
2741 */
2742 cv_signal(&sc->vmm_cv);
2743 }
2744 return (err);
2745 } else {
2746 /*
2747 * Since the instance is not ready for destruction, and the
2748 * caller did not ask to wait, consider it a success for now.
2749 */
2750 return (0);
2751 }
2752 }
2753
2754 void
2755 vmm_zone_vm_destroy(vmm_softc_t *sc)
2756 {
2757 bool hma_release = false;
2758 int err;
2759
2760 mutex_enter(&vmm_mtx);
2761 err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2762 mutex_exit(&vmm_mtx);
2763
2764 VERIFY0(err);
2765
2766 if (hma_release) {
2767 vmm_hma_release();
2768 }
2769 }
2770
2771 static int
2772 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2773 {
2774 vmm_softc_t *sc;
2775 bool hma_release = false;
2776 int err;
2777
2778 if (crgetuid(cr) != 0) {
2779 return (EPERM);
2780 }
2781
2782 mutex_enter(&vmm_mtx);
2783 sc = vmm_lookup(req->name);
2784 if (sc == NULL) {
2785 mutex_exit(&vmm_mtx);
2786 return (ENOENT);
2787 }
2788 /*
2789 * We don't check this in vmm_lookup() since that function is also used
2790 * for validation during create, and currently vmm names must be unique.
2791 */
2792 if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2793 mutex_exit(&vmm_mtx);
2794 return (EPERM);
2795 }
2796
2797 err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2798 mutex_exit(&vmm_mtx);
2799
2800 if (hma_release) {
2801 vmm_hma_release();
2802 }
2803
2804 return (err);
2805 }
2806
2807 #define VCPU_NAME_BUFLEN 32
2808
2809 static int
2810 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2811 {
2812 zoneid_t zid = crgetzoneid(cr);
2813 int instance = minor;
2814 kstat_t *ksp;
2815
2816 ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2817
2818 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2819 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2820 sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2821
2822 if (ksp == NULL) {
2823 return (-1);
2824 }
2825 sc->vmm_kstat_vm = ksp;
2826
2827 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2828 char namebuf[VCPU_NAME_BUFLEN];
2829
2830 ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2831
2832 (void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2833 ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2834 VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2835 sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2836 0, zid);
2837 if (ksp == NULL) {
2838 goto fail;
2839 }
2840
2841 sc->vmm_kstat_vcpu[i] = ksp;
2842 }
2843
2844 /*
2845 * If this instance is associated with a non-global zone, make its
2846 * kstats visible from the GZ.
2847 */
2848 if (zid != GLOBAL_ZONEID) {
2849 kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2850 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2851 kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2852 }
2853 }
2854
2855 return (0);
2856
2857 fail:
2858 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2859 if (sc->vmm_kstat_vcpu[i] != NULL) {
2860 kstat_delete(sc->vmm_kstat_vcpu[i]);
2861 sc->vmm_kstat_vcpu[i] = NULL;
2862 } else {
2863 break;
2864 }
2865 }
2866 kstat_delete(sc->vmm_kstat_vm);
2867 sc->vmm_kstat_vm = NULL;
2868 return (-1);
2869 }
2870
2871 static void
2872 vmm_kstat_init(vmm_softc_t *sc)
2873 {
2874 kstat_t *ksp;
2875
2876 ASSERT3P(sc->vmm_vm, !=, NULL);
2877 ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2878
2879 ksp = sc->vmm_kstat_vm;
2880 vmm_kstats_t *vk = ksp->ks_data;
2881 ksp->ks_private = sc->vmm_vm;
2882 kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2883 kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2884
2885 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2886 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2887
2888 ksp = sc->vmm_kstat_vcpu[i];
2889 vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2890
2891 kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2892 vvk->vvk_vcpu.value.ui32 = i;
2893 kstat_named_init(&vvk->vvk_time_init, "time_init",
2894 KSTAT_DATA_UINT64);
2895 kstat_named_init(&vvk->vvk_time_run, "time_run",
2896 KSTAT_DATA_UINT64);
2897 kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2898 KSTAT_DATA_UINT64);
2899 kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2900 KSTAT_DATA_UINT64);
2901 kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2902 KSTAT_DATA_UINT64);
2903 kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2904 KSTAT_DATA_UINT64);
2905 ksp->ks_private = sc->vmm_vm;
2906 ksp->ks_update = vmm_kstat_update_vcpu;
2907 }
2908
2909 kstat_install(sc->vmm_kstat_vm);
2910 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2911 kstat_install(sc->vmm_kstat_vcpu[i]);
2912 }
2913 }
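
/*
 * Illustrative sketch (not compiled, userland): reading one of the per-vCPU
 * kstats published above with libkstat.  It assumes VMM_MODULE_NAME expands
 * to "vmm" and that the caller does not know the instance number (-1 matches
 * any instance).
 *
 *	#include <kstat.h>
 *	#include <stdio.h>
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "vmm", -1, "vcpu0");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "time_run");
 *		if (kn != NULL)
 *			(void) printf("time_run: %llu\n",
 *			    (u_longlong_t)kn->value.ui64);
 *	}
 *	(void) kstat_close(kc);
 */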
2914
2915 static void
2916 vmm_kstat_fini(vmm_softc_t *sc)
2917 {
2918 ASSERT(sc->vmm_kstat_vm != NULL);
2919
2920 kstat_delete(sc->vmm_kstat_vm);
2921 sc->vmm_kstat_vm = NULL;
2922
2923 for (uint_t i = 0; i < VM_MAXCPU; i++) {
2924 ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2925
2926 kstat_delete(sc->vmm_kstat_vcpu[i]);
2927 sc->vmm_kstat_vcpu[i] = NULL;
2928 }
2929 }
2930
2931 static int
2932 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2933 {
2934 minor_t minor;
2935 vmm_softc_t *sc;
2936
2937 /*
2938 * Forbid running bhyve in a 32-bit process until it has been tested and
2939 * verified to be safe.
2940 */
2941 if (curproc->p_model != DATAMODEL_LP64) {
2942 return (EFBIG);
2943 }
2944
2945 minor = getminor(*devp);
2946 if (minor == VMM_CTL_MINOR) {
2947 /*
2948 * Master control device must be opened exclusively.
2949 */
2950 if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2951 return (EINVAL);
2952 }
2953
2954 return (0);
2955 }
2956
2957 mutex_enter(&vmm_mtx);
2958 sc = ddi_get_soft_state(vmm_statep, minor);
2959 if (sc == NULL) {
2960 mutex_exit(&vmm_mtx);
2961 return (ENXIO);
2962 }
2963
2964 sc->vmm_flags |= VMM_IS_OPEN;
2965 mutex_exit(&vmm_mtx);
2966
2967 return (0);
2968 }
2969
2970 static int
2971 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2972 {
2973 const minor_t minor = getminor(dev);
2974 vmm_softc_t *sc;
2975 bool hma_release = false;
2976
2977 if (minor == VMM_CTL_MINOR) {
2978 return (0);
2979 }
2980
2981 mutex_enter(&vmm_mtx);
2982 sc = ddi_get_soft_state(vmm_statep, minor);
2983 if (sc == NULL) {
2984 mutex_exit(&vmm_mtx);
2985 return (ENXIO);
2986 }
2987
2988 VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2989 sc->vmm_flags &= ~VMM_IS_OPEN;
2990
2991 /*
2992 * If the instance was marked for auto-destruction, begin that now.
2993 * Instance destruction may already have been initiated; try to make
2994 * progress in that case, since closing the device is one of its requirements.
2995 */
2996 if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2997 (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2998 VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2999 }
3000 mutex_exit(&vmm_mtx);
3001
3002 if (hma_release) {
3003 vmm_hma_release();
3004 }
3005
3006 return (0);
3007 }
3008
3009 static int
3010 vmm_is_supported(intptr_t arg)
3011 {
3012 int r;
3013 const char *msg;
3014
3015 if (vmm_is_intel()) {
3016 r = vmx_x86_supported(&msg);
3017 } else if (vmm_is_svm()) {
3018 /*
3019 * HMA already ensured that the features necessary for SVM
3020 * operation were present and online during vmm_attach().
3021 */
3022 r = 0;
3023 } else {
3024 r = ENXIO;
3025 msg = "Unsupported CPU vendor";
3026 }
3027
3028 if (r != 0 && arg != (intptr_t)NULL) {
3029 if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
3030 return (EFAULT);
3031 }
3032 return (r);
3033 }
3034
3035 static int
3036 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
3037 {
3038 void *argp = (void *)arg;
3039
3040 switch (cmd) {
3041 case VMM_CREATE_VM: {
3042 struct vm_create_req req;
3043
3044 if ((md & FWRITE) == 0) {
3045 return (EPERM);
3046 }
3047 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3048 return (EFAULT);
3049 }
3050 return (vmmdev_do_vm_create(&req, cr));
3051 }
3052 case VMM_DESTROY_VM: {
3053 struct vm_destroy_req req;
3054
3055 if ((md & FWRITE) == 0) {
3056 return (EPERM);
3057 }
3058 if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
3059 return (EFAULT);
3060 }
3061 return (vmmdev_do_vm_destroy(&req, cr));
3062 }
3063 case VMM_VM_SUPPORTED:
3064 return (vmm_is_supported(arg));
3065 case VMM_CHECK_IOMMU:
3066 if (!vmm_check_iommu()) {
3067 return (ENXIO);
3068 }
3069 return (0);
3070 case VMM_RESV_QUERY:
3071 case VMM_RESV_SET_TARGET:
3072 return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
3073 default:
3074 break;
3075 }
3076 /* No other actions are legal on ctl device */
3077 return (ENOTTY);
3078 }
3079
3080 static int
3081 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3082 int *rvalp)
3083 {
3084 vmm_softc_t *sc;
3085 minor_t minor;
3086
3087 /*
3088 * Forbid running bhyve in a 32-bit process until it has been tested and
3089 * verified to be safe.
3090 */
3091 if (curproc->p_model != DATAMODEL_LP64) {
3092 return (EFBIG);
3093 }
3094
3095 /* The structs in bhyve ioctls assume a 64-bit datamodel */
3096 if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3097 return (ENOTSUP);
3098 }
3099
3100 /*
3101 * Regardless of minor (vmmctl or instance), we respond to queries of
3102 * the interface version.
3103 */
3104 if (cmd == VMM_INTERFACE_VERSION) {
3105 *rvalp = VMM_CURRENT_INTERFACE_VERSION;
3106 return (0);
3107 }
3108
3109 minor = getminor(dev);
3110
3111 if (minor == VMM_CTL_MINOR) {
3112 return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3113 }
3114
3115 sc = ddi_get_soft_state(vmm_statep, minor);
3116 ASSERT(sc != NULL);
3117
3118 /*
3119 * Turn away any ioctls against an instance when it is being destroyed.
3120 * (Except for the ioctl inquiring about that destroy-in-progress.)
3121 */
3122 if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3123 if (cmd == VM_DESTROY_PENDING) {
3124 *rvalp = 1;
3125 return (0);
3126 }
3127 return (ENXIO);
3128 }
3129
3130 return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3131 }
3132
3133 static int
3134 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3135 unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3136 {
3137 vmm_softc_t *sc;
3138 const minor_t minor = getminor(dev);
3139 int err;
3140
3141 if (minor == VMM_CTL_MINOR) {
3142 return (ENODEV);
3143 }
3144 if (off < 0 || (off + len) <= 0) {
3145 return (EINVAL);
3146 }
3147 if ((prot & PROT_USER) == 0) {
3148 return (EACCES);
3149 }
3150
3151 sc = ddi_get_soft_state(vmm_statep, minor);
3152 ASSERT(sc);
3153
3154 if (sc->vmm_flags & VMM_DESTROY)
3155 return (ENXIO);
3156
3157 /* Grab read lock on the VM to prevent any changes to the memory map */
3158 vmm_read_lock(sc);
3159
3160 if (off >= VM_DEVMEM_START) {
3161 int segid;
3162 off_t segoff;
3163
3164 /* Mapping a devmem "device" */
3165 if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3166 err = ENODEV;
3167 } else {
3168 err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3169 addrp, prot, maxprot, flags);
3170 }
3171 } else {
3172 /* Mapping a part of the guest physical space */
3173 err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3174 maxprot, flags);
3175 }
3176
3177 vmm_read_unlock(sc);
3178 return (err);
3179 }
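
/*
 * Illustrative sketch (not compiled, userland): mapping a page-aligned chunk
 * of guest-physical address space by mmap()ing the VM's device node, which is
 * serviced by the vm_segmap_space() path above.  The vmfd, gpa, and len
 * variables are assumed; the descriptor would come from opening
 * /dev/vmm/<name>.  Offsets at or above VM_DEVMEM_START instead name a
 * devmem segment.
 *
 *	#include <sys/mman.h>
 *
 *	void *gpa_window = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, vmfd, (off_t)gpa);
 *	if (gpa_window == MAP_FAILED) {
 *		// mapping failed; the GPA range may not be backed by memory
 *	}
 */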
3180
3181 static sdev_plugin_validate_t
3182 vmm_sdev_validate(sdev_ctx_t ctx)
3183 {
3184 const char *name = sdev_ctx_name(ctx);
3185 vmm_softc_t *sc;
3186 sdev_plugin_validate_t ret;
3187 minor_t minor;
3188
3189 if (sdev_ctx_vtype(ctx) != VCHR)
3190 return (SDEV_VTOR_INVALID);
3191
3192 VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3193
3194 mutex_enter(&vmm_mtx);
3195 if ((sc = vmm_lookup(name)) == NULL)
3196 ret = SDEV_VTOR_INVALID;
3197 else if (sc->vmm_minor != minor)
3198 ret = SDEV_VTOR_STALE;
3199 else
3200 ret = SDEV_VTOR_VALID;
3201 mutex_exit(&vmm_mtx);
3202
3203 return (ret);
3204 }
3205
3206 static int
3207 vmm_sdev_filldir(sdev_ctx_t ctx)
3208 {
3209 vmm_softc_t *sc;
3210 int ret;
3211
3212 if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3213 cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3214 sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3215 return (EINVAL);
3216 }
3217
3218 mutex_enter(&vmm_mtx);
3219 ASSERT(vmmdev_dip != NULL);
3220 for (sc = list_head(&vmm_list); sc != NULL;
3221 sc = list_next(&vmm_list, sc)) {
3222 if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3223 ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3224 S_IFCHR | 0600,
3225 makedevice(ddi_driver_major(vmmdev_dip),
3226 sc->vmm_minor));
3227 } else {
3228 continue;
3229 }
3230 if (ret != 0 && ret != EEXIST)
3231 goto out;
3232 }
3233
3234 ret = 0;
3235
3236 out:
3237 mutex_exit(&vmm_mtx);
3238 return (ret);
3239 }
3240
3241 /* ARGSUSED */
3242 static void
3243 vmm_sdev_inactive(sdev_ctx_t ctx)
3244 {
3245 }
3246
3247 static sdev_plugin_ops_t vmm_sdev_ops = {
3248 .spo_version = SDEV_PLUGIN_VERSION,
3249 .spo_flags = SDEV_PLUGIN_SUBDIR,
3250 .spo_validate = vmm_sdev_validate,
3251 .spo_filldir = vmm_sdev_filldir,
3252 .spo_inactive = vmm_sdev_inactive
3253 };
3254
3255 /* ARGSUSED */
3256 static int
3257 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3258 {
3259 int error;
3260
3261 switch (cmd) {
3262 case DDI_INFO_DEVT2DEVINFO:
3263 *result = (void *)vmmdev_dip;
3264 error = DDI_SUCCESS;
3265 break;
3266 case DDI_INFO_DEVT2INSTANCE:
3267 *result = (void *)0;
3268 error = DDI_SUCCESS;
3269 break;
3270 default:
3271 error = DDI_FAILURE;
3272 break;
3273 }
3274 return (error);
3275 }
3276
3277 static int
3278 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3279 {
3280 sdev_plugin_hdl_t sph;
3281 hma_reg_t *reg = NULL;
3282 boolean_t vmm_loaded = B_FALSE;
3283
3284 if (cmd != DDI_ATTACH) {
3285 return (DDI_FAILURE);
3286 }
3287
3288 mutex_enter(&vmmdev_mtx);
3289 /* Ensure we are not already attached. */
3290 if (vmmdev_dip != NULL) {
3291 mutex_exit(&vmmdev_mtx);
3292 return (DDI_FAILURE);
3293 }
3294
3295 vmm_sol_glue_init();
3296
3297 /*
3298 * Perform temporary HMA registration to determine if the system
3299 * is capable.
3300 */
3301 if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3302 goto fail;
3303 } else if (vmm_mod_load() != 0) {
3304 goto fail;
3305 }
3306 vmm_loaded = B_TRUE;
3307 hma_unregister(reg);
3308 reg = NULL;
3309
3310 /* Create control node. Other nodes will be created on demand. */
3311 if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3312 VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3313 goto fail;
3314 }
3315
3316 sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3317 if (sph == (sdev_plugin_hdl_t)NULL) {
3318 ddi_remove_minor_node(dip, NULL);
3319 goto fail;
3320 }
3321
3322 ddi_report_dev(dip);
3323 vmmdev_sdev_hdl = sph;
3324 vmmdev_dip = dip;
3325 mutex_exit(&vmmdev_mtx);
3326 return (DDI_SUCCESS);
3327
3328 fail:
3329 if (vmm_loaded) {
3330 VERIFY0(vmm_mod_unload());
3331 }
3332 if (reg != NULL) {
3333 hma_unregister(reg);
3334 }
3335 vmm_sol_glue_cleanup();
3336 mutex_exit(&vmmdev_mtx);
3337 return (DDI_FAILURE);
3338 }
3339
3340 static int
3341 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3342 {
3343 if (cmd != DDI_DETACH) {
3344 return (DDI_FAILURE);
3345 }
3346
3347 /*
3348 * Ensure that all resources have been cleaned up.
3349 *
3350 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3351 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3352 * devinfo locked as iommu_cleanup() tries to recursively lock each
3353 * devinfo, including our own, while holding vmmdev_mtx.
3354 */
3355 if (mutex_tryenter(&vmmdev_mtx) == 0)
3356 return (DDI_FAILURE);
3357
3358 mutex_enter(&vmm_mtx);
3359 if (!list_is_empty(&vmm_list)) {
3360 mutex_exit(&vmm_mtx);
3361 mutex_exit(&vmmdev_mtx);
3362 return (DDI_FAILURE);
3363 }
3364 mutex_exit(&vmm_mtx);
3365
3366 if (!vmmr_is_empty()) {
3367 mutex_exit(&vmmdev_mtx);
3368 return (DDI_FAILURE);
3369 }
3370
3371 VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3372 if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3373 mutex_exit(&vmmdev_mtx);
3374 return (DDI_FAILURE);
3375 }
3376 vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3377
3378 /* Remove the control node. */
3379 ddi_remove_minor_node(dip, "ctl");
3380 vmmdev_dip = NULL;
3381
3382 VERIFY0(vmm_mod_unload());
3383 VERIFY3U(vmmdev_hma_reg, ==, NULL);
3384 vmm_sol_glue_cleanup();
3385
3386 mutex_exit(&vmmdev_mtx);
3387
3388 return (DDI_SUCCESS);
3389 }
3390
3391 static struct cb_ops vmm_cb_ops = {
3392 vmm_open,
3393 vmm_close,
3394 nodev, /* strategy */
3395 nodev, /* print */
3396 nodev, /* dump */
3397 nodev, /* read */
3398 nodev, /* write */
3399 vmm_ioctl,
3400 nodev, /* devmap */
3401 nodev, /* mmap */
3402 vmm_segmap,
3403 nochpoll, /* poll */
3404 ddi_prop_op,
3405 NULL,
3406 D_NEW | D_MP | D_DEVMAP
3407 };
3408
3409 static struct dev_ops vmm_ops = {
3410 DEVO_REV,
3411 0,
3412 vmm_info,
3413 nulldev, /* identify */
3414 nulldev, /* probe */
3415 vmm_attach,
3416 vmm_detach,
3417 nodev, /* reset */
3418 &vmm_cb_ops,
3419 (struct bus_ops *)NULL
3420 };
3421
3422 static struct modldrv modldrv = {
3423 &mod_driverops,
3424 "bhyve vmm",
3425 &vmm_ops
3426 };
3427
3428 static struct modlinkage modlinkage = {
3429 MODREV_1,
3430 &modldrv,
3431 NULL
3432 };
3433
3434 int
3435 _init(void)
3436 {
3437 int error;
3438
3439 sysinit();
3440
3441 mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3442 mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3443 list_create(&vmm_list, sizeof (vmm_softc_t),
3444 offsetof(vmm_softc_t, vmm_node));
3445 vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3446
3447 error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3448 if (error) {
3449 return (error);
3450 }
3451
3452 error = vmmr_init();
3453 if (error) {
3454 ddi_soft_state_fini(&vmm_statep);
3455 return (error);
3456 }
3457
3458 vmm_zsd_init();
3459
3460 error = mod_install(&modlinkage);
3461 if (error) {
3462 ddi_soft_state_fini(&vmm_statep);
3463 vmm_zsd_fini();
3464 vmmr_fini();
3465 }
3466
3467 return (error);
3468 }
3469
3470 int
3471 _fini(void)
3472 {
3473 int error;
3474
3475 error = mod_remove(&modlinkage);
3476 if (error) {
3477 return (error);
3478 }
3479
3480 vmm_zsd_fini();
3481 vmmr_fini();
3482
3483 ddi_soft_state_fini(&vmm_statep);
3484
3485 return (0);
3486 }
3487
3488 int
3489 _info(struct modinfo *modinfop)
3490 {
3491 return (mod_info(&modlinkage, modinfop));
3492 }
3493