xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision b8052df9f609edb713f6828c9eecc3d7be19dfb3)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2022 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
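/*
 * A minimal sketch of the ordering rule above when both locks are needed
 * (cf. vmm_drv_hold() below, which acquires them in this order):
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	... operate on driver-wide and per-instance state ...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 */
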
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
83 /* temporary safety switch */
84 int		vmm_allow_state_writes;
85 
86 static const char *vmmdev_hvm_name = "bhyve";
87 
88 /* For sdev plugin (/dev) */
89 #define	VMM_SDEV_ROOT "/dev/vmm"
90 
91 /* From uts/intel/io/vmm/intel/vmx.c */
92 extern int vmx_x86_supported(const char **);
93 
94 /* Holds and hooks from drivers external to vmm */
95 struct vmm_hold {
96 	list_node_t	vmh_node;
97 	vmm_softc_t	*vmh_sc;
98 	boolean_t	vmh_release_req;
99 	uint_t		vmh_ioport_hook_cnt;
100 };
101 
102 struct vmm_lease {
103 	list_node_t		vml_node;
104 	struct vm		*vml_vm;
105 	vm_client_t		*vml_vmclient;
106 	boolean_t		vml_expired;
107 	boolean_t		vml_break_deferred;
108 	boolean_t		(*vml_expire_func)(void *);
109 	void			*vml_expire_arg;
110 	struct vmm_hold		*vml_hold;
111 };
112 
113 /* Options for vmm_destroy_locked */
114 typedef enum vmm_destroy_opts {
115 	VDO_DEFAULT		= 0,
116 	/*
117 	 * Indicate that zone-specific data associated with this VM should not
118 	 * be cleaned up as part of the destroy.  Skipping ZSD clean-up is
119 	 * necessary when the VM is being destroyed as part of zone
120 	 * destruction, since said ZSD is already being cleaned up.
121 	 */
122 	VDO_NO_CLEAN_ZSD	= (1 << 0),
123 	/*
124 	 * Attempt to wait for VM destruction to complete.  This is opt-in,
125 	 * since there are many normal conditions which could lead to
126 	 * destruction being stalled pending other clean-up.
127 	 */
128 	VDO_ATTEMPT_WAIT	= (1 << 1),
129 } vmm_destroy_opts_t;
130 
131 static void vmm_hma_release(void);
132 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
133 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
134 static void vmm_lease_block(vmm_softc_t *);
135 static void vmm_lease_unblock(vmm_softc_t *);
136 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
137 static void vmm_kstat_init(vmm_softc_t *);
138 static void vmm_kstat_fini(vmm_softc_t *);
139 
140 /*
141  * The 'devmem' hack:
142  *
143  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
144  * in the vm which appear with their own name related to the vm under /dev.
145  * Since this would be a hassle from an sdev perspective and would require a
146  * new cdev interface (or complicate the existing one), we choose to implement
147  * this in a different manner.  Direct access to the underlying vm memory
148  * segments is exposed by placing them in a range of offsets beyond the normal
149  * guest memory space.  Userspace can query the appropriate offset to mmap()
150  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
151  */
152 
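/*
 * A minimal sketch of the userspace side of this scheme, assuming the
 * caller already has the instance fd ('vmfd'), a devmem segment id
 * ('segid'), and that segment's length ('seg_len'):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *p = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *	}
 */
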
153 static vmm_devmem_entry_t *
154 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
155 {
156 	vmm_devmem_entry_t *ent = NULL;
157 	list_t *dl = &sc->vmm_devmem_list;
158 
159 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
160 		if (ent->vde_segid == segid) {
161 			return (ent);
162 		}
163 	}
164 	return (NULL);
165 }
166 
167 static int
168 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
169 {
170 	int error;
171 	bool sysmem;
172 
173 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
174 	    NULL);
175 	if (error || mseg->len == 0)
176 		return (error);
177 
178 	if (!sysmem) {
179 		vmm_devmem_entry_t *de;
180 
181 		de = vmmdev_devmem_find(sc, mseg->segid);
182 		if (de != NULL) {
183 			(void) strlcpy(mseg->name, de->vde_name,
184 			    sizeof (mseg->name));
185 		}
186 	} else {
187 		bzero(mseg->name, sizeof (mseg->name));
188 	}
189 
190 	return (error);
191 }
192 
193 static int
194 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
195 {
196 	off_t map_offset;
197 	vmm_devmem_entry_t *entry;
198 
199 	if (list_is_empty(&sc->vmm_devmem_list)) {
200 		map_offset = VM_DEVMEM_START;
201 	} else {
202 		entry = list_tail(&sc->vmm_devmem_list);
203 		map_offset = entry->vde_off + entry->vde_len;
204 		if (map_offset < entry->vde_off) {
205 			/* Do not tolerate overflow */
206 			return (ERANGE);
207 		}
208 		/*
209 		 * XXXJOY: We could choose to search the list for duplicate
210 		 * names and toss an error.  Since we're using the offset
211 		 * method for now, it does not make much of a difference.
212 		 */
213 	}
214 
215 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
216 	entry->vde_segid = mseg->segid;
217 	entry->vde_len = mseg->len;
218 	entry->vde_off = map_offset;
219 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
220 	list_insert_tail(&sc->vmm_devmem_list, entry);
221 
222 	return (0);
223 }
224 
225 static boolean_t
226 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
227     off_t *map_offp)
228 {
229 	list_t *dl = &sc->vmm_devmem_list;
230 	vmm_devmem_entry_t *de = NULL;
231 	const off_t map_end = off + len;
232 
233 	VERIFY(off >= VM_DEVMEM_START);
234 
235 	if (map_end < off) {
236 		/* No match on overflow */
237 		return (B_FALSE);
238 	}
239 
240 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
241 		const off_t item_end = de->vde_off + de->vde_len;
242 
243 		if (de->vde_off <= off && item_end >= map_end) {
244 			*segidp = de->vde_segid;
245 			*map_offp = off - de->vde_off;
246 			return (B_TRUE);
247 		}
248 	}
249 	return (B_FALSE);
250 }
251 
252 /*
253  * When an instance is being destroyed, the devmem list of named memory objects
254  * can be torn down, as no new mappings are allowed.
255  */
256 static void
257 vmmdev_devmem_purge(vmm_softc_t *sc)
258 {
259 	vmm_devmem_entry_t *entry;
260 
261 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
262 		kmem_free(entry, sizeof (*entry));
263 	}
264 }
265 
266 static int
267 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
268 {
269 	int error;
270 	bool sysmem = true;
271 
272 	if (VM_MEMSEG_NAME(mseg)) {
273 		sysmem = false;
274 	}
275 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
276 
277 	if (error == 0) {
278 		/*
279 		 * Rather than create a whole fresh device from which userspace
280 		 * can mmap this segment, instead make it available at an
281 		 * offset above where the main guest memory resides.
282 		 */
283 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
284 		if (error != 0) {
285 			vm_free_memseg(sc->vmm_vm, mseg->segid);
286 		}
287 	}
288 	return (error);
289 }
290 
291 /*
292  * Resource Locking and Exclusion
293  *
294  * Much of bhyve depends on key portions of VM state, such as the guest memory
295  * map, to remain unchanged while the guest is running.  As ported from
296  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
297  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
298  * performing the work of actually running the guest in VMX/SVM, would lock
299  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
300  * state, all of the vCPUs would be first locked, ensuring that the
301  * operation(s) could complete without any other threads stumbling into
302  * intermediate states.
303  *
304  * This approach is largely effective for bhyve.  Common operations, such as
305  * running the vCPUs, steer clear of lock contention.  The model begins to
306  * break down for operations which do not occur in the context of a specific
307  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
308  * thread in the bhyve process.  In order to properly protect those vCPU-less
309  * operations from encountering invalid states, additional locking is required.
310  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
311  * It does mean that class of operations will be serialized on locking the
312  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
313  * undue contention on the VM_MAXCPU-1 vCPU.
314  *
315  * In order to address the shortcomings of this model, the concept of a
316  * read/write lock has been added to bhyve.  Operations which change
317  * fundamental aspects of a VM (such as the memory map) must acquire the write
318  * lock, which also implies locking all of the vCPUs and waiting for all read
319  * lock holders to release.  While it increases the cost and waiting time for
320  * those few operations, it allows most hot-path operations on the VM (which
321  * depend on its configuration remaining stable) to occur with minimal locking.
322  *
323  * Consumers of the Driver API (see below) are a special case when it comes to
324  * this locking, since they may hold a read lock via the drv_lease mechanism
325  * for an extended period of time.  Rather than forcing those consumers to
326  * continuously poll for a write lock attempt, the lease system forces them to
327  * provide a release callback to trigger their clean-up (and potential later
328  * reacquisition) of the read lock.
329  */
330 
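/*
 * A rough sketch of the lease life cycle described above, from the point of
 * view of a vmm_drv consumer (entry point names such as
 * vmm_drv_lease_sign()/vmm_drv_lease_break() are assumed from the vmm_drv
 * interface; 'my_expire_cb' and 'my_arg' are illustrative):
 *
 *	lease = vmm_drv_lease_sign(hold, my_expire_cb, my_arg);
 *	... access guest state under the read lock conferred by the lease ...
 *	vmm_drv_lease_break(hold, lease);
 *
 * When a write lock is sought, my_expire_cb() is called and the consumer is
 * expected to stop using the lease and break it, reacquiring one later if
 * needed.
 */
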
331 static void
332 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
333 {
334 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
335 
336 	/*
337 	 * Since this state transition uses from_idle=true, it should not
338 	 * fail, but rather block until it can succeed.
339 	 */
340 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
341 }
342 
343 static void
344 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
345 {
346 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
347 
348 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
349 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
350 }
351 
352 static void
353 vmm_read_lock(vmm_softc_t *sc)
354 {
355 	rw_enter(&sc->vmm_rwlock, RW_READER);
356 }
357 
358 static void
359 vmm_read_unlock(vmm_softc_t *sc)
360 {
361 	rw_exit(&sc->vmm_rwlock);
362 }
363 
364 static void
365 vmm_write_lock(vmm_softc_t *sc)
366 {
367 	int maxcpus;
368 
369 	/* First lock all the vCPUs */
370 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
371 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
372 		vcpu_lock_one(sc, vcpu);
373 	}
374 
375 	/*
376 	 * Block vmm_drv leases from being acquired or held while the VM write
377 	 * lock is held.
378 	 */
379 	vmm_lease_block(sc);
380 
381 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
382 	/*
383 	 * For now, the 'maxcpus' value for an instance is fixed at the
384 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
385 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
386 	 * of the write lock will need to be wary of such changes.
387 	 */
388 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
389 }
390 
391 static void
392 vmm_write_unlock(vmm_softc_t *sc)
393 {
394 	int maxcpus;
395 
396 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
397 	vmm_lease_unblock(sc);
398 
399 	/*
400 	 * The VM write lock _must_ be released from the same thread it was
401 	 * acquired in, unlike the read lock.
402 	 */
403 	VERIFY(rw_write_held(&sc->vmm_rwlock));
404 	rw_exit(&sc->vmm_rwlock);
405 
406 	/* Unlock all the vCPUs */
407 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 		vcpu_unlock_one(sc, vcpu);
410 	}
411 }
412 
413 static int
414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415     cred_t *credp, int *rvalp)
416 {
417 	int error = 0, vcpu = -1;
418 	void *datap = (void *)arg;
419 	enum vm_lock_type {
420 		LOCK_NONE = 0,
421 		LOCK_VCPU,
422 		LOCK_READ_HOLD,
423 		LOCK_WRITE_HOLD
424 	} lock_type = LOCK_NONE;
425 
426 	/* Acquire any exclusion resources needed for the operation. */
427 	switch (cmd) {
428 	case VM_RUN:
429 	case VM_GET_REGISTER:
430 	case VM_SET_REGISTER:
431 	case VM_GET_SEGMENT_DESCRIPTOR:
432 	case VM_SET_SEGMENT_DESCRIPTOR:
433 	case VM_GET_REGISTER_SET:
434 	case VM_SET_REGISTER_SET:
435 	case VM_INJECT_EXCEPTION:
436 	case VM_GET_CAPABILITY:
437 	case VM_SET_CAPABILITY:
438 	case VM_PPTDEV_MSI:
439 	case VM_PPTDEV_MSIX:
440 	case VM_SET_X2APIC_STATE:
441 	case VM_GLA2GPA:
442 	case VM_GLA2GPA_NOFAULT:
443 	case VM_ACTIVATE_CPU:
444 	case VM_SET_INTINFO:
445 	case VM_GET_INTINFO:
446 	case VM_RESTART_INSTRUCTION:
447 	case VM_SET_KERNEMU_DEV:
448 	case VM_GET_KERNEMU_DEV:
449 	case VM_RESET_CPU:
450 	case VM_GET_RUN_STATE:
451 	case VM_SET_RUN_STATE:
452 	case VM_GET_FPU:
453 	case VM_SET_FPU:
454 	case VM_GET_CPUID:
455 	case VM_SET_CPUID:
456 	case VM_LEGACY_CPUID:
457 		/*
458 		 * Copy in the ID of the vCPU chosen for this operation.
459 		 * Since a nefarious caller could update their struct between
460 		 * this locking and when the rest of the ioctl data is copied
461 		 * in, it is _critical_ that this local 'vcpu' variable be used
462 		 * rather than the in-struct one when performing the ioctl.
463 		 */
464 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
465 			return (EFAULT);
466 		}
467 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
468 			return (EINVAL);
469 		}
470 		vcpu_lock_one(sc, vcpu);
471 		lock_type = LOCK_VCPU;
472 		break;
473 
474 	case VM_REINIT:
475 	case VM_BIND_PPTDEV:
476 	case VM_UNBIND_PPTDEV:
477 	case VM_MAP_PPTDEV_MMIO:
478 	case VM_UNMAP_PPTDEV_MMIO:
479 	case VM_ALLOC_MEMSEG:
480 	case VM_MMAP_MEMSEG:
481 	case VM_MUNMAP_MEMSEG:
482 	case VM_WRLOCK_CYCLE:
483 	case VM_PMTMR_LOCATE:
484 		vmm_write_lock(sc);
485 		lock_type = LOCK_WRITE_HOLD;
486 		break;
487 
488 	case VM_GET_MEMSEG:
489 	case VM_MMAP_GETNEXT:
490 	case VM_LAPIC_IRQ:
491 	case VM_INJECT_NMI:
492 	case VM_IOAPIC_ASSERT_IRQ:
493 	case VM_IOAPIC_DEASSERT_IRQ:
494 	case VM_IOAPIC_PULSE_IRQ:
495 	case VM_LAPIC_MSI:
496 	case VM_LAPIC_LOCAL_IRQ:
497 	case VM_GET_X2APIC_STATE:
498 	case VM_RTC_READ:
499 	case VM_RTC_WRITE:
500 	case VM_RTC_SETTIME:
501 	case VM_RTC_GETTIME:
502 	case VM_PPTDEV_DISABLE_MSIX:
503 	case VM_DEVMEM_GETOFFSET:
504 	case VM_TRACK_DIRTY_PAGES:
505 		vmm_read_lock(sc);
506 		lock_type = LOCK_READ_HOLD;
507 		break;
508 
509 	case VM_DATA_READ:
510 	case VM_DATA_WRITE:
511 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
512 			return (EFAULT);
513 		}
514 		if (vcpu == -1) {
515 			/* Access data for VM-wide devices */
516 			vmm_write_lock(sc);
517 			lock_type = LOCK_WRITE_HOLD;
518 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
519 			/* Access data associated with a specific vCPU */
520 			vcpu_lock_one(sc, vcpu);
521 			lock_type = LOCK_VCPU;
522 		} else {
523 			return (EINVAL);
524 		}
525 		break;
526 
527 	case VM_GET_GPA_PMAP:
528 	case VM_IOAPIC_PINCOUNT:
529 	case VM_SUSPEND:
530 	case VM_DESC_FPU_AREA:
531 	case VM_SET_AUTODESTRUCT:
532 	case VM_DESTROY_SELF:
533 	case VM_DESTROY_PENDING:
534 	default:
535 		break;
536 	}
537 
538 	/* Execute the primary logic for the ioctl. */
539 	switch (cmd) {
540 	case VM_RUN: {
541 		struct vm_entry entry;
542 
543 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
544 			error = EFAULT;
545 			break;
546 		}
547 
548 		if (!(curthread->t_schedflag & TS_VCPU))
549 			smt_mark_as_vcpu();
550 
551 		error = vm_run(sc->vmm_vm, vcpu, &entry);
552 
553 		/*
554 		 * Unexpected states in vm_run() are expressed through positive
555 		 * errno-oriented return values.  VM states which expect further
556 		 * processing in userspace (necessary context via exitinfo) are
557 		 * expressed through negative return values.  For the time being
558 		 * a return value of 0 is not expected from vm_run().
559 		 */
560 		ASSERT(error != 0);
561 		if (error < 0) {
562 			const struct vm_exit *vme;
563 			void *outp = entry.exit_data;
564 
565 			error = 0;
566 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
567 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
568 				error = EFAULT;
569 			}
570 		}
571 		break;
572 	}
573 	case VM_SUSPEND: {
574 		struct vm_suspend vmsuspend;
575 
576 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
577 			error = EFAULT;
578 			break;
579 		}
580 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
581 		break;
582 	}
583 	case VM_REINIT: {
584 		struct vm_reinit reinit;
585 
586 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
587 			error = EFAULT;
588 			break;
589 		}
590 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
591 			/*
592 			 * The VM instance should be free of driver-attached
593 			 * hooks during the reinitialization process.
594 			 */
595 			break;
596 		}
597 		error = vm_reinit(sc->vmm_vm, reinit.flags);
598 		(void) vmm_drv_block_hook(sc, B_FALSE);
599 		break;
600 	}
601 	case VM_STAT_DESC: {
602 		struct vm_stat_desc statdesc;
603 
604 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
605 			error = EFAULT;
606 			break;
607 		}
608 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
609 		    sizeof (statdesc.desc));
610 		if (error == 0 &&
611 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
612 			error = EFAULT;
613 			break;
614 		}
615 		break;
616 	}
617 	case VM_STATS_IOC: {
618 		struct vm_stats vmstats;
619 
620 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
621 			error = EFAULT;
622 			break;
623 		}
624 		hrt2tv(gethrtime(), &vmstats.tv);
625 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
626 		    nitems(vmstats.statbuf),
627 		    &vmstats.num_entries, vmstats.statbuf);
628 		if (error == 0 &&
629 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
630 			error = EFAULT;
631 			break;
632 		}
633 		break;
634 	}
635 
636 	case VM_PPTDEV_MSI: {
637 		struct vm_pptdev_msi pptmsi;
638 
639 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
640 			error = EFAULT;
641 			break;
642 		}
643 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
644 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
645 		break;
646 	}
647 	case VM_PPTDEV_MSIX: {
648 		struct vm_pptdev_msix pptmsix;
649 
650 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
651 			error = EFAULT;
652 			break;
653 		}
654 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
655 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
656 		    pptmsix.vector_control);
657 		break;
658 	}
659 	case VM_PPTDEV_DISABLE_MSIX: {
660 		struct vm_pptdev pptdev;
661 
662 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
663 			error = EFAULT;
664 			break;
665 		}
666 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
667 		break;
668 	}
669 	case VM_MAP_PPTDEV_MMIO: {
670 		struct vm_pptdev_mmio pptmmio;
671 
672 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
673 			error = EFAULT;
674 			break;
675 		}
676 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
677 		    pptmmio.len, pptmmio.hpa);
678 		break;
679 	}
680 	case VM_UNMAP_PPTDEV_MMIO: {
681 		struct vm_pptdev_mmio pptmmio;
682 
683 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
684 			error = EFAULT;
685 			break;
686 		}
687 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
688 		    pptmmio.len);
689 		break;
690 	}
691 	case VM_BIND_PPTDEV: {
692 		struct vm_pptdev pptdev;
693 
694 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
695 			error = EFAULT;
696 			break;
697 		}
698 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
699 		break;
700 	}
701 	case VM_UNBIND_PPTDEV: {
702 		struct vm_pptdev pptdev;
703 
704 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
705 			error = EFAULT;
706 			break;
707 		}
708 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
709 		break;
710 	}
711 	case VM_GET_PPTDEV_LIMITS: {
712 		struct vm_pptdev_limits pptlimits;
713 
714 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
715 			error = EFAULT;
716 			break;
717 		}
718 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
719 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
720 		if (error == 0 &&
721 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
722 			error = EFAULT;
723 			break;
724 		}
725 		break;
726 	}
727 	case VM_INJECT_EXCEPTION: {
728 		struct vm_exception vmexc;
729 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
730 			error = EFAULT;
731 			break;
732 		}
733 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
734 		    vmexc.error_code_valid != 0, vmexc.error_code,
735 		    vmexc.restart_instruction != 0);
736 		break;
737 	}
738 	case VM_INJECT_NMI: {
739 		struct vm_nmi vmnmi;
740 
741 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
742 			error = EFAULT;
743 			break;
744 		}
745 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
746 		break;
747 	}
748 	case VM_LAPIC_IRQ: {
749 		struct vm_lapic_irq vmirq;
750 
751 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
752 			error = EFAULT;
753 			break;
754 		}
755 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
756 		break;
757 	}
758 	case VM_LAPIC_LOCAL_IRQ: {
759 		struct vm_lapic_irq vmirq;
760 
761 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
762 			error = EFAULT;
763 			break;
764 		}
765 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
766 		    vmirq.vector);
767 		break;
768 	}
769 	case VM_LAPIC_MSI: {
770 		struct vm_lapic_msi vmmsi;
771 
772 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
773 			error = EFAULT;
774 			break;
775 		}
776 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
777 		break;
778 	}
779 
780 	case VM_IOAPIC_ASSERT_IRQ: {
781 		struct vm_ioapic_irq ioapic_irq;
782 
783 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
784 			error = EFAULT;
785 			break;
786 		}
787 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
788 		break;
789 	}
790 	case VM_IOAPIC_DEASSERT_IRQ: {
791 		struct vm_ioapic_irq ioapic_irq;
792 
793 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
794 			error = EFAULT;
795 			break;
796 		}
797 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
798 		break;
799 	}
800 	case VM_IOAPIC_PULSE_IRQ: {
801 		struct vm_ioapic_irq ioapic_irq;
802 
803 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
804 			error = EFAULT;
805 			break;
806 		}
807 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
808 		break;
809 	}
810 	case VM_IOAPIC_PINCOUNT: {
811 		int pincount;
812 
813 		pincount = vioapic_pincount(sc->vmm_vm);
814 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
815 			error = EFAULT;
816 			break;
817 		}
818 		break;
819 	}
820 	case VM_DESC_FPU_AREA: {
821 		struct vm_fpu_desc desc;
822 		void *buf = NULL;
823 
824 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
825 			error = EFAULT;
826 			break;
827 		}
828 		if (desc.vfd_num_entries > 64) {
829 			error = EINVAL;
830 			break;
831 		}
832 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
833 		    desc.vfd_num_entries;
834 		if (buf_sz != 0) {
835 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
836 		}
837 
838 		/*
839 		 * For now, we are depending on vm_fpu_desc_entry and
840 		 * hma_xsave_state_desc_t having the same format.
841 		 */
842 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
843 		    sizeof (hma_xsave_state_desc_t));
844 
845 		size_t req_size;
846 		const uint_t max_entries = hma_fpu_describe_xsave_state(
847 		    (hma_xsave_state_desc_t *)buf,
848 		    desc.vfd_num_entries,
849 		    &req_size);
850 
851 		desc.vfd_req_size = req_size;
852 		desc.vfd_num_entries = max_entries;
853 		if (buf_sz != 0) {
854 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
855 				error = EFAULT;
856 			}
857 			kmem_free(buf, buf_sz);
858 		}
859 
860 		if (error == 0) {
861 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
862 				error = EFAULT;
863 			}
864 		}
865 		break;
866 	}
867 	case VM_SET_AUTODESTRUCT: {
868 		/*
869 		 * Since this has to do with controlling the lifetime of the
870 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
871 		 * than the vcpu-centric or rwlock exclusion mechanisms.
872 		 */
873 		mutex_enter(&vmm_mtx);
874 		if (arg != 0) {
875 			sc->vmm_flags |= VMM_AUTODESTROY;
876 		} else {
877 			sc->vmm_flags &= ~VMM_AUTODESTROY;
878 		}
879 		mutex_exit(&vmm_mtx);
880 		break;
881 	}
882 	case VM_DESTROY_SELF: {
883 		bool hma_release = false;
884 
885 		/*
886 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
887 		 * itself, rather than having to perform a racy name lookup as
888 		 * part of the destroy process.
889 		 *
890 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
891 		 * order to kick the vCPUs out of guest context as part of any
892 		 * destruction, we do not need to worry about doing so
893 		 * ourselves via the `lock_type` logic here.
894 		 */
895 		mutex_enter(&vmm_mtx);
896 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
897 		mutex_exit(&vmm_mtx);
898 		if (hma_release) {
899 			vmm_hma_release();
900 		}
901 		break;
902 	}
903 	case VM_DESTROY_PENDING: {
904 		/*
905 		 * If we have made it this far, then destruction of the instance
906 		 * has not been initiated.
907 		 */
908 		*rvalp = 0;
909 		break;
910 	}
911 
912 	case VM_ISA_ASSERT_IRQ: {
913 		struct vm_isa_irq isa_irq;
914 
915 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
916 			error = EFAULT;
917 			break;
918 		}
919 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
920 		if (error == 0 && isa_irq.ioapic_irq != -1) {
921 			error = vioapic_assert_irq(sc->vmm_vm,
922 			    isa_irq.ioapic_irq);
923 		}
924 		break;
925 	}
926 	case VM_ISA_DEASSERT_IRQ: {
927 		struct vm_isa_irq isa_irq;
928 
929 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
930 			error = EFAULT;
931 			break;
932 		}
933 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
934 		if (error == 0 && isa_irq.ioapic_irq != -1) {
935 			error = vioapic_deassert_irq(sc->vmm_vm,
936 			    isa_irq.ioapic_irq);
937 		}
938 		break;
939 	}
940 	case VM_ISA_PULSE_IRQ: {
941 		struct vm_isa_irq isa_irq;
942 
943 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
944 			error = EFAULT;
945 			break;
946 		}
947 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
948 		if (error == 0 && isa_irq.ioapic_irq != -1) {
949 			error = vioapic_pulse_irq(sc->vmm_vm,
950 			    isa_irq.ioapic_irq);
951 		}
952 		break;
953 	}
954 	case VM_ISA_SET_IRQ_TRIGGER: {
955 		struct vm_isa_irq_trigger isa_irq_trigger;
956 
957 		if (ddi_copyin(datap, &isa_irq_trigger,
958 		    sizeof (isa_irq_trigger), md)) {
959 			error = EFAULT;
960 			break;
961 		}
962 		error = vatpic_set_irq_trigger(sc->vmm_vm,
963 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
964 		break;
965 	}
966 
967 	case VM_MMAP_GETNEXT: {
968 		struct vm_memmap mm;
969 
970 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
971 			error = EFAULT;
972 			break;
973 		}
974 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
975 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
976 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
977 			error = EFAULT;
978 			break;
979 		}
980 		break;
981 	}
982 	case VM_MMAP_MEMSEG: {
983 		struct vm_memmap mm;
984 
985 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
986 			error = EFAULT;
987 			break;
988 		}
989 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
990 		    mm.len, mm.prot, mm.flags);
991 		break;
992 	}
993 	case VM_MUNMAP_MEMSEG: {
994 		struct vm_munmap mu;
995 
996 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
997 			error = EFAULT;
998 			break;
999 		}
1000 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1001 		break;
1002 	}
1003 	case VM_ALLOC_MEMSEG: {
1004 		struct vm_memseg vmseg;
1005 
1006 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1007 			error = EFAULT;
1008 			break;
1009 		}
1010 		error = vmmdev_alloc_memseg(sc, &vmseg);
1011 		break;
1012 	}
1013 	case VM_GET_MEMSEG: {
1014 		struct vm_memseg vmseg;
1015 
1016 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1017 			error = EFAULT;
1018 			break;
1019 		}
1020 		error = vmmdev_get_memseg(sc, &vmseg);
1021 		if (error == 0 &&
1022 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1023 			error = EFAULT;
1024 			break;
1025 		}
1026 		break;
1027 	}
1028 	case VM_GET_REGISTER: {
1029 		struct vm_register vmreg;
1030 
1031 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1032 			error = EFAULT;
1033 			break;
1034 		}
1035 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1036 		    &vmreg.regval);
1037 		if (error == 0 &&
1038 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1039 			error = EFAULT;
1040 			break;
1041 		}
1042 		break;
1043 	}
1044 	case VM_SET_REGISTER: {
1045 		struct vm_register vmreg;
1046 
1047 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1048 			error = EFAULT;
1049 			break;
1050 		}
1051 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1052 		    vmreg.regval);
1053 		break;
1054 	}
1055 	case VM_SET_SEGMENT_DESCRIPTOR: {
1056 		struct vm_seg_desc vmsegd;
1057 
1058 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1059 			error = EFAULT;
1060 			break;
1061 		}
1062 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1063 		    &vmsegd.desc);
1064 		break;
1065 	}
1066 	case VM_GET_SEGMENT_DESCRIPTOR: {
1067 		struct vm_seg_desc vmsegd;
1068 
1069 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1070 			error = EFAULT;
1071 			break;
1072 		}
1073 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1074 		    &vmsegd.desc);
1075 		if (error == 0 &&
1076 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1077 			error = EFAULT;
1078 			break;
1079 		}
1080 		break;
1081 	}
1082 	case VM_GET_REGISTER_SET: {
1083 		struct vm_register_set vrs;
1084 		int regnums[VM_REG_LAST];
1085 		uint64_t regvals[VM_REG_LAST];
1086 
1087 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1088 			error = EFAULT;
1089 			break;
1090 		}
1091 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1092 			error = EINVAL;
1093 			break;
1094 		}
1095 		if (ddi_copyin(vrs.regnums, regnums,
1096 		    sizeof (int) * vrs.count, md)) {
1097 			error = EFAULT;
1098 			break;
1099 		}
1100 
1101 		error = 0;
1102 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1103 			if (regnums[i] < 0) {
1104 				error = EINVAL;
1105 				break;
1106 			}
1107 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1108 			    &regvals[i]);
1109 		}
1110 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1111 		    sizeof (uint64_t) * vrs.count, md)) {
1112 			error = EFAULT;
1113 		}
1114 		break;
1115 	}
1116 	case VM_SET_REGISTER_SET: {
1117 		struct vm_register_set vrs;
1118 		int regnums[VM_REG_LAST];
1119 		uint64_t regvals[VM_REG_LAST];
1120 
1121 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1122 			error = EFAULT;
1123 			break;
1124 		}
1125 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1126 			error = EINVAL;
1127 			break;
1128 		}
1129 		if (ddi_copyin(vrs.regnums, regnums,
1130 		    sizeof (int) * vrs.count, md)) {
1131 			error = EFAULT;
1132 			break;
1133 		}
1134 		if (ddi_copyin(vrs.regvals, regvals,
1135 		    sizeof (uint64_t) * vrs.count, md)) {
1136 			error = EFAULT;
1137 			break;
1138 		}
1139 
1140 		error = 0;
1141 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1142 			/*
1143 			 * Setting registers in a set is not atomic, since a
1144 			 * failure in the middle of the set will cause a
1145 			 * bail-out and inconsistent register state.  Callers
1146 			 * should be wary of this.
1147 			 */
1148 			if (regnums[i] < 0) {
1149 				error = EINVAL;
1150 				break;
1151 			}
1152 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1153 			    regvals[i]);
1154 		}
1155 		break;
1156 	}
1157 	case VM_RESET_CPU: {
1158 		struct vm_vcpu_reset vvr;
1159 
1160 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1161 			error = EFAULT;
1162 			break;
1163 		}
1164 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1165 			error = EINVAL;
1166 			break;
1167 		}
1168 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1169 		break;
1170 	}
1171 	case VM_GET_RUN_STATE: {
1172 		struct vm_run_state vrs;
1173 
1174 		bzero(&vrs, sizeof (vrs));
1175 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1176 		    &vrs.sipi_vector);
1177 		if (error == 0) {
1178 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1179 				error = EFAULT;
1180 				break;
1181 			}
1182 		}
1183 		break;
1184 	}
1185 	case VM_SET_RUN_STATE: {
1186 		struct vm_run_state vrs;
1187 
1188 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1189 			error = EFAULT;
1190 			break;
1191 		}
1192 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1193 		    vrs.sipi_vector);
1194 		break;
1195 	}
1196 	case VM_GET_FPU: {
1197 		struct vm_fpu_state req;
1198 		const size_t max_len = (PAGESIZE * 2);
1199 		void *kbuf;
1200 
1201 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1202 			error = EFAULT;
1203 			break;
1204 		}
1205 		if (req.len > max_len || req.len == 0) {
1206 			error = EINVAL;
1207 			break;
1208 		}
1209 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1210 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1211 		if (error == 0) {
1212 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1213 				error = EFAULT;
1214 			}
1215 		}
1216 		kmem_free(kbuf, req.len);
1217 		break;
1218 	}
1219 	case VM_SET_FPU: {
1220 		struct vm_fpu_state req;
1221 		const size_t max_len = (PAGESIZE * 2);
1222 		void *kbuf;
1223 
1224 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1225 			error = EFAULT;
1226 			break;
1227 		}
1228 		if (req.len > max_len || req.len == 0) {
1229 			error = EINVAL;
1230 			break;
1231 		}
1232 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1233 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1234 			error = EFAULT;
1235 		} else {
1236 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1237 		}
1238 		kmem_free(kbuf, req.len);
1239 		break;
1240 	}
1241 	case VM_GET_CPUID: {
1242 		struct vm_vcpu_cpuid_config cfg;
1243 		struct vcpu_cpuid_entry *entries = NULL;
1244 
1245 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1246 			error = EFAULT;
1247 			break;
1248 		}
1249 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1250 			error = EINVAL;
1251 			break;
1252 		}
1253 
1254 		const size_t entries_size =
1255 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1256 		if (entries_size != 0) {
1257 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1258 		}
1259 
1260 		vcpu_cpuid_config_t vm_cfg = {
1261 			.vcc_nent = cfg.vvcc_nent,
1262 			.vcc_entries = entries,
1263 		};
1264 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1265 
1266 		/*
1267 		 * Only attempt to copy out the resultant entries if we were
1268 		 * able to query them from the instance.  The flags and number
1269 		 * of entries are emitted regardless.
1270 		 */
1271 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1272 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1273 		if (entries != NULL) {
1274 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1275 			    entries_size, md) != 0) {
1276 				error = EFAULT;
1277 			}
1278 
1279 			kmem_free(entries, entries_size);
1280 		}
1281 
1282 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1283 			error = EFAULT;
1284 		}
1285 		break;
1286 	}
1287 	case VM_SET_CPUID: {
1288 		struct vm_vcpu_cpuid_config cfg;
1289 		struct vcpu_cpuid_entry *entries = NULL;
1290 		size_t entries_size = 0;
1291 
1292 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1293 			error = EFAULT;
1294 			break;
1295 		}
1296 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1297 			error = EFBIG;
1298 			break;
1299 		}
1300 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1301 			/*
1302 			 * If we are being instructed to use "legacy" handling,
1303 			 * then no entries should be provided, since the static
1304 			 * in-kernel masking will be used.
1305 			 */
1306 			if (cfg.vvcc_nent != 0) {
1307 				error = EINVAL;
1308 				break;
1309 			}
1310 		} else if (cfg.vvcc_nent != 0) {
1311 			entries_size =
1312 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1313 			entries = kmem_alloc(entries_size, KM_SLEEP);
1314 
1315 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1316 			    md) != 0) {
1317 				error = EFAULT;
1318 				kmem_free(entries, entries_size);
1319 				break;
1320 			}
1321 		}
1322 
1323 		vcpu_cpuid_config_t vm_cfg = {
1324 			.vcc_flags = cfg.vvcc_flags,
1325 			.vcc_nent = cfg.vvcc_nent,
1326 			.vcc_entries = entries,
1327 		};
1328 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1329 
1330 		if (entries != NULL) {
1331 			kmem_free(entries, entries_size);
1332 		}
1333 		break;
1334 	}
1335 	case VM_LEGACY_CPUID: {
1336 		struct vm_legacy_cpuid vlc;
1337 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1338 			error = EFAULT;
1339 			break;
1340 		}
1341 		vlc.vlc_vcpuid = vcpu;
1342 
1343 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1344 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1345 
1346 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1347 			error = EFAULT;
1348 			break;
1349 		}
1350 		break;
1351 	}
1352 
1353 	case VM_SET_KERNEMU_DEV:
1354 	case VM_GET_KERNEMU_DEV: {
1355 		struct vm_readwrite_kernemu_device kemu;
1356 		size_t size = 0;
1357 
1358 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1359 			error = EFAULT;
1360 			break;
1361 		}
1362 
1363 		if (kemu.access_width > 3) {
1364 			error = EINVAL;
1365 			break;
1366 		}
1367 		size = (1 << kemu.access_width);
1368 		ASSERT(size >= 1 && size <= 8);
1369 
1370 		if (cmd == VM_SET_KERNEMU_DEV) {
1371 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1372 			    kemu.gpa, kemu.value, size);
1373 		} else {
1374 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1375 			    kemu.gpa, &kemu.value, size);
1376 		}
1377 
1378 		if (error == 0) {
1379 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1380 				error = EFAULT;
1381 				break;
1382 			}
1383 		}
1384 		break;
1385 	}
1386 
1387 	case VM_GET_CAPABILITY: {
1388 		struct vm_capability vmcap;
1389 
1390 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1391 			error = EFAULT;
1392 			break;
1393 		}
1394 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1395 		    &vmcap.capval);
1396 		if (error == 0 &&
1397 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1398 			error = EFAULT;
1399 			break;
1400 		}
1401 		break;
1402 	}
1403 	case VM_SET_CAPABILITY: {
1404 		struct vm_capability vmcap;
1405 
1406 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1407 			error = EFAULT;
1408 			break;
1409 		}
1410 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1411 		    vmcap.capval);
1412 		break;
1413 	}
1414 	case VM_SET_X2APIC_STATE: {
1415 		struct vm_x2apic x2apic;
1416 
1417 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1418 			error = EFAULT;
1419 			break;
1420 		}
1421 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1422 		break;
1423 	}
1424 	case VM_GET_X2APIC_STATE: {
1425 		struct vm_x2apic x2apic;
1426 
1427 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1428 			error = EFAULT;
1429 			break;
1430 		}
1431 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1432 		    &x2apic.state);
1433 		if (error == 0 &&
1434 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1435 			error = EFAULT;
1436 			break;
1437 		}
1438 		break;
1439 	}
1440 	case VM_GET_GPA_PMAP: {
1441 		/*
1442 		 * Until there is a necessity to leak EPT/RVI PTE values to
1443 		 * userspace, this will remain unimplemented.
1444 		 */
1445 		error = EINVAL;
1446 		break;
1447 	}
1448 	case VM_GET_HPET_CAPABILITIES: {
1449 		struct vm_hpet_cap hpetcap;
1450 
1451 		error = vhpet_getcap(&hpetcap);
1452 		if (error == 0 &&
1453 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1454 			error = EFAULT;
1455 			break;
1456 		}
1457 		break;
1458 	}
1459 	case VM_GLA2GPA: {
1460 		struct vm_gla2gpa gg;
1461 
1462 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1463 			error = EFAULT;
1464 			break;
1465 		}
1466 		gg.vcpuid = vcpu;
1467 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1468 		    gg.prot, &gg.gpa, &gg.fault);
1469 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1470 			error = EFAULT;
1471 			break;
1472 		}
1473 		break;
1474 	}
1475 	case VM_GLA2GPA_NOFAULT: {
1476 		struct vm_gla2gpa gg;
1477 
1478 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1479 			error = EFAULT;
1480 			break;
1481 		}
1482 		gg.vcpuid = vcpu;
1483 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1484 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1485 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1486 			error = EFAULT;
1487 			break;
1488 		}
1489 		break;
1490 	}
1491 
1492 	case VM_ACTIVATE_CPU:
1493 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1494 		break;
1495 
1496 	case VM_SUSPEND_CPU:
1497 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1498 			error = EFAULT;
1499 		} else {
1500 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1501 		}
1502 		break;
1503 
1504 	case VM_RESUME_CPU:
1505 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1506 			error = EFAULT;
1507 		} else {
1508 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1509 		}
1510 		break;
1511 
1512 	case VM_GET_CPUS: {
1513 		struct vm_cpuset vm_cpuset;
1514 		cpuset_t tempset;
1515 		void *srcp = &tempset;
1516 		int size;
1517 
1518 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1519 			error = EFAULT;
1520 			break;
1521 		}
1522 
1523 		/* Be more generous about sizing since our cpuset_t is large. */
1524 		size = vm_cpuset.cpusetsize;
1525 		if (size <= 0 || size > sizeof (cpuset_t)) {
1526 			error = ERANGE;
1527 		}
1528 		/*
1529 		 * If they want a ulong_t or less, make sure they receive the
1530 		 * low bits with all the useful information.
1531 		 */
1532 		if (size <= sizeof (tempset.cpub[0])) {
1533 			srcp = &tempset.cpub[0];
1534 		}
1535 
1536 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1537 			tempset = vm_active_cpus(sc->vmm_vm);
1538 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1539 			tempset = vm_suspended_cpus(sc->vmm_vm);
1540 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1541 			tempset = vm_debug_cpus(sc->vmm_vm);
1542 		} else {
1543 			error = EINVAL;
1544 		}
1545 
1546 		ASSERT(size > 0 && size <= sizeof (tempset));
1547 		if (error == 0 &&
1548 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1549 			error = EFAULT;
1550 			break;
1551 		}
1552 		break;
1553 	}
1554 	case VM_SET_INTINFO: {
1555 		struct vm_intinfo vmii;
1556 
1557 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1558 			error = EFAULT;
1559 			break;
1560 		}
1561 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1562 		break;
1563 	}
1564 	case VM_GET_INTINFO: {
1565 		struct vm_intinfo vmii;
1566 
1567 		vmii.vcpuid = vcpu;
1568 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1569 		    &vmii.info2);
1570 		if (error == 0 &&
1571 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1572 			error = EFAULT;
1573 			break;
1574 		}
1575 		break;
1576 	}
1577 	case VM_RTC_WRITE: {
1578 		struct vm_rtc_data rtcdata;
1579 
1580 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1581 			error = EFAULT;
1582 			break;
1583 		}
1584 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1585 		    rtcdata.value);
1586 		break;
1587 	}
1588 	case VM_RTC_READ: {
1589 		struct vm_rtc_data rtcdata;
1590 
1591 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1592 			error = EFAULT;
1593 			break;
1594 		}
1595 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1596 		    &rtcdata.value);
1597 		if (error == 0 &&
1598 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1599 			error = EFAULT;
1600 			break;
1601 		}
1602 		break;
1603 	}
1604 	case VM_RTC_SETTIME: {
1605 		struct vm_rtc_time rtctime;
1606 
1607 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1608 			error = EFAULT;
1609 			break;
1610 		}
1611 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1612 		break;
1613 	}
1614 	case VM_RTC_GETTIME: {
1615 		struct vm_rtc_time rtctime;
1616 
1617 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1618 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1619 			error = EFAULT;
1620 			break;
1621 		}
1622 		break;
1623 	}
1624 
1625 	case VM_PMTMR_LOCATE: {
1626 		uint16_t port = arg;
1627 		error = vpmtmr_set_location(sc->vmm_vm, port);
1628 		break;
1629 	}
1630 
1631 	case VM_RESTART_INSTRUCTION:
1632 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1633 		break;
1634 
1635 	case VM_SET_TOPOLOGY: {
1636 		struct vm_cpu_topology topo;
1637 
1638 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1639 			error = EFAULT;
1640 			break;
1641 		}
1642 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1643 		    topo.threads, topo.maxcpus);
1644 		break;
1645 	}
1646 	case VM_GET_TOPOLOGY: {
1647 		struct vm_cpu_topology topo;
1648 
1649 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1650 		    &topo.threads, &topo.maxcpus);
1651 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1652 			error = EFAULT;
1653 			break;
1654 		}
1655 		break;
1656 	}
1657 	case VM_DEVMEM_GETOFFSET: {
1658 		struct vm_devmem_offset vdo;
1659 		vmm_devmem_entry_t *de;
1660 
1661 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1662 			error = EFAULT;
1663 			break;
1664 		}
1665 
1666 		de = vmmdev_devmem_find(sc, vdo.segid);
1667 		if (de != NULL) {
1668 			vdo.offset = de->vde_off;
1669 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1670 				error = EFAULT;
1671 			}
1672 		} else {
1673 			error = ENOENT;
1674 		}
1675 		break;
1676 	}
1677 	case VM_TRACK_DIRTY_PAGES: {
1678 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1679 		struct vmm_dirty_tracker tracker;
1680 		uint8_t *bitmap;
1681 		size_t len;
1682 
1683 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1684 			error = EFAULT;
1685 			break;
1686 		}
1687 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1688 			error = EINVAL;
1689 			break;
1690 		}
1691 		if (tracker.vdt_len == 0) {
1692 			break;
1693 		}
1694 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1695 			error = EINVAL;
1696 			break;
1697 		}
1698 		if (tracker.vdt_len > max_track_region_len) {
1699 			error = EINVAL;
1700 			break;
1701 		}
1702 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1703 		bitmap = kmem_zalloc(len, KM_SLEEP);
1704 		vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1705 		    tracker.vdt_len, bitmap);
1706 		if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1707 			error = EFAULT;
1708 		}
1709 		kmem_free(bitmap, len);
1710 
1711 		break;
1712 	}
1713 	case VM_WRLOCK_CYCLE: {
1714 		/*
1715 		 * Present a test mechanism to acquire/release the write lock
1716 		 * on the VM without any other effects.
1717 		 */
1718 		break;
1719 	}
1720 	case VM_DATA_READ: {
1721 		struct vm_data_xfer vdx;
1722 
1723 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1724 			error = EFAULT;
1725 			break;
1726 		}
1727 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1728 			error = EINVAL;
1729 			break;
1730 		}
1731 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1732 			error = EFBIG;
1733 			break;
1734 		}
1735 
1736 		const size_t len = vdx.vdx_len;
1737 		void *buf = NULL;
1738 		if (len != 0) {
1739 			buf = kmem_alloc(len, KM_SLEEP);
1740 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1741 				bzero(buf, len);
1742 			} else if (ddi_copyin(vdx.vdx_data, buf, len,
1743 			    md) != 0) {
1744 				kmem_free(buf, len);
1745 				error = EFAULT;
1746 				break;
1747 			}
1748 		}
1749 
1750 		vdx.vdx_result_len = 0;
1751 		vmm_data_req_t req = {
1752 			.vdr_class = vdx.vdx_class,
1753 			.vdr_version = vdx.vdx_version,
1754 			.vdr_flags = vdx.vdx_flags,
1755 			.vdr_len = len,
1756 			.vdr_data = buf,
1757 			.vdr_result_len = &vdx.vdx_result_len,
1758 		};
1759 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1760 
1761 		if (error == 0 && buf != NULL) {
1762 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1763 				error = EFAULT;
1764 			}
1765 		}
1766 
1767 		/*
1768 		 * Copy out the transfer request so that the value of
1769 		 * vdx_result_len can be made available, regardless of any
1770 		 * error(s) which may have occurred.
1771 		 */
1772 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1773 			error = (error != 0) ? error : EFAULT;
1774 		}
1775 
1776 		if (buf != NULL) {
1777 			kmem_free(buf, len);
1778 		}
1779 		break;
1780 	}
1781 	case VM_DATA_WRITE: {
1782 		struct vm_data_xfer vdx;
1783 
1784 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1785 			error = EFAULT;
1786 			break;
1787 		}
1788 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1789 			error = EINVAL;
1790 			break;
1791 		}
1792 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1793 			error = EFBIG;
1794 			break;
1795 		}
1796 
1797 		const size_t len = vdx.vdx_len;
1798 		void *buf = NULL;
1799 		if (len != 0) {
1800 			buf = kmem_alloc(len, KM_SLEEP);
1801 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1802 				kmem_free(buf, len);
1803 				error = EFAULT;
1804 				break;
1805 			}
1806 		}
1807 
1808 		vdx.vdx_result_len = 0;
1809 		vmm_data_req_t req = {
1810 			.vdr_class = vdx.vdx_class,
1811 			.vdr_version = vdx.vdx_version,
1812 			.vdr_flags = vdx.vdx_flags,
1813 			.vdr_len = len,
1814 			.vdr_data = buf,
1815 			.vdr_result_len = &vdx.vdx_result_len,
1816 		};
1817 		if (vmm_allow_state_writes == 0) {
1818 			/* XXX: Play it safe for now */
1819 			error = EPERM;
1820 		} else {
1821 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1822 			    &req);
1823 		}
1824 
1825 		if (error == 0 && buf != NULL &&
1826 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1827 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1828 				error = EFAULT;
1829 			}
1830 		}
1831 
1832 		/*
1833 		 * Copy out the transfer request so that the value of
1834 		 * vdx_result_len can be made available, regardless of any
1835 		 * error(s) which may have occurred.
1836 		 */
1837 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1838 			error = (error != 0) ? error : EFAULT;
1839 		}
1840 
1841 		if (buf != NULL) {
1842 			kmem_free(buf, len);
1843 		}
1844 		break;
1845 	}
1846 
1847 	default:
1848 		error = ENOTTY;
1849 		break;
1850 	}
1851 
1852 	/* Release exclusion resources */
1853 	switch (lock_type) {
1854 	case LOCK_NONE:
1855 		break;
1856 	case LOCK_VCPU:
1857 		vcpu_unlock_one(sc, vcpu);
1858 		break;
1859 	case LOCK_READ_HOLD:
1860 		vmm_read_unlock(sc);
1861 		break;
1862 	case LOCK_WRITE_HOLD:
1863 		vmm_write_unlock(sc);
1864 		break;
1865 	default:
1866 		panic("unexpected lock type");
1867 		break;
1868 	}
1869 
1870 	return (error);
1871 }
1872 
1873 static vmm_softc_t *
1874 vmm_lookup(const char *name)
1875 {
1876 	list_t *vml = &vmm_list;
1877 	vmm_softc_t *sc;
1878 
1879 	ASSERT(MUTEX_HELD(&vmm_mtx));
1880 
1881 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1882 		if (strcmp(sc->vmm_name, name) == 0) {
1883 			break;
1884 		}
1885 	}
1886 
1887 	return (sc);
1888 }
1889 
1890 /*
1891  * Acquire an HMA registration if not already held.
1892  */
1893 static boolean_t
1894 vmm_hma_acquire(void)
1895 {
1896 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1897 
1898 	mutex_enter(&vmmdev_mtx);
1899 
1900 	if (vmmdev_hma_reg == NULL) {
1901 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1902 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1903 		if (vmmdev_hma_reg == NULL) {
1904 			cmn_err(CE_WARN, "%s HMA registration failed.",
1905 			    vmmdev_hvm_name);
1906 			mutex_exit(&vmmdev_mtx);
1907 			return (B_FALSE);
1908 		}
1909 	}
1910 
1911 	vmmdev_hma_ref++;
1912 
1913 	mutex_exit(&vmmdev_mtx);
1914 
1915 	return (B_TRUE);
1916 }
1917 
1918 /*
1919  * Release the HMA registration if held and there are no remaining VMs.
1920  */
1921 static void
1922 vmm_hma_release(void)
1923 {
1924 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1925 
1926 	mutex_enter(&vmmdev_mtx);
1927 
1928 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1929 
1930 	vmmdev_hma_ref--;
1931 
1932 	if (vmmdev_hma_ref == 0) {
1933 		VERIFY(vmmdev_hma_reg != NULL);
1934 		hma_unregister(vmmdev_hma_reg);
1935 		vmmdev_hma_reg = NULL;
1936 	}
1937 	mutex_exit(&vmmdev_mtx);
1938 }
1939 
1940 static int
1941 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1942 {
1943 	vmm_softc_t	*sc = NULL;
1944 	minor_t		minor;
1945 	int		error = ENOMEM;
1946 	size_t		len;
1947 	const char	*name = req->name;
1948 
1949 	len = strnlen(name, VM_MAX_NAMELEN);
1950 	if (len == 0) {
1951 		return (EINVAL);
1952 	}
1953 	if (len >= VM_MAX_NAMELEN) {
1954 		return (ENAMETOOLONG);
1955 	}
1956 	if (strchr(name, '/') != NULL) {
1957 		return (EINVAL);
1958 	}
1959 
1960 	if (!vmm_hma_acquire())
1961 		return (ENXIO);
1962 
1963 	mutex_enter(&vmm_mtx);
1964 
1965 	/* Look for duplicate names */
1966 	if (vmm_lookup(name) != NULL) {
1967 		mutex_exit(&vmm_mtx);
1968 		vmm_hma_release();
1969 		return (EEXIST);
1970 	}
1971 
1972 	/* Allow only one instance per non-global zone. */
1973 	if (!INGLOBALZONE(curproc)) {
1974 		for (sc = list_head(&vmm_list); sc != NULL;
1975 		    sc = list_next(&vmm_list, sc)) {
1976 			if (sc->vmm_zone == curzone) {
1977 				mutex_exit(&vmm_mtx);
1978 				vmm_hma_release();
1979 				return (EINVAL);
1980 			}
1981 		}
1982 	}
1983 
1984 	minor = id_alloc(vmm_minors);
1985 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1986 		goto fail;
1987 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1988 		ddi_soft_state_free(vmm_statep, minor);
1989 		goto fail;
1990 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1991 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1992 		goto fail;
1993 	}
1994 
1995 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1996 		goto fail;
1997 	}
1998 
1999 	error = vm_create(req->flags, &sc->vmm_vm);
2000 	if (error == 0) {
2001 		/* Complete VM initialization and report success. */
2002 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2003 		sc->vmm_minor = minor;
2004 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2005 		    offsetof(vmm_devmem_entry_t, vde_node));
2006 
2007 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2008 		    offsetof(vmm_hold_t, vmh_node));
2009 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2010 
2011 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2012 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2013 		    offsetof(vmm_lease_t, vml_node));
2014 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2015 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2016 
2017 		sc->vmm_zone = crgetzone(cr);
2018 		zone_hold(sc->vmm_zone);
2019 		vmm_zsd_add_vm(sc);
2020 		vmm_kstat_init(sc);
2021 
2022 		list_insert_tail(&vmm_list, sc);
2023 		mutex_exit(&vmm_mtx);
2024 		return (0);
2025 	}
2026 
2027 	vmm_kstat_fini(sc);
2028 	ddi_remove_minor_node(vmmdev_dip, name);
2029 fail:
2030 	id_free(vmm_minors, minor);
2031 	if (sc != NULL) {
2032 		ddi_soft_state_free(vmm_statep, minor);
2033 	}
2034 	mutex_exit(&vmm_mtx);
2035 	vmm_hma_release();
2036 
2037 	return (error);
2038 }
2039 
2040 /*
2041  * Bhyve 'Driver' Interface
2042  *
2043  * While many devices are emulated in the bhyve userspace process, there are
2044  * others with performance constraints which require that they run mostly or
2045  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2046  * needed so they can query/manipulate the portions of VM state needed to
2047  * fulfill their purpose.
2048  *
2049  * This includes:
2050  * - Translating guest-physical addresses to host-virtual pointers
2051  * - Injecting MSIs
2052  * - Hooking IO port addresses
2053  *
2054  * The vmm_drv interface exists to provide that functionality to its consumers.
2055  * (At this time, 'viona' is the only user)
2056  */
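
/*
 * Illustrative sketch of the consumer-side lifecycle, loosely modeled on a
 * hypothetical in-kernel device emulator.  The file_t, callback, argument,
 * and error handling below are assumptions for illustration, not part of
 * this driver:
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		... quiesce any data path using the lease; the lease is
 *		    then broken below via vmm_drv_lease_break() ...
 *		return (B_FALSE);
 *	}
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *
 *	if (vmm_drv_hold(fp, credp, &hold) != 0)
 *		return (ENXIO);
 *	if ((lease = vmm_drv_lease_sign(hold, example_lease_expired,
 *	    arg)) == NULL) {
 *		vmm_drv_rele(hold);
 *		return (ENXIO);
 *	}
 *	... use the lease until vmm_drv_lease_expired() reports expiry or
 *	    vmm_drv_release_reqd() asks for the hold to be dropped ...
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 */
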
2057 int
2058 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2059 {
2060 	vnode_t *vp = fp->f_vnode;
2061 	const dev_t dev = vp->v_rdev;
2062 	vmm_softc_t *sc;
2063 	vmm_hold_t *hold;
2064 	int err = 0;
2065 
2066 	if (vp->v_type != VCHR) {
2067 		return (ENXIO);
2068 	}
2069 	const major_t major = getmajor(dev);
2070 	const minor_t minor = getminor(dev);
2071 
2072 	mutex_enter(&vmmdev_mtx);
2073 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2074 		mutex_exit(&vmmdev_mtx);
2075 		return (ENOENT);
2076 	}
2077 	mutex_enter(&vmm_mtx);
2078 	mutex_exit(&vmmdev_mtx);
2079 
2080 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2081 		err = ENOENT;
2082 		goto out;
2083 	}
2084 	/* XXXJOY: check cred permissions against instance */
2085 
2086 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2087 		err = EBUSY;
2088 		goto out;
2089 	}
2090 
2091 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2092 	hold->vmh_sc = sc;
2093 	hold->vmh_release_req = B_FALSE;
2094 
2095 	list_insert_tail(&sc->vmm_holds, hold);
2096 	sc->vmm_flags |= VMM_HELD;
2097 	*holdp = hold;
2098 
2099 out:
2100 	mutex_exit(&vmm_mtx);
2101 	return (err);
2102 }
2103 
2104 void
2105 vmm_drv_rele(vmm_hold_t *hold)
2106 {
2107 	vmm_softc_t *sc;
2108 	bool hma_release = false;
2109 
2110 	ASSERT(hold != NULL);
2111 	ASSERT(hold->vmh_sc != NULL);
2112 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2113 
2114 	mutex_enter(&vmm_mtx);
2115 	sc = hold->vmh_sc;
2116 	list_remove(&sc->vmm_holds, hold);
2117 	kmem_free(hold, sizeof (*hold));
2118 
2119 	if (list_is_empty(&sc->vmm_holds)) {
2120 		sc->vmm_flags &= ~VMM_HELD;
2121 
2122 		/*
2123 		 * Since outstanding holds would prevent instance destruction
2124 		 * from completing, attempt to finish it now if it was already
2125 		 * set in motion.
2126 		 */
2127 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2128 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2129 			    &hma_release));
2130 		}
2131 	}
2132 	mutex_exit(&vmm_mtx);
2133 
2134 	if (hma_release) {
2135 		vmm_hma_release();
2136 	}
2137 }
2138 
2139 boolean_t
2140 vmm_drv_release_reqd(vmm_hold_t *hold)
2141 {
2142 	ASSERT(hold != NULL);
2143 
2144 	return (hold->vmh_release_req);
2145 }
2146 
2147 vmm_lease_t *
2148 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2149 {
2150 	vmm_softc_t *sc = hold->vmh_sc;
2151 	vmm_lease_t *lease;
2152 
2153 	ASSERT3P(expiref, !=, NULL);
2154 
2155 	if (hold->vmh_release_req) {
2156 		return (NULL);
2157 	}
2158 
2159 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2160 	list_link_init(&lease->vml_node);
2161 	lease->vml_expire_func = expiref;
2162 	lease->vml_expire_arg = arg;
2163 	lease->vml_expired = B_FALSE;
2164 	lease->vml_break_deferred = B_FALSE;
2165 	lease->vml_hold = hold;
2166 	/* cache the VM pointer for one less pointer chase */
2167 	lease->vml_vm = sc->vmm_vm;
2168 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2169 
2170 	mutex_enter(&sc->vmm_lease_lock);
2171 	while (sc->vmm_lease_blocker != 0) {
2172 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2173 	}
2174 	list_insert_tail(&sc->vmm_lease_list, lease);
2175 	vmm_read_lock(sc);
2176 	mutex_exit(&sc->vmm_lease_lock);
2177 
2178 	return (lease);
2179 }
2180 
2181 static void
2182 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2183 {
2184 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2185 
2186 	list_remove(&sc->vmm_lease_list, lease);
2187 	vmm_read_unlock(sc);
2188 	vmc_destroy(lease->vml_vmclient);
2189 	kmem_free(lease, sizeof (*lease));
2190 }
2191 
2192 static void
2193 vmm_lease_block(vmm_softc_t *sc)
2194 {
2195 	mutex_enter(&sc->vmm_lease_lock);
2196 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2197 	sc->vmm_lease_blocker++;
2198 	if (sc->vmm_lease_blocker == 1) {
2199 		list_t *list = &sc->vmm_lease_list;
2200 		vmm_lease_t *lease = list_head(list);
2201 
2202 		while (lease != NULL) {
2203 			void *arg = lease->vml_expire_arg;
2204 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2205 			boolean_t sync_break = B_FALSE;
2206 
2207 			/*
2208 			 * Since the lease expiration notification may
2209 			 * need to take locks which would deadlock with
2210 			 * vmm_lease_lock, drop it across the call.
2211 			 *
2212 			 * We are the only one allowed to manipulate
2213 			 * vmm_lease_list right now, so it is safe to
2214 			 * continue iterating through it after
2215 			 * reacquiring the lock.
2216 			 */
2217 			lease->vml_expired = B_TRUE;
2218 			mutex_exit(&sc->vmm_lease_lock);
2219 			sync_break = expiref(arg);
2220 			mutex_enter(&sc->vmm_lease_lock);
2221 
2222 			if (sync_break) {
2223 				vmm_lease_t *next;
2224 
2225 				/*
2226 				 * Leases broken synchronously here result in
2227 				 * vmm_read_unlock() calls from a thread other
2228 				 * than the one which took the corresponding
2229 				 * vmm_read_lock().  This is acceptable, given
2230 				 * that the rwlock underpinning the whole
2231 				 * mechanism tolerates the behavior.  This
2232 				 * flexibility is _only_ afforded to VM read
2233 				 * lock (RW_READER) holders.
2234 				 */
2235 				next = list_next(list, lease);
2236 				vmm_lease_break_locked(sc, lease);
2237 				lease = next;
2238 			} else {
2239 				lease = list_next(list, lease);
2240 			}
2241 		}
2242 
2243 		/* Process leases which were not broken synchronously. */
2244 		while (!list_is_empty(list)) {
2245 			/*
2246 			 * Although the nested loops are quadratic, the number
2247 			 * of leases is small.
2248 			 */
2249 			lease = list_head(list);
2250 			while (lease != NULL) {
2251 				vmm_lease_t *next = list_next(list, lease);
2252 				if (lease->vml_break_deferred) {
2253 					vmm_lease_break_locked(sc, lease);
2254 				}
2255 				lease = next;
2256 			}
2257 			if (list_is_empty(list)) {
2258 				break;
2259 			}
2260 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2261 		}
2262 		/* Wake anyone else waiting for the lease list to be empty */
2263 		cv_broadcast(&sc->vmm_lease_cv);
2264 	} else {
2265 		list_t *list = &sc->vmm_lease_list;
2266 
2267 		/*
2268 		 * Some other thread beat us to the duty of lease cleanup.
2269 		 * Wait until that is complete.
2270 		 */
2271 		while (!list_is_empty(list)) {
2272 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2273 		}
2274 	}
2275 	mutex_exit(&sc->vmm_lease_lock);
2276 }
2277 
2278 static void
2279 vmm_lease_unblock(vmm_softc_t *sc)
2280 {
2281 	mutex_enter(&sc->vmm_lease_lock);
2282 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2283 	sc->vmm_lease_blocker--;
2284 	if (sc->vmm_lease_blocker == 0) {
2285 		cv_broadcast(&sc->vmm_lease_cv);
2286 	}
2287 	mutex_exit(&sc->vmm_lease_lock);
2288 }
2289 
2290 void
2291 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2292 {
2293 	vmm_softc_t *sc = hold->vmh_sc;
2294 
2295 	VERIFY3P(hold, ==, lease->vml_hold);
2296 	VERIFY(!lease->vml_break_deferred);
2297 
2298 	mutex_enter(&sc->vmm_lease_lock);
2299 	if (sc->vmm_lease_blocker == 0) {
2300 		vmm_lease_break_locked(sc, lease);
2301 	} else {
2302 		/*
2303 		 * Defer the lease-breaking to whichever thread is currently
2304 		 * cleaning up all leases as part of a vmm_lease_block() call.
2305 		 */
2306 		lease->vml_break_deferred = B_TRUE;
2307 		cv_broadcast(&sc->vmm_lease_cv);
2308 	}
2309 	mutex_exit(&sc->vmm_lease_lock);
2310 }
2311 
2312 boolean_t
2313 vmm_drv_lease_expired(vmm_lease_t *lease)
2314 {
2315 	return (lease->vml_expired);
2316 }
2317 
2318 vmm_page_t *
2319 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2320 {
2321 	ASSERT(lease != NULL);
2322 	ASSERT0(gpa & PAGEOFFSET);
2323 
2324 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2325 }
2326 
2327 void
2328 vmm_drv_page_release(vmm_page_t *vmmp)
2329 {
2330 	(void) vmp_release((vm_page_t *)vmmp);
2331 }
2332 
2333 void
2334 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2335 {
2336 	(void) vmp_release_chain((vm_page_t *)vmmp);
2337 }
2338 
2339 const void *
2340 vmm_drv_page_readable(const vmm_page_t *vmmp)
2341 {
2342 	return (vmp_get_readable((const vm_page_t *)vmmp));
2343 }
2344 
2345 void *
2346 vmm_drv_page_writable(const vmm_page_t *vmmp)
2347 {
2348 	return (vmp_get_writable((const vm_page_t *)vmmp));
2349 }
2350 
2351 void
2352 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2353 {
2354 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2355 }
2356 
2357 vmm_page_t *
2358 vmm_drv_page_next(const vmm_page_t *vmmp)
2359 {
2360 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2361 }
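
/*
 * Data-path sketch of the page accessors above, using a hypothetical
 * unexpired lease and a page-aligned guest-physical address (gpa):
 *
 *	vmm_page_t *pg;
 *	const uint32_t *datap;
 *
 *	pg = vmm_drv_page_hold(lease, gpa, PROT_READ);
 *	datap = vmm_drv_page_readable(pg);
 *	... consume *datap ...
 *	vmm_drv_page_release(pg);
 *
 * Pages held for a larger transfer can be linked together with
 * vmm_drv_page_chain() and then released in one call with
 * vmm_drv_page_release_chain().
 */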
2362 
2363 int
2364 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2365 {
2366 	ASSERT(lease != NULL);
2367 
2368 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2369 }
2370 
2371 int
2372 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2373     void *arg, void **cookie)
2374 {
2375 	vmm_softc_t *sc;
2376 	int err;
2377 
2378 	ASSERT(hold != NULL);
2379 	ASSERT(cookie != NULL);
2380 
2381 	sc = hold->vmh_sc;
2382 	mutex_enter(&vmm_mtx);
2383 	/* Confirm that hook installation is not blocked */
2384 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2385 		mutex_exit(&vmm_mtx);
2386 		return (EBUSY);
2387 	}
2388 	/*
2389 	 * Optimistically record an installed hook which will prevent a block
2390 	 * from being asserted while the mutex is dropped.
2391 	 */
2392 	hold->vmh_ioport_hook_cnt++;
2393 	mutex_exit(&vmm_mtx);
2394 
2395 	vmm_write_lock(sc);
2396 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2397 	    arg, cookie);
2398 	vmm_write_unlock(sc);
2399 
2400 	if (err != 0) {
2401 		mutex_enter(&vmm_mtx);
2402 		/* Walk back optimism about the hook installation */
2403 		hold->vmh_ioport_hook_cnt--;
2404 		mutex_exit(&vmm_mtx);
2405 	}
2406 	return (err);
2407 }
2408 
2409 void
2410 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2411 {
2412 	vmm_softc_t *sc;
2413 
2414 	ASSERT(hold != NULL);
2415 	ASSERT(cookie != NULL);
2416 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2417 
2418 	sc = hold->vmh_sc;
2419 	vmm_write_lock(sc);
2420 	vm_ioport_unhook(sc->vmm_vm, cookie);
2421 	vmm_write_unlock(sc);
2422 
2423 	mutex_enter(&vmm_mtx);
2424 	hold->vmh_ioport_hook_cnt--;
2425 	mutex_exit(&vmm_mtx);
2426 }
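
/*
 * Sketch of pairing the two calls above; the port number and handler are
 * hypothetical:
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, 0x510, example_iop_handler, arg,
 *	    &cookie) != 0) {
 *		... EBUSY is returned while hook installation is blocked ...
 *	}
 *	... handler runs for guest in/out accesses to the port ...
 *	vmm_drv_ioport_unhook(hold, &cookie);
 */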
2427 
2428 static void
2429 vmm_drv_purge(vmm_softc_t *sc)
2430 {
2431 	ASSERT(MUTEX_HELD(&vmm_mtx));
2432 
2433 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2434 		vmm_hold_t *hold;
2435 
2436 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2437 		    hold = list_next(&sc->vmm_holds, hold)) {
2438 			hold->vmh_release_req = B_TRUE;
2439 		}
2440 
2441 		/*
2442 		 * Require that all leases on the instance be broken, now that
2443 		 * all associated holds have been marked as needing release.
2444 		 *
2445 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2446 		 * lessees are slow to respond, it would be nice to leave it
2447 		 * available for other parties.
2448 		 */
2449 		mutex_exit(&vmm_mtx);
2450 		vmm_lease_block(sc);
2451 		vmm_lease_unblock(sc);
2452 		mutex_enter(&vmm_mtx);
2453 	}
2454 }
2455 
2456 static int
2457 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2458 {
2459 	int err = 0;
2460 
2461 	mutex_enter(&vmm_mtx);
2462 	if (!enable_block) {
2463 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2464 
2465 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2466 		goto done;
2467 	}
2468 
2469 	/* If any holds have hooks installed, the block is a failure */
2470 	if (!list_is_empty(&sc->vmm_holds)) {
2471 		vmm_hold_t *hold;
2472 
2473 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2474 		    hold = list_next(&sc->vmm_holds, hold)) {
2475 			if (hold->vmh_ioport_hook_cnt != 0) {
2476 				err = EBUSY;
2477 				goto done;
2478 			}
2479 		}
2480 	}
2481 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2482 
2483 done:
2484 	mutex_exit(&vmm_mtx);
2485 	return (err);
2486 }
2487 
2488 
2489 static void
2490 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2491 {
2492 	ASSERT(MUTEX_HELD(&vmm_mtx));
2493 	ASSERT0(sc->vmm_flags & VMM_DESTROY);
2494 
2495 	sc->vmm_flags |= VMM_DESTROY;
2496 
2497 	/*
2498 	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2499 	 * of guest context, being unable to return now that the instance is
2500 	 * marked for destruction.
2501 	 */
2502 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2503 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2504 		vcpu_lock_one(sc, vcpu);
2505 		vcpu_unlock_one(sc, vcpu);
2506 	}
2507 
2508 	vmmdev_devmem_purge(sc);
2509 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2510 		/*
2511 		 * The ZSD should be cleaned up now, unless destruction of the
2512 		 * instance was initiated by destruction of the containing zone,
2513 		 * in which case the ZSD has already been removed.
2514 		 */
2515 		vmm_zsd_rem_vm(sc);
2516 	}
2517 	zone_rele(sc->vmm_zone);
2518 
2519 	vmm_drv_purge(sc);
2520 }
2521 
2522 static bool
2523 vmm_destroy_ready(vmm_softc_t *sc)
2524 {
2525 	ASSERT(MUTEX_HELD(&vmm_mtx));
2526 
2527 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2528 		VERIFY(list_is_empty(&sc->vmm_holds));
2529 		return (true);
2530 	}
2531 
2532 	return (false);
2533 }
2534 
2535 static void
2536 vmm_destroy_finish(vmm_softc_t *sc)
2537 {
2538 	ASSERT(MUTEX_HELD(&vmm_mtx));
2539 	ASSERT(vmm_destroy_ready(sc));
2540 
2541 	list_remove(&vmm_list, sc);
2542 	vmm_kstat_fini(sc);
2543 	vm_destroy(sc->vmm_vm);
2544 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2545 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2546 
2547 	const minor_t minor = sc->vmm_minor;
2548 	ddi_soft_state_free(vmm_statep, minor);
2549 	id_free(vmm_minors, minor);
2550 }
2551 
2552 /*
2553  * Initiate or attempt to finish destruction of a VMM instance.
2554  *
2555  * This is called from several contexts:
2556  * - An explicit destroy ioctl is made
2557  * - A vmm_drv consumer releases its hold (being the last on the instance)
2558  * - The vmm device is closed, and auto-destruct is enabled
2559  */
2560 static int
2561 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2562     bool *hma_release)
2563 {
2564 	ASSERT(MUTEX_HELD(&vmm_mtx));
2565 
2566 	*hma_release = false;
2567 
2568 	/*
2569 	 * Once instance destruction begins, the instance is marked so that any
2570 	 * further requests to operate on it will fail.
2571 	 */
2572 	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2573 		vmm_destroy_begin(sc, opts);
2574 	}
2575 
2576 	if (vmm_destroy_ready(sc)) {
2577 
2578 		/*
2579 		 * Notify anyone waiting for the destruction to finish.  They
2580 		 * must be clear before we can safely tear down the softc.
2581 		 */
2582 		if (sc->vmm_destroy_waiters != 0) {
2583 			cv_broadcast(&sc->vmm_cv);
2584 			while (sc->vmm_destroy_waiters != 0) {
2585 				cv_wait(&sc->vmm_cv, &vmm_mtx);
2586 			}
2587 		}
2588 
2589 		/*
2590 		 * Finish destruction of instance.  After this point, the softc
2591 		 * is freed and cannot be accessed again.
2592 		 *
2593 		 * With destruction complete, the HMA hold can be released
2594 		 */
2595 		vmm_destroy_finish(sc);
2596 		*hma_release = true;
2597 		return (0);
2598 	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2599 		int err = 0;
2600 
2601 		sc->vmm_destroy_waiters++;
2602 		while (!vmm_destroy_ready(sc) && err == 0) {
2603 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2604 				err = EINTR;
2605 			}
2606 		}
2607 		sc->vmm_destroy_waiters--;
2608 
2609 		if (sc->vmm_destroy_waiters == 0) {
2610 			/*
2611 			 * If we were the last waiter, it could be that VM
2612 			 * destruction is waiting on _us_ to proceed with the
2613 			 * final clean-up.
2614 			 */
2615 			cv_signal(&sc->vmm_cv);
2616 		}
2617 		return (err);
2618 	} else {
2619 		/*
2620 		 * Since the instance is not ready for destruction, and the
2621 		 * caller did not ask to wait, consider it a success for now.
2622 		 */
2623 		return (0);
2624 	}
2625 }
2626 
2627 void
2628 vmm_zone_vm_destroy(vmm_softc_t *sc)
2629 {
2630 	bool hma_release = false;
2631 	int err;
2632 
2633 	mutex_enter(&vmm_mtx);
2634 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2635 	mutex_exit(&vmm_mtx);
2636 
2637 	VERIFY0(err);
2638 
2639 	if (hma_release) {
2640 		vmm_hma_release();
2641 	}
2642 }
2643 
2644 static int
2645 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2646 {
2647 	vmm_softc_t *sc;
2648 	bool hma_release = false;
2649 	int err;
2650 
2651 	if (crgetuid(cr) != 0) {
2652 		return (EPERM);
2653 	}
2654 
2655 	mutex_enter(&vmm_mtx);
2656 	sc = vmm_lookup(req->name);
2657 	if (sc == NULL) {
2658 		mutex_exit(&vmm_mtx);
2659 		return (ENOENT);
2660 	}
2661 	/*
2662 	 * We don't check this in vmm_lookup() since that function is also used
2663 	 * for validation during create and currently vmm names must be unique.
2664 	 */
2665 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2666 		mutex_exit(&vmm_mtx);
2667 		return (EPERM);
2668 	}
2669 
2670 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2671 	mutex_exit(&vmm_mtx);
2672 
2673 	if (hma_release) {
2674 		vmm_hma_release();
2675 	}
2676 
2677 	return (err);
2678 }
2679 
2680 #define	VCPU_NAME_BUFLEN	32
2681 
2682 static int
2683 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2684 {
2685 	zoneid_t zid = crgetzoneid(cr);
2686 	int instance = minor;
2687 	kstat_t *ksp;
2688 
2689 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2690 
2691 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2692 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2693 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2694 
2695 	if (ksp == NULL) {
2696 		return (-1);
2697 	}
2698 	sc->vmm_kstat_vm = ksp;
2699 
2700 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2701 		char namebuf[VCPU_NAME_BUFLEN];
2702 
2703 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2704 
2705 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2706 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2707 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2708 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2709 		    0, zid);
2710 		if (ksp == NULL) {
2711 			goto fail;
2712 		}
2713 
2714 		sc->vmm_kstat_vcpu[i] = ksp;
2715 	}
2716 
2717 	/*
2718 	 * If this instance is associated with a non-global zone, make its
2719 	 * kstats visible from the GZ.
2720 	 */
2721 	if (zid != GLOBAL_ZONEID) {
2722 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2723 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2724 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2725 		}
2726 	}
2727 
2728 	return (0);
2729 
2730 fail:
2731 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2732 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2733 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2734 			sc->vmm_kstat_vcpu[i] = NULL;
2735 		} else {
2736 			break;
2737 		}
2738 	}
2739 	kstat_delete(sc->vmm_kstat_vm);
2740 	sc->vmm_kstat_vm = NULL;
2741 	return (-1);
2742 }
2743 
2744 static void
2745 vmm_kstat_init(vmm_softc_t *sc)
2746 {
2747 	kstat_t *ksp;
2748 
2749 	ASSERT3P(sc->vmm_vm, !=, NULL);
2750 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2751 
2752 	ksp = sc->vmm_kstat_vm;
2753 	vmm_kstats_t *vk = ksp->ks_data;
2754 	ksp->ks_private = sc->vmm_vm;
2755 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2756 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2757 
2758 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2759 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2760 
2761 		ksp = sc->vmm_kstat_vcpu[i];
2762 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2763 
2764 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2765 		vvk->vvk_vcpu.value.ui32 = i;
2766 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2767 		    KSTAT_DATA_UINT64);
2768 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2769 		    KSTAT_DATA_UINT64);
2770 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2771 		    KSTAT_DATA_UINT64);
2772 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2773 		    KSTAT_DATA_UINT64);
2774 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2775 		    KSTAT_DATA_UINT64);
2776 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2777 		    KSTAT_DATA_UINT64);
2778 		ksp->ks_private = sc->vmm_vm;
2779 		ksp->ks_update = vmm_kstat_update_vcpu;
2780 	}
2781 
2782 	kstat_install(sc->vmm_kstat_vm);
2783 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2784 		kstat_install(sc->vmm_kstat_vcpu[i]);
2785 	}
2786 }
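
/*
 * Once installed, the kstats built above can be inspected from userspace
 * with the kstat utility; for example (assuming VMM_MODULE_NAME is "vmm"
 * and an instance using minor number 1):
 *
 *	$ kstat -m vmm -i 1 -n vm -s vm_name
 *	$ kstat -m vmm -i 1 -n vcpu0
 */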
2787 
2788 static void
2789 vmm_kstat_fini(vmm_softc_t *sc)
2790 {
2791 	ASSERT(sc->vmm_kstat_vm != NULL);
2792 
2793 	kstat_delete(sc->vmm_kstat_vm);
2794 	sc->vmm_kstat_vm = NULL;
2795 
2796 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2797 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2798 
2799 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2800 		sc->vmm_kstat_vcpu[i] = NULL;
2801 	}
2802 }
2803 
2804 static int
2805 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2806 {
2807 	minor_t		minor;
2808 	vmm_softc_t	*sc;
2809 
2810 	/*
2811 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2812 	 * verified to be safe.
2813 	 */
2814 	if (curproc->p_model != DATAMODEL_LP64) {
2815 		return (EFBIG);
2816 	}
2817 
2818 	minor = getminor(*devp);
2819 	if (minor == VMM_CTL_MINOR) {
2820 		/*
2821 		 * Master control device must be opened exclusively.
2822 		 */
2823 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2824 			return (EINVAL);
2825 		}
2826 
2827 		return (0);
2828 	}
2829 
2830 	mutex_enter(&vmm_mtx);
2831 	sc = ddi_get_soft_state(vmm_statep, minor);
2832 	if (sc == NULL) {
2833 		mutex_exit(&vmm_mtx);
2834 		return (ENXIO);
2835 	}
2836 
2837 	sc->vmm_flags |= VMM_IS_OPEN;
2838 	mutex_exit(&vmm_mtx);
2839 
2840 	return (0);
2841 }
2842 
2843 static int
2844 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2845 {
2846 	const minor_t minor = getminor(dev);
2847 	vmm_softc_t *sc;
2848 	bool hma_release = false;
2849 
2850 	if (minor == VMM_CTL_MINOR) {
2851 		return (0);
2852 	}
2853 
2854 	mutex_enter(&vmm_mtx);
2855 	sc = ddi_get_soft_state(vmm_statep, minor);
2856 	if (sc == NULL) {
2857 		mutex_exit(&vmm_mtx);
2858 		return (ENXIO);
2859 	}
2860 
2861 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2862 	sc->vmm_flags &= ~VMM_IS_OPEN;
2863 
2864 	/*
2865 	 * If the instance was marked for auto-destruction, begin that now.
2866 	 * Instance destruction may already have been initiated; try to make
2867 	 * progress here, since device closure is one of its requirements.
2868 	 */
2869 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2870 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2871 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2872 	}
2873 	mutex_exit(&vmm_mtx);
2874 
2875 	if (hma_release) {
2876 		vmm_hma_release();
2877 	}
2878 
2879 	return (0);
2880 }
2881 
2882 static int
2883 vmm_is_supported(intptr_t arg)
2884 {
2885 	int r;
2886 	const char *msg;
2887 
2888 	if (vmm_is_intel()) {
2889 		r = vmx_x86_supported(&msg);
2890 	} else if (vmm_is_svm()) {
2891 		/*
2892 		 * HMA already ensured that the features necessary for SVM
2893 		 * operation were present and online during vmm_attach().
2894 		 */
2895 		r = 0;
2896 	} else {
2897 		r = ENXIO;
2898 		msg = "Unsupported CPU vendor";
2899 	}
2900 
2901 	if (r != 0 && arg != (intptr_t)NULL) {
2902 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2903 			return (EFAULT);
2904 	}
2905 	return (r);
2906 }
2907 
2908 static int
2909 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2910 {
2911 	void *argp = (void *)arg;
2912 
2913 	switch (cmd) {
2914 	case VMM_CREATE_VM: {
2915 		struct vm_create_req req;
2916 
2917 		if ((md & FWRITE) == 0) {
2918 			return (EPERM);
2919 		}
2920 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2921 			return (EFAULT);
2922 		}
2923 		return (vmmdev_do_vm_create(&req, cr));
2924 	}
2925 	case VMM_DESTROY_VM: {
2926 		struct vm_destroy_req req;
2927 
2928 		if ((md & FWRITE) == 0) {
2929 			return (EPERM);
2930 		}
2931 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2932 			return (EFAULT);
2933 		}
2934 		return (vmmdev_do_vm_destroy(&req, cr));
2935 	}
2936 	case VMM_VM_SUPPORTED:
2937 		return (vmm_is_supported(arg));
2938 	case VMM_CHECK_IOMMU:
2939 		if (!vmm_check_iommu()) {
2940 			return (ENXIO);
2941 		}
2942 		return (0);
2943 	case VMM_RESV_QUERY:
2944 	case VMM_RESV_ADD:
2945 	case VMM_RESV_REMOVE:
2946 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2947 	default:
2948 		break;
2949 	}
2950 	/* No other actions are legal on ctl device */
2951 	return (ENOTTY);
2952 }
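
/*
 * Userspace sketch of driving the control device.  The device path is
 * assumed here to be /dev/vmmctl, and in practice a library wrapper would
 * normally issue these calls; error handling is omitted:
 *
 *	struct vm_create_req create = { 0 };
 *	int ctlfd, vers;
 *
 *	ctlfd = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *	vers = ioctl(ctlfd, VMM_INTERFACE_VERSION, 0);
 *	(void) strlcpy(create.name, "example-vm", sizeof (create.name));
 *	(void) ioctl(ctlfd, VMM_CREATE_VM, &create);
 */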
2953 
2954 static int
2955 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2956     int *rvalp)
2957 {
2958 	vmm_softc_t	*sc;
2959 	minor_t		minor;
2960 
2961 	/*
2962 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2963 	 * verified to be safe.
2964 	 */
2965 	if (curproc->p_model != DATAMODEL_LP64) {
2966 		return (EFBIG);
2967 	}
2968 
2969 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2970 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2971 		return (ENOTSUP);
2972 	}
2973 
2974 	/*
2975 	 * Regardless of minor (vmmctl or instance), we respond to queries of
2976 	 * the interface version.
2977 	 */
2978 	if (cmd == VMM_INTERFACE_VERSION) {
2979 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
2980 		return (0);
2981 	}
2982 
2983 	minor = getminor(dev);
2984 
2985 	if (minor == VMM_CTL_MINOR) {
2986 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2987 	}
2988 
2989 	sc = ddi_get_soft_state(vmm_statep, minor);
2990 	ASSERT(sc != NULL);
2991 
2992 	/*
2993 	 * Turn away any ioctls against an instance when it is being destroyed.
2994 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
2995 	 */
2996 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2997 		if (cmd == VM_DESTROY_PENDING) {
2998 			*rvalp = 1;
2999 			return (0);
3000 		}
3001 		return (ENXIO);
3002 	}
3003 
3004 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3005 }
3006 
3007 static int
3008 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3009     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3010 {
3011 	vmm_softc_t *sc;
3012 	const minor_t minor = getminor(dev);
3013 	int err;
3014 
3015 	if (minor == VMM_CTL_MINOR) {
3016 		return (ENODEV);
3017 	}
3018 	if (off < 0 || (off + len) <= 0) {
3019 		return (EINVAL);
3020 	}
3021 	if ((prot & PROT_USER) == 0) {
3022 		return (EACCES);
3023 	}
3024 
3025 	sc = ddi_get_soft_state(vmm_statep, minor);
3026 	ASSERT(sc);
3027 
3028 	if (sc->vmm_flags & VMM_DESTROY)
3029 		return (ENXIO);
3030 
3031 	/* Grab read lock on the VM to prevent any changes to the memory map */
3032 	vmm_read_lock(sc);
3033 
3034 	if (off >= VM_DEVMEM_START) {
3035 		int segid;
3036 		off_t segoff;
3037 
3038 		/* Mapping a devmem "device" */
3039 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3040 			err = ENODEV;
3041 		} else {
3042 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3043 			    addrp, prot, maxprot, flags);
3044 		}
3045 	} else {
3046 		/* Mapping a part of the guest physical space */
3047 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3048 		    maxprot, flags);
3049 	}
3050 
3051 	vmm_read_unlock(sc);
3052 	return (err);
3053 }
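
/*
 * Userspace sketch of the mapping behavior above, using a hypothetical
 * open VM instance fd (vmfd): offsets below VM_DEVMEM_START map the
 * guest-physical address space, while offsets at or above it map devmem
 * segments.  For example, assuming guest memory is configured at GPA 0,
 * the first 1 MiB could be mapped read/write with:
 *
 *	void *gpa0 = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, vmfd, 0);
 */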
3054 
3055 static sdev_plugin_validate_t
3056 vmm_sdev_validate(sdev_ctx_t ctx)
3057 {
3058 	const char *name = sdev_ctx_name(ctx);
3059 	vmm_softc_t *sc;
3060 	sdev_plugin_validate_t ret;
3061 	minor_t minor;
3062 
3063 	if (sdev_ctx_vtype(ctx) != VCHR)
3064 		return (SDEV_VTOR_INVALID);
3065 
3066 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3067 
3068 	mutex_enter(&vmm_mtx);
3069 	if ((sc = vmm_lookup(name)) == NULL)
3070 		ret = SDEV_VTOR_INVALID;
3071 	else if (sc->vmm_minor != minor)
3072 		ret = SDEV_VTOR_STALE;
3073 	else
3074 		ret = SDEV_VTOR_VALID;
3075 	mutex_exit(&vmm_mtx);
3076 
3077 	return (ret);
3078 }
3079 
3080 static int
3081 vmm_sdev_filldir(sdev_ctx_t ctx)
3082 {
3083 	vmm_softc_t *sc;
3084 	int ret;
3085 
3086 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3087 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3088 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3089 		return (EINVAL);
3090 	}
3091 
3092 	mutex_enter(&vmm_mtx);
3093 	ASSERT(vmmdev_dip != NULL);
3094 	for (sc = list_head(&vmm_list); sc != NULL;
3095 	    sc = list_next(&vmm_list, sc)) {
3096 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3097 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3098 			    S_IFCHR | 0600,
3099 			    makedevice(ddi_driver_major(vmmdev_dip),
3100 			    sc->vmm_minor));
3101 		} else {
3102 			continue;
3103 		}
3104 		if (ret != 0 && ret != EEXIST)
3105 			goto out;
3106 	}
3107 
3108 	ret = 0;
3109 
3110 out:
3111 	mutex_exit(&vmm_mtx);
3112 	return (ret);
3113 }
3114 
3115 /* ARGSUSED */
3116 static void
3117 vmm_sdev_inactive(sdev_ctx_t ctx)
3118 {
3119 }
3120 
3121 static sdev_plugin_ops_t vmm_sdev_ops = {
3122 	.spo_version = SDEV_PLUGIN_VERSION,
3123 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3124 	.spo_validate = vmm_sdev_validate,
3125 	.spo_filldir = vmm_sdev_filldir,
3126 	.spo_inactive = vmm_sdev_inactive
3127 };
3128 
3129 /* ARGSUSED */
3130 static int
3131 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3132 {
3133 	int error;
3134 
3135 	switch (cmd) {
3136 	case DDI_INFO_DEVT2DEVINFO:
3137 		*result = (void *)vmmdev_dip;
3138 		error = DDI_SUCCESS;
3139 		break;
3140 	case DDI_INFO_DEVT2INSTANCE:
3141 		*result = (void *)0;
3142 		error = DDI_SUCCESS;
3143 		break;
3144 	default:
3145 		error = DDI_FAILURE;
3146 		break;
3147 	}
3148 	return (error);
3149 }
3150 
3151 static int
3152 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3153 {
3154 	sdev_plugin_hdl_t sph;
3155 	hma_reg_t *reg = NULL;
3156 	boolean_t vmm_loaded = B_FALSE;
3157 
3158 	if (cmd != DDI_ATTACH) {
3159 		return (DDI_FAILURE);
3160 	}
3161 
3162 	mutex_enter(&vmmdev_mtx);
3163 	/* Ensure we are not already attached. */
3164 	if (vmmdev_dip != NULL) {
3165 		mutex_exit(&vmmdev_mtx);
3166 		return (DDI_FAILURE);
3167 	}
3168 
3169 	vmm_sol_glue_init();
3170 
3171 	/*
3172 	 * Perform temporary HMA registration to determine if the system
3173 	 * is capable.
3174 	 */
3175 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3176 		goto fail;
3177 	} else if (vmm_mod_load() != 0) {
3178 		goto fail;
3179 	}
3180 	vmm_loaded = B_TRUE;
3181 	hma_unregister(reg);
3182 	reg = NULL;
3183 
3184 	/* Create control node.  Other nodes will be created on demand. */
3185 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3186 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3187 		goto fail;
3188 	}
3189 
3190 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3191 	if (sph == (sdev_plugin_hdl_t)NULL) {
3192 		ddi_remove_minor_node(dip, NULL);
3193 		goto fail;
3194 	}
3195 
3196 	ddi_report_dev(dip);
3197 	vmmdev_sdev_hdl = sph;
3198 	vmmdev_dip = dip;
3199 	mutex_exit(&vmmdev_mtx);
3200 	return (DDI_SUCCESS);
3201 
3202 fail:
3203 	if (vmm_loaded) {
3204 		VERIFY0(vmm_mod_unload());
3205 	}
3206 	if (reg != NULL) {
3207 		hma_unregister(reg);
3208 	}
3209 	vmm_sol_glue_cleanup();
3210 	mutex_exit(&vmmdev_mtx);
3211 	return (DDI_FAILURE);
3212 }
3213 
3214 static int
3215 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3216 {
3217 	if (cmd != DDI_DETACH) {
3218 		return (DDI_FAILURE);
3219 	}
3220 
3221 	/*
3222 	 * Ensure that all resources have been cleaned up.
3223 	 *
3224 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3225 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3226 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3227 	 * devinfo, including our own, while holding vmmdev_mtx.
3228 	 */
3229 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3230 		return (DDI_FAILURE);
3231 
3232 	mutex_enter(&vmm_mtx);
3233 	if (!list_is_empty(&vmm_list)) {
3234 		mutex_exit(&vmm_mtx);
3235 		mutex_exit(&vmmdev_mtx);
3236 		return (DDI_FAILURE);
3237 	}
3238 	mutex_exit(&vmm_mtx);
3239 
3240 	if (!vmmr_is_empty()) {
3241 		mutex_exit(&vmmdev_mtx);
3242 		return (DDI_FAILURE);
3243 	}
3244 
3245 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3246 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3247 		mutex_exit(&vmmdev_mtx);
3248 		return (DDI_FAILURE);
3249 	}
3250 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3251 
3252 	/* Remove the control node. */
3253 	ddi_remove_minor_node(dip, "ctl");
3254 	vmmdev_dip = NULL;
3255 
3256 	VERIFY0(vmm_mod_unload());
3257 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3258 	vmm_sol_glue_cleanup();
3259 
3260 	mutex_exit(&vmmdev_mtx);
3261 
3262 	return (DDI_SUCCESS);
3263 }
3264 
3265 static struct cb_ops vmm_cb_ops = {
3266 	vmm_open,
3267 	vmm_close,
3268 	nodev,		/* strategy */
3269 	nodev,		/* print */
3270 	nodev,		/* dump */
3271 	nodev,		/* read */
3272 	nodev,		/* write */
3273 	vmm_ioctl,
3274 	nodev,		/* devmap */
3275 	nodev,		/* mmap */
3276 	vmm_segmap,
3277 	nochpoll,	/* poll */
3278 	ddi_prop_op,
3279 	NULL,
3280 	D_NEW | D_MP | D_DEVMAP
3281 };
3282 
3283 static struct dev_ops vmm_ops = {
3284 	DEVO_REV,
3285 	0,
3286 	vmm_info,
3287 	nulldev,	/* identify */
3288 	nulldev,	/* probe */
3289 	vmm_attach,
3290 	vmm_detach,
3291 	nodev,		/* reset */
3292 	&vmm_cb_ops,
3293 	(struct bus_ops *)NULL
3294 };
3295 
3296 static struct modldrv modldrv = {
3297 	&mod_driverops,
3298 	"bhyve vmm",
3299 	&vmm_ops
3300 };
3301 
3302 static struct modlinkage modlinkage = {
3303 	MODREV_1,
3304 	&modldrv,
3305 	NULL
3306 };
3307 
3308 int
3309 _init(void)
3310 {
3311 	int	error;
3312 
3313 	sysinit();
3314 
3315 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3316 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3317 	list_create(&vmm_list, sizeof (vmm_softc_t),
3318 	    offsetof(vmm_softc_t, vmm_node));
3319 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3320 
3321 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3322 	if (error) {
3323 		return (error);
3324 	}
3325 
3326 	vmm_zsd_init();
3327 	vmmr_init();
3328 
3329 	error = mod_install(&modlinkage);
3330 	if (error) {
3331 		ddi_soft_state_fini(&vmm_statep);
3332 		vmm_zsd_fini();
3333 		vmmr_fini();
3334 	}
3335 
3336 	return (error);
3337 }
3338 
3339 int
3340 _fini(void)
3341 {
3342 	int	error;
3343 
3344 	error = mod_remove(&modlinkage);
3345 	if (error) {
3346 		return (error);
3347 	}
3348 
3349 	vmm_zsd_fini();
3350 	vmmr_fini();
3351 
3352 	ddi_soft_state_fini(&vmm_statep);
3353 
3354 	return (0);
3355 }
3356 
3357 int
3358 _info(struct modinfo *modinfop)
3359 {
3360 	return (mod_info(&modlinkage, modinfop));
3361 }
3362