xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 1677a13522f801f59117c9fb50212af5fb87a872)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2023 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
83 /*
84  * Until device emulation in bhyve had been adequately scrutinized and tested,
85  * there was (justified) concern that unusual or corrupt device state payloads
86  * could crash the host when loaded via the vmm-data interface.
87  *
88  * Now that those concerns have been mitigated, this protection is loosened to
89  * default-allow, but the switch is left in place, in case there is a need to
90  * once again clamp down on vmm-data writes.
91  */
92 int		vmm_allow_state_writes = 1;
93 
94 static const char *vmmdev_hvm_name = "bhyve";
95 
96 /* For sdev plugin (/dev) */
97 #define	VMM_SDEV_ROOT "/dev/vmm"
98 
99 /* From uts/intel/io/vmm/intel/vmx.c */
100 extern int vmx_x86_supported(const char **);
101 
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 	list_node_t	vmh_node;	/* list linkage; the owning list is not visible in this part of the file */
105 	vmm_softc_t	*vmh_sc;	/* softc pointer; presumably the VM instance being held -- confirm */
106 	boolean_t	vmh_release_req;	/* NOTE(review): presumably set once release of the hold is requested -- confirm */
107 	uint_t		vmh_ioport_hook_cnt;	/* NOTE(review): presumably counts ioport hooks placed via this hold -- confirm */
108 };
109 
110 struct vmm_lease {
111 	list_node_t		vml_node;	/* list linkage; the owning list is not visible in this part of the file */
112 	struct vm		*vml_vm;	/* VM pointer; presumably the instance the lease covers -- confirm */
113 	vm_client_t		*vml_vmclient;	/* vm_client_t handle; semantics not visible here */
114 	boolean_t		vml_expired;	/* NOTE(review): presumably marks the lease as expired -- confirm */
115 	boolean_t		vml_break_deferred;	/* NOTE(review): presumably a lease break was deferred -- confirm */
116 	boolean_t		(*vml_expire_func)(void *);	/* callback taking a void * and returning boolean_t; presumably the expiry/release callback -- confirm */
117 	void			*vml_expire_arg;	/* presumably the argument handed to vml_expire_func -- confirm */
118 	struct vmm_hold		*vml_hold;	/* vmm_hold pointer; presumably the hold the lease was taken against -- confirm */
119 };
120 
121 /* Options for vmm_destroy_locked */
122 typedef enum vmm_destroy_opts {
123 	VDO_DEFAULT		= 0,
124 	/*
125 	 * Indicate that zone-specific-data associated with this VM not be
126 	 * cleaned up as part of the destroy.  Skipping ZSD clean-up is
127 	 * necessary when VM is being destroyed as part of zone destruction,
128 	 * when said ZSD is already being cleaned up.
129 	 */
130 	VDO_NO_CLEAN_ZSD	= (1 << 0),
131 	/*
132 	 * Attempt to wait for VM destruction to complete.  This is opt-in,
133 	 * since there are many normal conditions which could lead to
134 	 * destruction being stalled pending other clean-up.
135 	 */
136 	VDO_ATTEMPT_WAIT	= (1 << 1),
137 } vmm_destroy_opts_t;
138 
139 static void vmm_hma_release(void);
140 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
141 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
142 static void vmm_lease_block(vmm_softc_t *);
143 static void vmm_lease_unblock(vmm_softc_t *);
144 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
145 static void vmm_kstat_init(vmm_softc_t *);
146 static void vmm_kstat_fini(vmm_softc_t *);
147 
148 /*
149  * The 'devmem' hack:
150  *
151  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
152  * in the vm which appear with their own name related to the vm under /dev.
153  * Since this would be a hassle from an sdev perspective and would require a
154  * new cdev interface (or complicate the existing one), we choose to implement
155  * this in a different manner.  Direct access to the underlying vm memory
156  * segments is exposed by placing them in a range of offsets beyond the normal
157  * guest memory space.  Userspace can query the appropriate offset to mmap()
158  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
159  */
160 
161 static vmm_devmem_entry_t *
162 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
163 {
164 	vmm_devmem_entry_t *ent = NULL;
165 	list_t *dl = &sc->vmm_devmem_list;
166 
167 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
168 		if (ent->vde_segid == segid) {
169 			return (ent);
170 		}
171 	}
172 	return (NULL);
173 }
174 
175 static int
176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
177 {
178 	int error;
179 	bool sysmem;
180 
181 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
182 	    NULL);
183 	if (error || mseg->len == 0)
184 		return (error);
185 
186 	if (!sysmem) {
187 		vmm_devmem_entry_t *de;
188 
189 		de = vmmdev_devmem_find(sc, mseg->segid);
190 		if (de != NULL) {
191 			(void) strlcpy(mseg->name, de->vde_name,
192 			    sizeof (mseg->name));
193 		}
194 	} else {
195 		bzero(mseg->name, sizeof (mseg->name));
196 	}
197 
198 	return (error);
199 }
200 
201 static int
202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
203 {
204 	off_t map_offset;
205 	vmm_devmem_entry_t *entry;
206 
207 	if (list_is_empty(&sc->vmm_devmem_list)) {
208 		map_offset = VM_DEVMEM_START;
209 	} else {
210 		entry = list_tail(&sc->vmm_devmem_list);
211 		map_offset = entry->vde_off + entry->vde_len;
212 		if (map_offset < entry->vde_off) {
213 			/* Do not tolerate overflow */
214 			return (ERANGE);
215 		}
216 		/*
217 		 * XXXJOY: We could choose to search the list for duplicate
218 		 * names and toss an error.  Since we're using the offset
219 		 * method for now, it does not make much of a difference.
220 		 */
221 	}
222 
223 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
224 	entry->vde_segid = mseg->segid;
225 	entry->vde_len = mseg->len;
226 	entry->vde_off = map_offset;
227 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
228 	list_insert_tail(&sc->vmm_devmem_list, entry);
229 
230 	return (0);
231 }
232 
233 static boolean_t
234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
235     off_t *map_offp)
236 {
237 	list_t *dl = &sc->vmm_devmem_list;
238 	vmm_devmem_entry_t *de = NULL;
239 	const off_t map_end = off + len;
240 
241 	VERIFY(off >= VM_DEVMEM_START);
242 
243 	if (map_end < off) {
244 		/* No match on overflow */
245 		return (B_FALSE);
246 	}
247 
248 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
249 		const off_t item_end = de->vde_off + de->vde_len;
250 
251 		if (de->vde_off <= off && item_end >= map_end) {
252 			*segidp = de->vde_segid;
253 			*map_offp = off - de->vde_off;
254 			return (B_TRUE);
255 		}
256 	}
257 	return (B_FALSE);
258 }
259 
260 /*
261  * When an instance is being destroyed, the devmem list of named memory objects
262  * can be torn down, as no new mappings are allowed.
263  */
264 static void
265 vmmdev_devmem_purge(vmm_softc_t *sc)
266 {
267 	vmm_devmem_entry_t *entry;
268 
269 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
270 		kmem_free(entry, sizeof (*entry));
271 	}
272 }
273 
274 static int
275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
276 {
277 	int error;
278 	bool sysmem = true;
279 
280 	if (VM_MEMSEG_NAME(mseg)) {
281 		sysmem = false;
282 	}
283 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
284 
285 	if (error == 0) {
286 		/*
287 		 * Rather than create a whole fresh device from which userspace
288 		 * can mmap this segment, instead make it available at an
289 		 * offset above where the main guest memory resides.
290 		 */
291 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
292 		if (error != 0) {
293 			vm_free_memseg(sc->vmm_vm, mseg->segid);
294 		}
295 	}
296 	return (error);
297 }
298 
299 /*
300  * Resource Locking and Exclusion
301  *
302  * Much of bhyve depends on key portions of VM state, such as the guest memory
303  * map, to remain unchanged while the guest is running.  As ported from
304  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
305  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
306  * performing the work of actually running the guest in VMX/SVM, would lock
307  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
308  * state, all of the vCPUs would be first locked, ensuring that the
309  * operation(s) could complete without any other threads stumbling into
310  * intermediate states.
311  *
312  * This approach is largely effective for bhyve.  Common operations, such as
313  * running the vCPUs, steer clear of lock contention.  The model begins to
314  * break down for operations which do not occur in the context of a specific
315  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
316  * thread in the bhyve process.  In order to properly protect those vCPU-less
317  * operations from encountering invalid states, additional locking is required.
318  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
319  * It does mean that class of operations will be serialized on locking the
320  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
321  * undue contention on the VM_MAXCPU-1 vCPU.
322  *
323  * In order to address the shortcomings of this model, the concept of a
324  * read/write lock has been added to bhyve.  Operations which change
325  * fundamental aspects of a VM (such as the memory map) must acquire the write
326  * lock, which also implies locking all of the vCPUs and waiting for all read
327  * lock holders to release.  While it increases the cost and waiting time for
328  * those few operations, it allows most hot-path operations on the VM (which
329  * depend on its configuration remaining stable) to occur with minimal locking.
330  *
331  * Consumers of the Driver API (see below) are a special case when it comes to
332  * this locking, since they may hold a read lock via the drv_lease mechanism
333  * for an extended period of time.  Rather than forcing those consumers to
334  * continuously poll for a write lock attempt, the lease system forces them to
335  * provide a release callback to trigger their clean-up (and potential later
336  * reacquisition) of the read lock.
337  */
338 
339 static void
340 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
341 {
342 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
343 
344 	/*
345 	 * Since this state transition is utilizing from_idle=true, it should
346 	 * not fail, but rather block until it can be successful.
347 	 */
348 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
349 }
350 
351 static void
352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
353 {
354 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
355 
356 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
357 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
358 }
359 
360 static void
361 vmm_read_lock(vmm_softc_t *sc)
362 {
363 	rw_enter(&sc->vmm_rwlock, RW_READER);
364 }
365 
366 static void
367 vmm_read_unlock(vmm_softc_t *sc)
368 {
369 	rw_exit(&sc->vmm_rwlock);
370 }
371 
372 static void
373 vmm_write_lock(vmm_softc_t *sc)
374 {
375 	int maxcpus;
376 
377 	/* First lock all the vCPUs */
378 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
379 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
380 		vcpu_lock_one(sc, vcpu);
381 	}
382 
383 	/*
384 	 * Block vmm_drv leases from being acquired or held while the VM write
385 	 * lock is held.
386 	 */
387 	vmm_lease_block(sc);
388 
389 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
390 	/*
391 	 * For now, the 'maxcpus' value for an instance is fixed at the
392 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
393 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
394 	 * of the write lock will need to be wary of such changes.
395 	 */
396 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
397 }
398 
399 static void
400 vmm_write_unlock(vmm_softc_t *sc)
401 {
402 	int maxcpus;
403 
404 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
405 	vmm_lease_unblock(sc);
406 
407 	/*
408 	 * The VM write lock _must_ be released from the same thread it was
409 	 * acquired in, unlike the read lock.
410 	 */
411 	VERIFY(rw_write_held(&sc->vmm_rwlock));
412 	rw_exit(&sc->vmm_rwlock);
413 
414 	/* Unlock all the vCPUs */
415 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
416 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
417 		vcpu_unlock_one(sc, vcpu);
418 	}
419 }
420 
421 static int
422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
423     cred_t *credp, int *rvalp)
424 {
425 	int error = 0, vcpu = -1;
426 	void *datap = (void *)arg;
427 	enum vm_lock_type {
428 		LOCK_NONE = 0,
429 		LOCK_VCPU,
430 		LOCK_READ_HOLD,
431 		LOCK_WRITE_HOLD
432 	} lock_type = LOCK_NONE;
433 
434 	/* Acquire any exclusion resources needed for the operation. */
435 	switch (cmd) {
436 	case VM_RUN:
437 	case VM_GET_REGISTER:
438 	case VM_SET_REGISTER:
439 	case VM_GET_SEGMENT_DESCRIPTOR:
440 	case VM_SET_SEGMENT_DESCRIPTOR:
441 	case VM_GET_REGISTER_SET:
442 	case VM_SET_REGISTER_SET:
443 	case VM_INJECT_EXCEPTION:
444 	case VM_GET_CAPABILITY:
445 	case VM_SET_CAPABILITY:
446 	case VM_PPTDEV_MSI:
447 	case VM_PPTDEV_MSIX:
448 	case VM_SET_X2APIC_STATE:
449 	case VM_GLA2GPA:
450 	case VM_GLA2GPA_NOFAULT:
451 	case VM_ACTIVATE_CPU:
452 	case VM_SET_INTINFO:
453 	case VM_GET_INTINFO:
454 	case VM_RESTART_INSTRUCTION:
455 	case VM_SET_KERNEMU_DEV:
456 	case VM_GET_KERNEMU_DEV:
457 	case VM_RESET_CPU:
458 	case VM_GET_RUN_STATE:
459 	case VM_SET_RUN_STATE:
460 	case VM_GET_FPU:
461 	case VM_SET_FPU:
462 	case VM_GET_CPUID:
463 	case VM_SET_CPUID:
464 	case VM_LEGACY_CPUID:
465 		/*
466 		 * Copy in the ID of the vCPU chosen for this operation.
467 		 * Since a nefarious caller could update their struct between
468 		 * this locking and when the rest of the ioctl data is copied
469 		 * in, it is _critical_ that this local 'vcpu' variable be used
470 		 * rather than the in-struct one when performing the ioctl.
471 		 */
472 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
473 			return (EFAULT);
474 		}
475 		if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
476 			return (EINVAL);
477 		}
478 		vcpu_lock_one(sc, vcpu);
479 		lock_type = LOCK_VCPU;
480 		break;
481 
482 	case VM_REINIT:
483 	case VM_BIND_PPTDEV:
484 	case VM_UNBIND_PPTDEV:
485 	case VM_MAP_PPTDEV_MMIO:
486 	case VM_UNMAP_PPTDEV_MMIO:
487 	case VM_ALLOC_MEMSEG:
488 	case VM_MMAP_MEMSEG:
489 	case VM_MUNMAP_MEMSEG:
490 	case VM_WRLOCK_CYCLE:
491 	case VM_PMTMR_LOCATE:
492 	case VM_PAUSE:
493 	case VM_RESUME:
494 		vmm_write_lock(sc);
495 		lock_type = LOCK_WRITE_HOLD;
496 		break;
497 
498 	case VM_GET_MEMSEG:
499 	case VM_MMAP_GETNEXT:
500 	case VM_LAPIC_IRQ:
501 	case VM_INJECT_NMI:
502 	case VM_IOAPIC_ASSERT_IRQ:
503 	case VM_IOAPIC_DEASSERT_IRQ:
504 	case VM_IOAPIC_PULSE_IRQ:
505 	case VM_LAPIC_MSI:
506 	case VM_LAPIC_LOCAL_IRQ:
507 	case VM_GET_X2APIC_STATE:
508 	case VM_RTC_READ:
509 	case VM_RTC_WRITE:
510 	case VM_RTC_SETTIME:
511 	case VM_RTC_GETTIME:
512 	case VM_PPTDEV_DISABLE_MSIX:
513 	case VM_DEVMEM_GETOFFSET:
514 	case VM_TRACK_DIRTY_PAGES:
515 		vmm_read_lock(sc);
516 		lock_type = LOCK_READ_HOLD;
517 		break;
518 
519 	case VM_DATA_READ:
520 	case VM_DATA_WRITE:
521 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
522 			return (EFAULT);
523 		}
524 		if (vcpu == -1) {
525 			/* Access data for VM-wide devices */
526 			vmm_write_lock(sc);
527 			lock_type = LOCK_WRITE_HOLD;
528 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
529 			/* Access data associated with a specific vCPU */
530 			vcpu_lock_one(sc, vcpu);
531 			lock_type = LOCK_VCPU;
532 		} else {
533 			return (EINVAL);
534 		}
535 		break;
536 
537 	case VM_GET_GPA_PMAP:
538 	case VM_IOAPIC_PINCOUNT:
539 	case VM_SUSPEND:
540 	case VM_DESC_FPU_AREA:
541 	case VM_SET_AUTODESTRUCT:
542 	case VM_DESTROY_SELF:
543 	case VM_DESTROY_PENDING:
544 	default:
545 		break;
546 	}
547 
548 	/* Execute the primary logic for the ioctl. */
549 	switch (cmd) {
550 	case VM_RUN: {
551 		struct vm_entry entry;
552 
553 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
554 			error = EFAULT;
555 			break;
556 		}
557 
558 		if (!(curthread->t_schedflag & TS_VCPU))
559 			smt_mark_as_vcpu();
560 
561 		error = vm_run(sc->vmm_vm, vcpu, &entry);
562 
563 		/*
564 		 * Unexpected states in vm_run() are expressed through positive
565 		 * errno-oriented return values.  VM states which expect further
566 		 * processing in userspace (necessary context via exitinfo) are
567 		 * expressed through negative return values.  For the time being
568 		 * a return value of 0 is not expected from vm_run().
569 		 */
570 		ASSERT(error != 0);
571 		if (error < 0) {
572 			const struct vm_exit *vme;
573 			void *outp = entry.exit_data;
574 
575 			error = 0;
576 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
577 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
578 				error = EFAULT;
579 			}
580 		}
581 		break;
582 	}
583 	case VM_SUSPEND: {
584 		struct vm_suspend vmsuspend;
585 
586 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
587 			error = EFAULT;
588 			break;
589 		}
590 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
591 		break;
592 	}
593 	case VM_REINIT: {
594 		struct vm_reinit reinit;
595 
596 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
597 			error = EFAULT;
598 			break;
599 		}
600 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
601 			/*
602 			 * The VM instance should be free of driver-attached
603 			 * hooks during the reinitialization process.
604 			 */
605 			break;
606 		}
607 		error = vm_reinit(sc->vmm_vm, reinit.flags);
608 		(void) vmm_drv_block_hook(sc, B_FALSE);
609 		break;
610 	}
611 	case VM_STAT_DESC: {
612 		struct vm_stat_desc statdesc;
613 
614 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
615 			error = EFAULT;
616 			break;
617 		}
618 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
619 		    sizeof (statdesc.desc));
620 		if (error == 0 &&
621 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
622 			error = EFAULT;
623 			break;
624 		}
625 		break;
626 	}
627 	case VM_STATS_IOC: {
628 		struct vm_stats vmstats;
629 
630 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
631 			error = EFAULT;
632 			break;
633 		}
634 		hrt2tv(gethrtime(), &vmstats.tv);
635 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
636 		    nitems(vmstats.statbuf),
637 		    &vmstats.num_entries, vmstats.statbuf);
638 		if (error == 0 &&
639 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
640 			error = EFAULT;
641 			break;
642 		}
643 		break;
644 	}
645 
646 	case VM_PPTDEV_MSI: {
647 		struct vm_pptdev_msi pptmsi;
648 
649 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
650 			error = EFAULT;
651 			break;
652 		}
653 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
654 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
655 		break;
656 	}
657 	case VM_PPTDEV_MSIX: {
658 		struct vm_pptdev_msix pptmsix;
659 
660 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
661 			error = EFAULT;
662 			break;
663 		}
664 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
665 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
666 		    pptmsix.vector_control);
667 		break;
668 	}
669 	case VM_PPTDEV_DISABLE_MSIX: {
670 		struct vm_pptdev pptdev;
671 
672 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
673 			error = EFAULT;
674 			break;
675 		}
676 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
677 		break;
678 	}
679 	case VM_MAP_PPTDEV_MMIO: {
680 		struct vm_pptdev_mmio pptmmio;
681 
682 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
683 			error = EFAULT;
684 			break;
685 		}
686 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
687 		    pptmmio.len, pptmmio.hpa);
688 		break;
689 	}
690 	case VM_UNMAP_PPTDEV_MMIO: {
691 		struct vm_pptdev_mmio pptmmio;
692 
693 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
694 			error = EFAULT;
695 			break;
696 		}
697 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
698 		    pptmmio.len);
699 		break;
700 	}
701 	case VM_BIND_PPTDEV: {
702 		struct vm_pptdev pptdev;
703 
704 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
705 			error = EFAULT;
706 			break;
707 		}
708 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
709 		break;
710 	}
711 	case VM_UNBIND_PPTDEV: {
712 		struct vm_pptdev pptdev;
713 
714 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
715 			error = EFAULT;
716 			break;
717 		}
718 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
719 		break;
720 	}
721 	case VM_GET_PPTDEV_LIMITS: {
722 		struct vm_pptdev_limits pptlimits;
723 
724 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
725 			error = EFAULT;
726 			break;
727 		}
728 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
729 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
730 		if (error == 0 &&
731 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
732 			error = EFAULT;
733 			break;
734 		}
735 		break;
736 	}
737 	case VM_INJECT_EXCEPTION: {
738 		struct vm_exception vmexc;
739 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
740 			error = EFAULT;
741 			break;
742 		}
743 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
744 		    vmexc.error_code_valid != 0, vmexc.error_code,
745 		    vmexc.restart_instruction != 0);
746 		break;
747 	}
748 	case VM_INJECT_NMI: {
749 		struct vm_nmi vmnmi;
750 
751 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
752 			error = EFAULT;
753 			break;
754 		}
755 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
756 		break;
757 	}
758 	case VM_LAPIC_IRQ: {
759 		struct vm_lapic_irq vmirq;
760 
761 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
762 			error = EFAULT;
763 			break;
764 		}
765 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
766 		break;
767 	}
768 	case VM_LAPIC_LOCAL_IRQ: {
769 		struct vm_lapic_irq vmirq;
770 
771 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
772 			error = EFAULT;
773 			break;
774 		}
775 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
776 		    vmirq.vector);
777 		break;
778 	}
779 	case VM_LAPIC_MSI: {
780 		struct vm_lapic_msi vmmsi;
781 
782 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
783 			error = EFAULT;
784 			break;
785 		}
786 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
787 		break;
788 	}
789 
790 	case VM_IOAPIC_ASSERT_IRQ: {
791 		struct vm_ioapic_irq ioapic_irq;
792 
793 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
794 			error = EFAULT;
795 			break;
796 		}
797 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
798 		break;
799 	}
800 	case VM_IOAPIC_DEASSERT_IRQ: {
801 		struct vm_ioapic_irq ioapic_irq;
802 
803 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
804 			error = EFAULT;
805 			break;
806 		}
807 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
808 		break;
809 	}
810 	case VM_IOAPIC_PULSE_IRQ: {
811 		struct vm_ioapic_irq ioapic_irq;
812 
813 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
814 			error = EFAULT;
815 			break;
816 		}
817 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
818 		break;
819 	}
820 	case VM_IOAPIC_PINCOUNT: {
821 		int pincount;
822 
823 		pincount = vioapic_pincount(sc->vmm_vm);
824 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
825 			error = EFAULT;
826 			break;
827 		}
828 		break;
829 	}
830 	case VM_DESC_FPU_AREA: {
831 		struct vm_fpu_desc desc;
832 		void *buf = NULL;
833 
834 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
835 			error = EFAULT;
836 			break;
837 		}
838 		if (desc.vfd_num_entries > 64) {
839 			error = EINVAL;
840 			break;
841 		}
842 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
843 		    desc.vfd_num_entries;
844 		if (buf_sz != 0) {
845 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
846 		}
847 
848 		/*
849 		 * For now, we are depending on vm_fpu_desc_entry and
850 		 * hma_xsave_state_desc_t having the same format.
851 		 */
852 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
853 		    sizeof (hma_xsave_state_desc_t));
854 
855 		size_t req_size;
856 		const uint_t max_entries = hma_fpu_describe_xsave_state(
857 		    (hma_xsave_state_desc_t *)buf,
858 		    desc.vfd_num_entries,
859 		    &req_size);
860 
861 		desc.vfd_req_size = req_size;
862 		desc.vfd_num_entries = max_entries;
863 		if (buf_sz != 0) {
864 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
865 				error = EFAULT;
866 			}
867 			kmem_free(buf, buf_sz);
868 		}
869 
870 		if (error == 0) {
871 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
872 				error = EFAULT;
873 			}
874 		}
875 		break;
876 	}
877 	case VM_SET_AUTODESTRUCT: {
878 		/*
879 		 * Since this has to do with controlling the lifetime of the
880 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
881 		 * than the vcpu-centric or rwlock exclusion mechanisms.
882 		 */
883 		mutex_enter(&vmm_mtx);
884 		if (arg != 0) {
885 			sc->vmm_flags |= VMM_AUTODESTROY;
886 		} else {
887 			sc->vmm_flags &= ~VMM_AUTODESTROY;
888 		}
889 		mutex_exit(&vmm_mtx);
890 		break;
891 	}
892 	case VM_DESTROY_SELF: {
893 		bool hma_release = false;
894 
895 		/*
896 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
897 		 * itself, rather than having to perform a racy name lookup as
898 		 * part of the destroy process.
899 		 *
900 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
901 		 * order to kick the vCPUs out of guest context as part of any
902 		 * destruction, we do not need to worry about it ourself using
903 		 * the `lock_type` logic here.
904 		 */
905 		mutex_enter(&vmm_mtx);
906 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
907 		mutex_exit(&vmm_mtx);
908 		if (hma_release) {
909 			vmm_hma_release();
910 		}
911 		break;
912 	}
913 	case VM_DESTROY_PENDING: {
914 		/*
915 		 * If we have made it this far, then destruction of the instance
916 		 * has not been initiated.
917 		 */
918 		*rvalp = 0;
919 		break;
920 	}
921 
922 	case VM_ISA_ASSERT_IRQ: {
923 		struct vm_isa_irq isa_irq;
924 
925 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
926 			error = EFAULT;
927 			break;
928 		}
929 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
930 		if (error == 0 && isa_irq.ioapic_irq != -1) {
931 			error = vioapic_assert_irq(sc->vmm_vm,
932 			    isa_irq.ioapic_irq);
933 		}
934 		break;
935 	}
936 	case VM_ISA_DEASSERT_IRQ: {
937 		struct vm_isa_irq isa_irq;
938 
939 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
940 			error = EFAULT;
941 			break;
942 		}
943 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
944 		if (error == 0 && isa_irq.ioapic_irq != -1) {
945 			error = vioapic_deassert_irq(sc->vmm_vm,
946 			    isa_irq.ioapic_irq);
947 		}
948 		break;
949 	}
950 	case VM_ISA_PULSE_IRQ: {
951 		struct vm_isa_irq isa_irq;
952 
953 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
954 			error = EFAULT;
955 			break;
956 		}
957 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
958 		if (error == 0 && isa_irq.ioapic_irq != -1) {
959 			error = vioapic_pulse_irq(sc->vmm_vm,
960 			    isa_irq.ioapic_irq);
961 		}
962 		break;
963 	}
964 	case VM_ISA_SET_IRQ_TRIGGER: {
965 		struct vm_isa_irq_trigger isa_irq_trigger;
966 
967 		if (ddi_copyin(datap, &isa_irq_trigger,
968 		    sizeof (isa_irq_trigger), md)) {
969 			error = EFAULT;
970 			break;
971 		}
972 		error = vatpic_set_irq_trigger(sc->vmm_vm,
973 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
974 		break;
975 	}
976 
977 	case VM_MMAP_GETNEXT: {
978 		struct vm_memmap mm;
979 
980 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
981 			error = EFAULT;
982 			break;
983 		}
984 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
985 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
986 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
987 			error = EFAULT;
988 			break;
989 		}
990 		break;
991 	}
992 	case VM_MMAP_MEMSEG: {
993 		struct vm_memmap mm;
994 
995 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
996 			error = EFAULT;
997 			break;
998 		}
999 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
1000 		    mm.len, mm.prot, mm.flags);
1001 		break;
1002 	}
1003 	case VM_MUNMAP_MEMSEG: {
1004 		struct vm_munmap mu;
1005 
1006 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1007 			error = EFAULT;
1008 			break;
1009 		}
1010 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1011 		break;
1012 	}
1013 	case VM_ALLOC_MEMSEG: {
1014 		struct vm_memseg vmseg;
1015 
1016 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1017 			error = EFAULT;
1018 			break;
1019 		}
1020 		error = vmmdev_alloc_memseg(sc, &vmseg);
1021 		break;
1022 	}
1023 	case VM_GET_MEMSEG: {
1024 		struct vm_memseg vmseg;
1025 
1026 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1027 			error = EFAULT;
1028 			break;
1029 		}
1030 		error = vmmdev_get_memseg(sc, &vmseg);
1031 		if (error == 0 &&
1032 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1033 			error = EFAULT;
1034 			break;
1035 		}
1036 		break;
1037 	}
1038 	case VM_GET_REGISTER: {
1039 		struct vm_register vmreg;
1040 
1041 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1042 			error = EFAULT;
1043 			break;
1044 		}
1045 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1046 		    &vmreg.regval);
1047 		if (error == 0 &&
1048 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1049 			error = EFAULT;
1050 			break;
1051 		}
1052 		break;
1053 	}
1054 	case VM_SET_REGISTER: {
1055 		struct vm_register vmreg;
1056 
1057 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1058 			error = EFAULT;
1059 			break;
1060 		}
1061 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1062 		    vmreg.regval);
1063 		break;
1064 	}
1065 	case VM_SET_SEGMENT_DESCRIPTOR: {
1066 		struct vm_seg_desc vmsegd;
1067 
1068 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1069 			error = EFAULT;
1070 			break;
1071 		}
1072 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1073 		    &vmsegd.desc);
1074 		break;
1075 	}
1076 	case VM_GET_SEGMENT_DESCRIPTOR: {
1077 		struct vm_seg_desc vmsegd;
1078 
1079 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1080 			error = EFAULT;
1081 			break;
1082 		}
1083 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1084 		    &vmsegd.desc);
1085 		if (error == 0 &&
1086 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1087 			error = EFAULT;
1088 			break;
1089 		}
1090 		break;
1091 	}
1092 	case VM_GET_REGISTER_SET: {
1093 		struct vm_register_set vrs;
1094 		int regnums[VM_REG_LAST];
1095 		uint64_t regvals[VM_REG_LAST];
1096 
1097 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1098 			error = EFAULT;
1099 			break;
1100 		}
1101 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1102 			error = EINVAL;
1103 			break;
1104 		}
1105 		if (ddi_copyin(vrs.regnums, regnums,
1106 		    sizeof (int) * vrs.count, md)) {
1107 			error = EFAULT;
1108 			break;
1109 		}
1110 
1111 		error = 0;
1112 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1113 			if (regnums[i] < 0) {
1114 				error = EINVAL;
1115 				break;
1116 			}
1117 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1118 			    &regvals[i]);
1119 		}
1120 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1121 		    sizeof (uint64_t) * vrs.count, md)) {
1122 			error = EFAULT;
1123 		}
1124 		break;
1125 	}
1126 	case VM_SET_REGISTER_SET: {
1127 		struct vm_register_set vrs;
1128 		int regnums[VM_REG_LAST];
1129 		uint64_t regvals[VM_REG_LAST];
1130 
1131 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1132 			error = EFAULT;
1133 			break;
1134 		}
1135 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1136 			error = EINVAL;
1137 			break;
1138 		}
1139 		if (ddi_copyin(vrs.regnums, regnums,
1140 		    sizeof (int) * vrs.count, md)) {
1141 			error = EFAULT;
1142 			break;
1143 		}
1144 		if (ddi_copyin(vrs.regvals, regvals,
1145 		    sizeof (uint64_t) * vrs.count, md)) {
1146 			error = EFAULT;
1147 			break;
1148 		}
1149 
1150 		error = 0;
1151 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1152 			/*
1153 			 * Setting registers in a set is not atomic, since a
1154 			 * failure in the middle of the set will cause a
1155 			 * bail-out and inconsistent register state.  Callers
1156 			 * should be wary of this.
1157 			 */
1158 			if (regnums[i] < 0) {
1159 				error = EINVAL;
1160 				break;
1161 			}
1162 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1163 			    regvals[i]);
1164 		}
1165 		break;
1166 	}
1167 	case VM_RESET_CPU: {
1168 		struct vm_vcpu_reset vvr;
1169 
1170 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1171 			error = EFAULT;
1172 			break;
1173 		}
1174 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1175 			error = EINVAL;
1176 		}
1177 
1178 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1179 		break;
1180 	}
1181 	case VM_GET_RUN_STATE: {
1182 		struct vm_run_state vrs;
1183 
1184 		bzero(&vrs, sizeof (vrs));
1185 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1186 		    &vrs.sipi_vector);
1187 		if (error == 0) {
1188 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1189 				error = EFAULT;
1190 				break;
1191 			}
1192 		}
1193 		break;
1194 	}
1195 	case VM_SET_RUN_STATE: {
1196 		struct vm_run_state vrs;
1197 
1198 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1199 			error = EFAULT;
1200 			break;
1201 		}
1202 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1203 		    vrs.sipi_vector);
1204 		break;
1205 	}
1206 	case VM_GET_FPU: {
1207 		struct vm_fpu_state req;
1208 		const size_t max_len = (PAGESIZE * 2);
1209 		void *kbuf;
1210 
1211 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1212 			error = EFAULT;
1213 			break;
1214 		}
1215 		if (req.len > max_len || req.len == 0) {
1216 			error = EINVAL;
1217 			break;
1218 		}
1219 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1220 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1221 		if (error == 0) {
1222 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1223 				error = EFAULT;
1224 			}
1225 		}
1226 		kmem_free(kbuf, req.len);
1227 		break;
1228 	}
1229 	case VM_SET_FPU: {
1230 		struct vm_fpu_state req;
1231 		const size_t max_len = (PAGESIZE * 2);
1232 		void *kbuf;
1233 
1234 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1235 			error = EFAULT;
1236 			break;
1237 		}
1238 		if (req.len > max_len || req.len == 0) {
1239 			error = EINVAL;
1240 			break;
1241 		}
1242 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1243 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1244 			error = EFAULT;
1245 		} else {
1246 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1247 		}
1248 		kmem_free(kbuf, req.len);
1249 		break;
1250 	}
1251 	case VM_GET_CPUID: {
1252 		struct vm_vcpu_cpuid_config cfg;
1253 		struct vcpu_cpuid_entry *entries = NULL;
1254 
1255 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1256 			error = EFAULT;
1257 			break;
1258 		}
1259 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1260 			error = EINVAL;
1261 			break;
1262 		}
1263 
1264 		const size_t entries_size =
1265 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1266 		if (entries_size != 0) {
1267 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1268 		}
1269 
1270 		vcpu_cpuid_config_t vm_cfg = {
1271 			.vcc_nent = cfg.vvcc_nent,
1272 			.vcc_entries = entries,
1273 		};
1274 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1275 
1276 		/*
1277 		 * Only attempt to copy out the resultant entries if we were
1278 		 * able to query them from the instance.  The flags and number
1279 		 * of entries are emitted regardless.
1280 		 */
1281 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1282 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1283 		if (entries != NULL) {
1284 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1285 			    entries_size, md) != 0) {
1286 				error = EFAULT;
1287 			}
1288 
1289 			kmem_free(entries, entries_size);
1290 		}
1291 
1292 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1293 			error = EFAULT;
1294 		}
1295 		break;
1296 	}
1297 	case VM_SET_CPUID: {
1298 		struct vm_vcpu_cpuid_config cfg;
1299 		struct vcpu_cpuid_entry *entries = NULL;
1300 		size_t entries_size = 0;
1301 
1302 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1303 			error = EFAULT;
1304 			break;
1305 		}
1306 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1307 			error = EFBIG;
1308 			break;
1309 		}
1310 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1311 			/*
1312 			 * If we are being instructed to use "legacy" handling,
1313 			 * then no entries should be provided, since the static
1314 			 * in-kernel masking will be used.
1315 			 */
1316 			if (cfg.vvcc_nent != 0) {
1317 				error = EINVAL;
1318 				break;
1319 			}
1320 		} else if (cfg.vvcc_nent != 0) {
1321 			entries_size =
1322 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1323 			entries = kmem_alloc(entries_size, KM_SLEEP);
1324 
1325 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1326 			    md) != 0) {
1327 				error = EFAULT;
1328 				kmem_free(entries, entries_size);
1329 				break;
1330 			}
1331 		}
1332 
1333 		vcpu_cpuid_config_t vm_cfg = {
1334 			.vcc_flags = cfg.vvcc_flags,
1335 			.vcc_nent = cfg.vvcc_nent,
1336 			.vcc_entries = entries,
1337 		};
1338 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1339 
1340 		if (entries != NULL) {
1341 			kmem_free(entries, entries_size);
1342 		}
1343 		break;
1344 	}
1345 	case VM_LEGACY_CPUID: {
1346 		struct vm_legacy_cpuid vlc;
1347 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1348 			error = EFAULT;
1349 			break;
1350 		}
1351 		vlc.vlc_vcpuid = vcpu;
1352 
1353 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1354 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1355 
1356 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1357 			error = EFAULT;
1358 			break;
1359 		}
1360 		break;
1361 	}
1362 
1363 	case VM_SET_KERNEMU_DEV:
1364 	case VM_GET_KERNEMU_DEV: {
1365 		struct vm_readwrite_kernemu_device kemu;
1366 		size_t size = 0;
1367 
1368 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1369 			error = EFAULT;
1370 			break;
1371 		}
1372 
1373 		if (kemu.access_width > 3) {
1374 			error = EINVAL;
1375 			break;
1376 		}
1377 		size = (1 << kemu.access_width);
1378 		ASSERT(size >= 1 && size <= 8);
1379 
1380 		if (cmd == VM_SET_KERNEMU_DEV) {
1381 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1382 			    kemu.gpa, kemu.value, size);
1383 		} else {
1384 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1385 			    kemu.gpa, &kemu.value, size);
1386 		}
1387 
1388 		if (error == 0) {
1389 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1390 				error = EFAULT;
1391 				break;
1392 			}
1393 		}
1394 		break;
1395 	}
1396 
1397 	case VM_GET_CAPABILITY: {
1398 		struct vm_capability vmcap;
1399 
1400 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1401 			error = EFAULT;
1402 			break;
1403 		}
1404 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1405 		    &vmcap.capval);
1406 		if (error == 0 &&
1407 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1408 			error = EFAULT;
1409 			break;
1410 		}
1411 		break;
1412 	}
1413 	case VM_SET_CAPABILITY: {
1414 		struct vm_capability vmcap;
1415 
1416 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1417 			error = EFAULT;
1418 			break;
1419 		}
1420 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1421 		    vmcap.capval);
1422 		break;
1423 	}
1424 	case VM_SET_X2APIC_STATE: {
1425 		struct vm_x2apic x2apic;
1426 
1427 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1428 			error = EFAULT;
1429 			break;
1430 		}
1431 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1432 		break;
1433 	}
1434 	case VM_GET_X2APIC_STATE: {
1435 		struct vm_x2apic x2apic;
1436 
1437 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1438 			error = EFAULT;
1439 			break;
1440 		}
1441 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1442 		    &x2apic.state);
1443 		if (error == 0 &&
1444 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1445 			error = EFAULT;
1446 			break;
1447 		}
1448 		break;
1449 	}
1450 	case VM_GET_GPA_PMAP: {
1451 		/*
1452 		 * Until there is a necessity to leak EPT/RVI PTE values to
1453 		 * userspace, this will remain unimplemented
1454 		 */
1455 		error = EINVAL;
1456 		break;
1457 	}
1458 	case VM_GET_HPET_CAPABILITIES: {
1459 		struct vm_hpet_cap hpetcap;
1460 
1461 		error = vhpet_getcap(&hpetcap);
1462 		if (error == 0 &&
1463 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1464 			error = EFAULT;
1465 			break;
1466 		}
1467 		break;
1468 	}
1469 	case VM_GLA2GPA: {
1470 		struct vm_gla2gpa gg;
1471 
1472 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1473 			error = EFAULT;
1474 			break;
1475 		}
1476 		gg.vcpuid = vcpu;
1477 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1478 		    gg.prot, &gg.gpa, &gg.fault);
1479 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1480 			error = EFAULT;
1481 			break;
1482 		}
1483 		break;
1484 	}
1485 	case VM_GLA2GPA_NOFAULT: {
1486 		struct vm_gla2gpa gg;
1487 
1488 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1489 			error = EFAULT;
1490 			break;
1491 		}
1492 		gg.vcpuid = vcpu;
1493 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1494 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1495 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1496 			error = EFAULT;
1497 			break;
1498 		}
1499 		break;
1500 	}
1501 
1502 	case VM_ACTIVATE_CPU:
1503 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1504 		break;
1505 
1506 	case VM_SUSPEND_CPU:
1507 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1508 			error = EFAULT;
1509 		} else {
1510 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1511 		}
1512 		break;
1513 
1514 	case VM_RESUME_CPU:
1515 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1516 			error = EFAULT;
1517 		} else {
1518 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1519 		}
1520 		break;
1521 
1522 	case VM_GET_CPUS: {
1523 		struct vm_cpuset vm_cpuset;
1524 		cpuset_t tempset;
1525 		void *srcp = &tempset;
1526 		int size;
1527 
1528 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1529 			error = EFAULT;
1530 			break;
1531 		}
1532 
1533 		/* Be more generous about sizing since our cpuset_t is large. */
1534 		size = vm_cpuset.cpusetsize;
1535 		if (size <= 0 || size > sizeof (cpuset_t)) {
1536 			error = ERANGE;
1537 		}
1538 		/*
1539 		 * If they want a ulong_t or less, make sure they receive the
1540 		 * low bits with all the useful information.
1541 		 */
1542 		if (size <= sizeof (tempset.cpub[0])) {
1543 			srcp = &tempset.cpub[0];
1544 		}
1545 
1546 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1547 			tempset = vm_active_cpus(sc->vmm_vm);
1548 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1549 			tempset = vm_suspended_cpus(sc->vmm_vm);
1550 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1551 			tempset = vm_debug_cpus(sc->vmm_vm);
1552 		} else {
1553 			error = EINVAL;
1554 		}
1555 
1556 		ASSERT(size > 0 && size <= sizeof (tempset));
1557 		if (error == 0 &&
1558 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1559 			error = EFAULT;
1560 			break;
1561 		}
1562 		break;
1563 	}
1564 	case VM_SET_INTINFO: {
1565 		struct vm_intinfo vmii;
1566 
1567 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1568 			error = EFAULT;
1569 			break;
1570 		}
1571 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1572 		break;
1573 	}
1574 	case VM_GET_INTINFO: {
1575 		struct vm_intinfo vmii;
1576 
1577 		vmii.vcpuid = vcpu;
1578 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1579 		    &vmii.info2);
1580 		if (error == 0 &&
1581 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1582 			error = EFAULT;
1583 			break;
1584 		}
1585 		break;
1586 	}
1587 	case VM_RTC_WRITE: {
1588 		struct vm_rtc_data rtcdata;
1589 
1590 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1591 			error = EFAULT;
1592 			break;
1593 		}
1594 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1595 		    rtcdata.value);
1596 		break;
1597 	}
1598 	case VM_RTC_READ: {
1599 		struct vm_rtc_data rtcdata;
1600 
1601 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1602 			error = EFAULT;
1603 			break;
1604 		}
1605 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1606 		    &rtcdata.value);
1607 		if (error == 0 &&
1608 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1609 			error = EFAULT;
1610 			break;
1611 		}
1612 		break;
1613 	}
1614 	case VM_RTC_SETTIME: {
1615 		timespec_t ts;
1616 
1617 		if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1618 			error = EFAULT;
1619 			break;
1620 		}
1621 		error = vrtc_set_time(sc->vmm_vm, &ts);
1622 		break;
1623 	}
1624 	case VM_RTC_GETTIME: {
1625 		timespec_t ts;
1626 
1627 		vrtc_get_time(sc->vmm_vm, &ts);
1628 		if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1629 			error = EFAULT;
1630 			break;
1631 		}
1632 		break;
1633 	}
1634 
1635 	case VM_PMTMR_LOCATE: {
1636 		uint16_t port = arg;
1637 		error = vpmtmr_set_location(sc->vmm_vm, port);
1638 		break;
1639 	}
1640 
1641 	case VM_RESTART_INSTRUCTION:
1642 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1643 		break;
1644 
1645 	case VM_SET_TOPOLOGY: {
1646 		struct vm_cpu_topology topo;
1647 
1648 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1649 			error = EFAULT;
1650 			break;
1651 		}
1652 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1653 		    topo.threads, topo.maxcpus);
1654 		break;
1655 	}
1656 	case VM_GET_TOPOLOGY: {
1657 		struct vm_cpu_topology topo;
1658 
1659 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1660 		    &topo.threads, &topo.maxcpus);
1661 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1662 			error = EFAULT;
1663 			break;
1664 		}
1665 		break;
1666 	}
1667 	case VM_DEVMEM_GETOFFSET: {
1668 		struct vm_devmem_offset vdo;
1669 		vmm_devmem_entry_t *de;
1670 
1671 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1672 			error = EFAULT;
1673 			break;
1674 		}
1675 
1676 		de = vmmdev_devmem_find(sc, vdo.segid);
1677 		if (de != NULL) {
1678 			vdo.offset = de->vde_off;
1679 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1680 				error = EFAULT;
1681 			}
1682 		} else {
1683 			error = ENOENT;
1684 		}
1685 		break;
1686 	}
1687 	case VM_TRACK_DIRTY_PAGES: {
1688 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1689 		struct vmm_dirty_tracker tracker;
1690 		uint8_t *bitmap;
1691 		size_t len;
1692 
1693 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1694 			error = EFAULT;
1695 			break;
1696 		}
1697 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1698 			error = EINVAL;
1699 			break;
1700 		}
1701 		if (tracker.vdt_len == 0) {
1702 			break;
1703 		}
1704 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1705 			error = EINVAL;
1706 			break;
1707 		}
1708 		if (tracker.vdt_len > max_track_region_len) {
1709 			error = EINVAL;
1710 			break;
1711 		}
1712 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1713 		bitmap = kmem_zalloc(len, KM_SLEEP);
1714 		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1715 		    tracker.vdt_len, bitmap);
1716 		if (error == 0 &&
1717 		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1718 			error = EFAULT;
1719 		}
1720 		kmem_free(bitmap, len);
1721 
1722 		break;
1723 	}
1724 	case VM_WRLOCK_CYCLE: {
1725 		/*
1726 		 * Present a test mechanism to acquire/release the write lock
1727 		 * on the VM without any other effects.
1728 		 */
1729 		break;
1730 	}
1731 	case VM_DATA_READ: {
1732 		struct vm_data_xfer vdx;
1733 
1734 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1735 			error = EFAULT;
1736 			break;
1737 		}
1738 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1739 			error = EINVAL;
1740 			break;
1741 		}
1742 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1743 			error = EFBIG;
1744 			break;
1745 		}
1746 
1747 		const size_t len = vdx.vdx_len;
1748 		void *buf = NULL;
1749 		if (len != 0) {
1750 			const void *udata = vdx.vdx_data;
1751 
1752 			buf = kmem_alloc(len, KM_SLEEP);
1753 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1754 				bzero(buf, len);
1755 			} else if (ddi_copyin(udata, buf, len, md) != 0) {
1756 				kmem_free(buf, len);
1757 				error = EFAULT;
1758 				break;
1759 			}
1760 		}
1761 
1762 		vdx.vdx_result_len = 0;
1763 		vmm_data_req_t req = {
1764 			.vdr_class = vdx.vdx_class,
1765 			.vdr_version = vdx.vdx_version,
1766 			.vdr_flags = vdx.vdx_flags,
1767 			.vdr_len = len,
1768 			.vdr_data = buf,
1769 			.vdr_result_len = &vdx.vdx_result_len,
1770 		};
1771 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1772 
1773 		if (error == 0 && buf != NULL) {
1774 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1775 				error = EFAULT;
1776 			}
1777 		}
1778 
1779 		/*
1780 		 * Copy out the transfer request so that the value of
1781 		 * vdx_result_len can be made available, regardless of any
1782 		 * error(s) which may have occurred.
1783 		 */
1784 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1785 			error = (error != 0) ? error : EFAULT;
1786 		}
1787 
1788 		if (buf != NULL) {
1789 			kmem_free(buf, len);
1790 		}
1791 		break;
1792 	}
1793 	case VM_DATA_WRITE: {
1794 		struct vm_data_xfer vdx;
1795 
1796 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1797 			error = EFAULT;
1798 			break;
1799 		}
1800 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1801 			error = EINVAL;
1802 			break;
1803 		}
1804 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1805 			error = EFBIG;
1806 			break;
1807 		}
1808 
1809 		const size_t len = vdx.vdx_len;
1810 		void *buf = NULL;
1811 		if (len != 0) {
1812 			buf = kmem_alloc(len, KM_SLEEP);
1813 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1814 				kmem_free(buf, len);
1815 				error = EFAULT;
1816 				break;
1817 			}
1818 		}
1819 
1820 		vdx.vdx_result_len = 0;
1821 		vmm_data_req_t req = {
1822 			.vdr_class = vdx.vdx_class,
1823 			.vdr_version = vdx.vdx_version,
1824 			.vdr_flags = vdx.vdx_flags,
1825 			.vdr_len = len,
1826 			.vdr_data = buf,
1827 			.vdr_result_len = &vdx.vdx_result_len,
1828 		};
1829 		if (vmm_allow_state_writes != 0) {
1830 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1831 			    &req);
1832 		} else {
1833 			/*
			 * Reject the write if someone has thrown the switch
			 * back into the "disallow" position.
1836 			 */
1837 			error = EPERM;
1838 		}
1839 
1840 		if (error == 0 && buf != NULL &&
1841 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1842 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1843 				error = EFAULT;
1844 			}
1845 		}
1846 
1847 		/*
1848 		 * Copy out the transfer request so that the value of
1849 		 * vdx_result_len can be made available, regardless of any
1850 		 * error(s) which may have occurred.
1851 		 */
1852 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1853 			error = (error != 0) ? error : EFAULT;
1854 		}
1855 
1856 		if (buf != NULL) {
1857 			kmem_free(buf, len);
1858 		}
1859 		break;
1860 	}
1861 
1862 	case VM_PAUSE: {
1863 		error = vm_pause_instance(sc->vmm_vm);
1864 		break;
1865 	}
1866 	case VM_RESUME: {
1867 		error = vm_resume_instance(sc->vmm_vm);
1868 		break;
1869 	}
1870 
1871 	default:
1872 		error = ENOTTY;
1873 		break;
1874 	}
1875 
1876 	/* Release exclusion resources */
1877 	switch (lock_type) {
1878 	case LOCK_NONE:
1879 		break;
1880 	case LOCK_VCPU:
1881 		vcpu_unlock_one(sc, vcpu);
1882 		break;
1883 	case LOCK_READ_HOLD:
1884 		vmm_read_unlock(sc);
1885 		break;
1886 	case LOCK_WRITE_HOLD:
1887 		vmm_write_unlock(sc);
1888 		break;
1889 	default:
1890 		panic("unexpected lock type");
1891 		break;
1892 	}
1893 
1894 	return (error);
1895 }
1896 
1897 static vmm_softc_t *
1898 vmm_lookup(const char *name)
1899 {
1900 	list_t *vml = &vmm_list;
1901 	vmm_softc_t *sc;
1902 
1903 	ASSERT(MUTEX_HELD(&vmm_mtx));
1904 
1905 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1906 		if (strcmp(sc->vmm_name, name) == 0) {
1907 			break;
1908 		}
1909 	}
1910 
1911 	return (sc);
1912 }
1913 
1914 /*
1915  * Acquire an HMA registration if not already held.
1916  */
1917 static boolean_t
1918 vmm_hma_acquire(void)
1919 {
1920 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1921 
1922 	mutex_enter(&vmmdev_mtx);
1923 
1924 	if (vmmdev_hma_reg == NULL) {
1925 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1926 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1927 		if (vmmdev_hma_reg == NULL) {
1928 			cmn_err(CE_WARN, "%s HMA registration failed.",
1929 			    vmmdev_hvm_name);
1930 			mutex_exit(&vmmdev_mtx);
1931 			return (B_FALSE);
1932 		}
1933 	}
1934 
1935 	vmmdev_hma_ref++;
1936 
1937 	mutex_exit(&vmmdev_mtx);
1938 
1939 	return (B_TRUE);
1940 }
1941 
1942 /*
1943  * Release the HMA registration if held and there are no remaining VMs.
1944  */
1945 static void
1946 vmm_hma_release(void)
1947 {
1948 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1949 
1950 	mutex_enter(&vmmdev_mtx);
1951 
1952 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1953 
1954 	vmmdev_hma_ref--;
1955 
1956 	if (vmmdev_hma_ref == 0) {
1957 		VERIFY(vmmdev_hma_reg != NULL);
1958 		hma_unregister(vmmdev_hma_reg);
1959 		vmmdev_hma_reg = NULL;
1960 	}
1961 	mutex_exit(&vmmdev_mtx);
1962 }
1963 
1964 static int
1965 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1966 {
1967 	vmm_softc_t	*sc = NULL;
1968 	minor_t		minor;
1969 	int		error = ENOMEM;
1970 	size_t		len;
1971 	const char	*name = req->name;
1972 
1973 	len = strnlen(name, VM_MAX_NAMELEN);
1974 	if (len == 0) {
1975 		return (EINVAL);
1976 	}
1977 	if (len >= VM_MAX_NAMELEN) {
1978 		return (ENAMETOOLONG);
1979 	}
1980 	if (strchr(name, '/') != NULL) {
1981 		return (EINVAL);
1982 	}
1983 
1984 	if (!vmm_hma_acquire())
1985 		return (ENXIO);
1986 
1987 	mutex_enter(&vmm_mtx);
1988 
1989 	/* Look for duplicate names */
1990 	if (vmm_lookup(name) != NULL) {
1991 		mutex_exit(&vmm_mtx);
1992 		vmm_hma_release();
1993 		return (EEXIST);
1994 	}
1995 
1996 	/* Allow only one instance per non-global zone. */
1997 	if (!INGLOBALZONE(curproc)) {
1998 		for (sc = list_head(&vmm_list); sc != NULL;
1999 		    sc = list_next(&vmm_list, sc)) {
2000 			if (sc->vmm_zone == curzone) {
2001 				mutex_exit(&vmm_mtx);
2002 				vmm_hma_release();
2003 				return (EINVAL);
2004 			}
2005 		}
2006 	}
2007 
2008 	minor = id_alloc(vmm_minors);
2009 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2010 		goto fail;
2011 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2012 		ddi_soft_state_free(vmm_statep, minor);
2013 		goto fail;
2014 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2015 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
2016 		goto fail;
2017 	}
2018 
2019 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2020 		goto fail;
2021 	}
2022 
2023 	error = vm_create(req->flags, &sc->vmm_vm);
2024 	if (error == 0) {
2025 		/* Complete VM intialization and report success. */
2026 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2027 		sc->vmm_minor = minor;
2028 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2029 		    offsetof(vmm_devmem_entry_t, vde_node));
2030 
2031 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2032 		    offsetof(vmm_hold_t, vmh_node));
2033 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2034 
2035 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2036 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2037 		    offsetof(vmm_lease_t, vml_node));
2038 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2039 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2040 
2041 		sc->vmm_zone = crgetzone(cr);
2042 		zone_hold(sc->vmm_zone);
2043 		vmm_zsd_add_vm(sc);
2044 		vmm_kstat_init(sc);
2045 
2046 		list_insert_tail(&vmm_list, sc);
2047 		mutex_exit(&vmm_mtx);
2048 		return (0);
2049 	}
2050 
2051 	vmm_kstat_fini(sc);
2052 	ddi_remove_minor_node(vmmdev_dip, name);
2053 fail:
2054 	id_free(vmm_minors, minor);
2055 	if (sc != NULL) {
2056 		ddi_soft_state_free(vmm_statep, minor);
2057 	}
2058 	mutex_exit(&vmm_mtx);
2059 	vmm_hma_release();
2060 
2061 	return (error);
2062 }
2063 
2064 /*
2065  * Bhyve 'Driver' Interface
2066  *
2067  * While many devices are emulated in the bhyve userspace process, there are
2068  * others with performance constraints which require that they run mostly or
2069  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2070  * needed so they can query/manipulate the portions of VM state needed to
2071  * fulfill their purpose.
2072  *
2073  * This includes:
2074  * - Translating guest-physical addresses to host-virtual pointers
2075  * - Injecting MSIs
2076  * - Hooking IO port addresses
2077  *
2078  * The vmm_drv interface exists to provide that functionality to its consumers.
2079  * (At this time, 'viona' is the only user)
2080  */
2081 int
2082 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2083 {
2084 	vnode_t *vp = fp->f_vnode;
2085 	const dev_t dev = vp->v_rdev;
2086 	vmm_softc_t *sc;
2087 	vmm_hold_t *hold;
2088 	int err = 0;
2089 
2090 	if (vp->v_type != VCHR) {
2091 		return (ENXIO);
2092 	}
2093 	const major_t major = getmajor(dev);
2094 	const minor_t minor = getminor(dev);
2095 
2096 	mutex_enter(&vmmdev_mtx);
2097 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2098 		mutex_exit(&vmmdev_mtx);
2099 		return (ENOENT);
2100 	}
2101 	mutex_enter(&vmm_mtx);
2102 	mutex_exit(&vmmdev_mtx);
2103 
2104 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2105 		err = ENOENT;
2106 		goto out;
2107 	}
2108 	/* XXXJOY: check cred permissions against instance */
2109 
2110 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2111 		err = EBUSY;
2112 		goto out;
2113 	}
2114 
2115 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2116 	hold->vmh_sc = sc;
2117 	hold->vmh_release_req = B_FALSE;
2118 
2119 	list_insert_tail(&sc->vmm_holds, hold);
2120 	sc->vmm_flags |= VMM_HELD;
2121 	*holdp = hold;
2122 
2123 out:
2124 	mutex_exit(&vmm_mtx);
2125 	return (err);
2126 }
2127 
2128 void
2129 vmm_drv_rele(vmm_hold_t *hold)
2130 {
2131 	vmm_softc_t *sc;
2132 	bool hma_release = false;
2133 
2134 	ASSERT(hold != NULL);
2135 	ASSERT(hold->vmh_sc != NULL);
2136 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2137 
2138 	mutex_enter(&vmm_mtx);
2139 	sc = hold->vmh_sc;
2140 	list_remove(&sc->vmm_holds, hold);
2141 	kmem_free(hold, sizeof (*hold));
2142 
2143 	if (list_is_empty(&sc->vmm_holds)) {
2144 		sc->vmm_flags &= ~VMM_HELD;
2145 
2146 		/*
2147 		 * Since outstanding holds would prevent instance destruction
2148 		 * from completing, attempt to finish it now if it was already
2149 		 * set in motion.
2150 		 */
2151 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2152 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2153 			    &hma_release));
2154 		}
2155 	}
2156 	mutex_exit(&vmm_mtx);
2157 
2158 	if (hma_release) {
2159 		vmm_hma_release();
2160 	}
2161 }
2162 
2163 boolean_t
2164 vmm_drv_release_reqd(vmm_hold_t *hold)
2165 {
2166 	ASSERT(hold != NULL);
2167 
2168 	return (hold->vmh_release_req);
2169 }
2170 
2171 vmm_lease_t *
2172 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2173 {
2174 	vmm_softc_t *sc = hold->vmh_sc;
2175 	vmm_lease_t *lease;
2176 
2177 	ASSERT3P(expiref, !=, NULL);
2178 
2179 	if (hold->vmh_release_req) {
2180 		return (NULL);
2181 	}
2182 
2183 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2184 	list_link_init(&lease->vml_node);
2185 	lease->vml_expire_func = expiref;
2186 	lease->vml_expire_arg = arg;
2187 	lease->vml_expired = B_FALSE;
2188 	lease->vml_break_deferred = B_FALSE;
2189 	lease->vml_hold = hold;
2190 	/* cache the VM pointer for one less pointer chase */
2191 	lease->vml_vm = sc->vmm_vm;
2192 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2193 
2194 	mutex_enter(&sc->vmm_lease_lock);
2195 	while (sc->vmm_lease_blocker != 0) {
2196 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2197 	}
2198 	list_insert_tail(&sc->vmm_lease_list, lease);
2199 	vmm_read_lock(sc);
2200 	mutex_exit(&sc->vmm_lease_lock);
2201 
2202 	return (lease);
2203 }
2204 
2205 static void
2206 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2207 {
2208 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2209 
2210 	list_remove(&sc->vmm_lease_list, lease);
2211 	vmm_read_unlock(sc);
2212 	vmc_destroy(lease->vml_vmclient);
2213 	kmem_free(lease, sizeof (*lease));
2214 }
2215 
/*
 * Block the signing of new leases on an instance and wait until every
 * existing lease has been broken.
 *
 * The blocker count is incremented under vmm_lease_lock.  The thread which
 * raises it from 0 to 1 performs the clean-up: it marks each lease expired,
 * calls its expiration callback, and breaks leases either synchronously (the
 * callback returned true) or once the holder has flagged the lease via
 * vml_break_deferred.  Any other caller simply waits on vmm_lease_cv until
 * the lease list is empty.
 *
 * Each call must be balanced by a later vmm_lease_unblock().
 */
static void
vmm_lease_block(vmm_softc_t *sc)
{
	mutex_enter(&sc->vmm_lease_lock);
	/* Guard against the blocker count wrapping. */
	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
	sc->vmm_lease_blocker++;
	if (sc->vmm_lease_blocker == 1) {
		list_t *list = &sc->vmm_lease_list;
		vmm_lease_t *lease = list_head(list);

		/* First pass: notify every lease holder of the expiration. */
		while (lease != NULL) {
			void *arg = lease->vml_expire_arg;
			boolean_t (*expiref)(void *) = lease->vml_expire_func;
			boolean_t sync_break = B_FALSE;

			/*
			 * Since the lease expiration notification may
			 * need to take locks which would deadlock with
			 * vmm_lease_lock, drop it across the call.
			 *
			 * We are the only one allowed to manipulate
			 * vmm_lease_list right now, so it is safe to
			 * continue iterating through it after
			 * reacquiring the lock.
			 */
			lease->vml_expired = B_TRUE;
			mutex_exit(&sc->vmm_lease_lock);
			sync_break = expiref(arg);
			mutex_enter(&sc->vmm_lease_lock);

			if (sync_break) {
				vmm_lease_t *next;

				/*
				 * These leases which are synchronously broken
				 * result in vmm_read_unlock() calls from a
				 * different thread than the corresponding
				 * vmm_read_lock().  This is acceptable, given
				 * that the rwlock underpinning the whole
				 * mechanism tolerates the behavior.  This
				 * flexibility is _only_ afforded to VM read
				 * lock (RW_READER) holders.
				 */
				/* Fetch the successor before the lease is freed. */
				next = list_next(list, lease);
				vmm_lease_break_locked(sc, lease);
				lease = next;
			} else {
				lease = list_next(list, lease);
			}
		}

		/* Process leases which were not broken synchronously. */
		while (!list_is_empty(list)) {
			/*
			 * Although the nested loops are quadratic, the number
			 * of leases is small.
			 */
			lease = list_head(list);
			while (lease != NULL) {
				/* Fetch the successor before a possible free. */
				vmm_lease_t *next = list_next(list, lease);
				if (lease->vml_break_deferred) {
					vmm_lease_break_locked(sc, lease);
				}
				lease = next;
			}
			if (list_is_empty(list)) {
				break;
			}
			/*
			 * Leases remain which have not yet been flagged for
			 * deferred breaking; vmm_drv_lease_break() broadcasts
			 * on vmm_lease_cv when it flags one.
			 */
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
		/* Wake anyone else waiting for the lease list to be empty */
		cv_broadcast(&sc->vmm_lease_cv);
	} else {
		list_t *list = &sc->vmm_lease_list;

		/*
		 * Some other thread beat us to the duty of lease cleanup.
		 * Wait until that is complete.
		 */
		while (!list_is_empty(list)) {
			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
		}
	}
	mutex_exit(&sc->vmm_lease_lock);
}
2301 
2302 static void
2303 vmm_lease_unblock(vmm_softc_t *sc)
2304 {
2305 	mutex_enter(&sc->vmm_lease_lock);
2306 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2307 	sc->vmm_lease_blocker--;
2308 	if (sc->vmm_lease_blocker == 0) {
2309 		cv_broadcast(&sc->vmm_lease_cv);
2310 	}
2311 	mutex_exit(&sc->vmm_lease_lock);
2312 }
2313 
2314 void
2315 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2316 {
2317 	vmm_softc_t *sc = hold->vmh_sc;
2318 
2319 	VERIFY3P(hold, ==, lease->vml_hold);
2320 	VERIFY(!lease->vml_break_deferred);
2321 
2322 	mutex_enter(&sc->vmm_lease_lock);
2323 	if (sc->vmm_lease_blocker == 0) {
2324 		vmm_lease_break_locked(sc, lease);
2325 	} else {
2326 		/*
2327 		 * Defer the lease-breaking to whichever thread is currently
2328 		 * cleaning up all leases as part of a vmm_lease_block() call.
2329 		 */
2330 		lease->vml_break_deferred = B_TRUE;
2331 		cv_broadcast(&sc->vmm_lease_cv);
2332 	}
2333 	mutex_exit(&sc->vmm_lease_lock);
2334 }
2335 
2336 boolean_t
2337 vmm_drv_lease_expired(vmm_lease_t *lease)
2338 {
2339 	return (lease->vml_expired);
2340 }
2341 
2342 vmm_page_t *
2343 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2344 {
2345 	ASSERT(lease != NULL);
2346 	ASSERT0(gpa & PAGEOFFSET);
2347 
2348 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2349 }
2350 
2351 
2352 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2353 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2354 
2355 vmm_page_t *
2356 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2357 {
2358 	ASSERT(lease != NULL);
2359 	ASSERT0(gpa & PAGEOFFSET);
2360 
2361 	vmm_page_t *page =
2362 	    (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2363 	return (page);
2364 }
2365 
2366 void
2367 vmm_drv_page_release(vmm_page_t *vmmp)
2368 {
2369 	(void) vmp_release((vm_page_t *)vmmp);
2370 }
2371 
2372 void
2373 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2374 {
2375 	(void) vmp_release_chain((vm_page_t *)vmmp);
2376 }
2377 
2378 const void *
2379 vmm_drv_page_readable(const vmm_page_t *vmmp)
2380 {
2381 	return (vmp_get_readable((const vm_page_t *)vmmp));
2382 }
2383 
2384 void *
2385 vmm_drv_page_writable(const vmm_page_t *vmmp)
2386 {
2387 	return (vmp_get_writable((const vm_page_t *)vmmp));
2388 }
2389 
2390 void
2391 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2392 {
2393 	return (vmp_mark_dirty((vm_page_t *)vmmp));
2394 }
2395 
2396 void
2397 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2398 {
2399 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2400 }
2401 
2402 vmm_page_t *
2403 vmm_drv_page_next(const vmm_page_t *vmmp)
2404 {
2405 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2406 }
2407 
2408 int
2409 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2410 {
2411 	ASSERT(lease != NULL);
2412 
2413 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2414 }
2415 
2416 int
2417 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2418     void *arg, void **cookie)
2419 {
2420 	vmm_softc_t *sc;
2421 	int err;
2422 
2423 	ASSERT(hold != NULL);
2424 	ASSERT(cookie != NULL);
2425 
2426 	sc = hold->vmh_sc;
2427 	mutex_enter(&vmm_mtx);
2428 	/* Confirm that hook installation is not blocked */
2429 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2430 		mutex_exit(&vmm_mtx);
2431 		return (EBUSY);
2432 	}
2433 	/*
2434 	 * Optimistically record an installed hook which will prevent a block
2435 	 * from being asserted while the mutex is dropped.
2436 	 */
2437 	hold->vmh_ioport_hook_cnt++;
2438 	mutex_exit(&vmm_mtx);
2439 
2440 	vmm_write_lock(sc);
2441 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2442 	    arg, cookie);
2443 	vmm_write_unlock(sc);
2444 
2445 	if (err != 0) {
2446 		mutex_enter(&vmm_mtx);
2447 		/* Walk back optimism about the hook installation */
2448 		hold->vmh_ioport_hook_cnt--;
2449 		mutex_exit(&vmm_mtx);
2450 	}
2451 	return (err);
2452 }
2453 
2454 void
2455 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2456 {
2457 	vmm_softc_t *sc;
2458 
2459 	ASSERT(hold != NULL);
2460 	ASSERT(cookie != NULL);
2461 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2462 
2463 	sc = hold->vmh_sc;
2464 	vmm_write_lock(sc);
2465 	vm_ioport_unhook(sc->vmm_vm, cookie);
2466 	vmm_write_unlock(sc);
2467 
2468 	mutex_enter(&vmm_mtx);
2469 	hold->vmh_ioport_hook_cnt--;
2470 	mutex_exit(&vmm_mtx);
2471 }
2472 
2473 static void
2474 vmm_drv_purge(vmm_softc_t *sc)
2475 {
2476 	ASSERT(MUTEX_HELD(&vmm_mtx));
2477 
2478 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2479 		vmm_hold_t *hold;
2480 
2481 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2482 		    hold = list_next(&sc->vmm_holds, hold)) {
2483 			hold->vmh_release_req = B_TRUE;
2484 		}
2485 
2486 		/*
2487 		 * Require that all leases on the instance be broken, now that
2488 		 * all associated holds have been marked as needing release.
2489 		 *
2490 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2491 		 * lessees are slow to respond, it would be nice to leave it
2492 		 * available for other parties.
2493 		 */
2494 		mutex_exit(&vmm_mtx);
2495 		vmm_lease_block(sc);
2496 		vmm_lease_unblock(sc);
2497 		mutex_enter(&vmm_mtx);
2498 	}
2499 }
2500 
2501 static int
2502 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2503 {
2504 	int err = 0;
2505 
2506 	mutex_enter(&vmm_mtx);
2507 	if (!enable_block) {
2508 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2509 
2510 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2511 		goto done;
2512 	}
2513 
2514 	/* If any holds have hooks installed, the block is a failure */
2515 	if (!list_is_empty(&sc->vmm_holds)) {
2516 		vmm_hold_t *hold;
2517 
2518 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2519 		    hold = list_next(&sc->vmm_holds, hold)) {
2520 			if (hold->vmh_ioport_hook_cnt != 0) {
2521 				err = EBUSY;
2522 				goto done;
2523 			}
2524 		}
2525 	}
2526 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2527 
2528 done:
2529 	mutex_exit(&vmm_mtx);
2530 	return (err);
2531 }
2532 
2533 
2534 static void
2535 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2536 {
2537 	ASSERT(MUTEX_HELD(&vmm_mtx));
2538 	ASSERT0(sc->vmm_flags & VMM_DESTROY);
2539 
2540 	sc->vmm_flags |= VMM_DESTROY;
2541 
2542 	/*
2543 	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2544 	 * of guest context, being unable to return now that the instance is
2545 	 * marked for destruction.
2546 	 */
2547 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2548 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2549 		vcpu_lock_one(sc, vcpu);
2550 		vcpu_unlock_one(sc, vcpu);
2551 	}
2552 
2553 	vmmdev_devmem_purge(sc);
2554 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2555 		/*
2556 		 * The ZSD should be cleaned up now, unless destruction of the
2557 		 * instance was initated by destruction of the containing zone,
2558 		 * in which case the ZSD has already been removed.
2559 		 */
2560 		vmm_zsd_rem_vm(sc);
2561 	}
2562 	zone_rele(sc->vmm_zone);
2563 
2564 	vmm_drv_purge(sc);
2565 }
2566 
2567 static bool
2568 vmm_destroy_ready(vmm_softc_t *sc)
2569 {
2570 	ASSERT(MUTEX_HELD(&vmm_mtx));
2571 
2572 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2573 		VERIFY(list_is_empty(&sc->vmm_holds));
2574 		return (true);
2575 	}
2576 
2577 	return (false);
2578 }
2579 
2580 static void
2581 vmm_destroy_finish(vmm_softc_t *sc)
2582 {
2583 	ASSERT(MUTEX_HELD(&vmm_mtx));
2584 	ASSERT(vmm_destroy_ready(sc));
2585 
2586 	list_remove(&vmm_list, sc);
2587 	vmm_kstat_fini(sc);
2588 	vm_destroy(sc->vmm_vm);
2589 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2590 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2591 
2592 	const minor_t minor = sc->vmm_minor;
2593 	ddi_soft_state_free(vmm_statep, minor);
2594 	id_free(vmm_minors, minor);
2595 }
2596 
/*
 * Initiate or attempt to finish destruction of a VMM instance.
 *
 * This is called from several contexts:
 * - An explicit destroy ioctl is made
 * - A vmm_drv consumer releases its hold (being the last on the instance)
 * - The vmm device is closed, and auto-destruct is enabled
 *
 * The caller must hold vmm_mtx.  On return, *hma_release is true only if the
 * instance was fully destroyed by this call, in which case the softc has been
 * freed and the caller is expected to release the HMA hold.
 *
 * Returns 0 on success (including the case where destruction has merely been
 * started and the caller did not ask to wait), or EINTR if a VDO_ATTEMPT_WAIT
 * caller's wait ended with cv_wait_sig() returning a value <= 0.
 */
static int
vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
    bool *hma_release)
{
	ASSERT(MUTEX_HELD(&vmm_mtx));

	*hma_release = false;

	/*
	 * When instance destruction begins, it is so marked such that any
	 * further requests to operate the instance will fail.
	 */
	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
		vmm_destroy_begin(sc, opts);
	}

	if (vmm_destroy_ready(sc)) {

		/*
		 * Notify anyone waiting for the destruction to finish.  They
		 * must be clear before we can safely tear down the softc.
		 */
		if (sc->vmm_destroy_waiters != 0) {
			cv_broadcast(&sc->vmm_cv);
			/*
			 * The last waiter to leave signals vmm_cv (see the
			 * VDO_ATTEMPT_WAIT branch below).
			 */
			while (sc->vmm_destroy_waiters != 0) {
				cv_wait(&sc->vmm_cv, &vmm_mtx);
			}
		}

		/*
		 * Finish destruction of instance.  After this point, the softc
		 * is freed and cannot be accessed again.
		 *
		 * With destruction complete, the HMA hold can be released
		 */
		vmm_destroy_finish(sc);
		*hma_release = true;
		return (0);
	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
		int err = 0;

		/*
		 * Register as a waiter, then sleep until the instance becomes
		 * ready for destruction or the wait is interrupted.
		 */
		sc->vmm_destroy_waiters++;
		while (!vmm_destroy_ready(sc) && err == 0) {
			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
				err = EINTR;
			}
		}
		sc->vmm_destroy_waiters--;

		if (sc->vmm_destroy_waiters == 0) {
			/*
			 * If we were the last waiter, it could be that VM
			 * destruction is waiting on _us_ to proceed with the
			 * final clean-up.
			 */
			cv_signal(&sc->vmm_cv);
		}
		return (err);
	} else {
		/*
		 * Since the instance is not ready for destruction, and the
		 * caller did not ask to wait, consider it a success for now.
		 */
		return (0);
	}
}
2671 
2672 void
2673 vmm_zone_vm_destroy(vmm_softc_t *sc)
2674 {
2675 	bool hma_release = false;
2676 	int err;
2677 
2678 	mutex_enter(&vmm_mtx);
2679 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2680 	mutex_exit(&vmm_mtx);
2681 
2682 	VERIFY0(err);
2683 
2684 	if (hma_release) {
2685 		vmm_hma_release();
2686 	}
2687 }
2688 
2689 static int
2690 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2691 {
2692 	vmm_softc_t *sc;
2693 	bool hma_release = false;
2694 	int err;
2695 
2696 	if (crgetuid(cr) != 0) {
2697 		return (EPERM);
2698 	}
2699 
2700 	mutex_enter(&vmm_mtx);
2701 	sc = vmm_lookup(req->name);
2702 	if (sc == NULL) {
2703 		mutex_exit(&vmm_mtx);
2704 		return (ENOENT);
2705 	}
2706 	/*
2707 	 * We don't check this in vmm_lookup() since that function is also used
2708 	 * for validation during create and currently vmm names must be unique.
2709 	 */
2710 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2711 		mutex_exit(&vmm_mtx);
2712 		return (EPERM);
2713 	}
2714 
2715 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2716 	mutex_exit(&vmm_mtx);
2717 
2718 	if (hma_release) {
2719 		vmm_hma_release();
2720 	}
2721 
2722 	return (err);
2723 }
2724 
2725 #define	VCPU_NAME_BUFLEN	32
2726 
2727 static int
2728 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2729 {
2730 	zoneid_t zid = crgetzoneid(cr);
2731 	int instance = minor;
2732 	kstat_t *ksp;
2733 
2734 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2735 
2736 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2737 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2738 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2739 
2740 	if (ksp == NULL) {
2741 		return (-1);
2742 	}
2743 	sc->vmm_kstat_vm = ksp;
2744 
2745 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2746 		char namebuf[VCPU_NAME_BUFLEN];
2747 
2748 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2749 
2750 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2751 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2752 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2753 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2754 		    0, zid);
2755 		if (ksp == NULL) {
2756 			goto fail;
2757 		}
2758 
2759 		sc->vmm_kstat_vcpu[i] = ksp;
2760 	}
2761 
2762 	/*
2763 	 * If this instance is associated with a non-global zone, make its
2764 	 * kstats visible from the GZ.
2765 	 */
2766 	if (zid != GLOBAL_ZONEID) {
2767 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2768 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2769 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2770 		}
2771 	}
2772 
2773 	return (0);
2774 
2775 fail:
2776 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2777 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2778 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2779 			sc->vmm_kstat_vcpu[i] = NULL;
2780 		} else {
2781 			break;
2782 		}
2783 	}
2784 	kstat_delete(sc->vmm_kstat_vm);
2785 	sc->vmm_kstat_vm = NULL;
2786 	return (-1);
2787 }
2788 
2789 static void
2790 vmm_kstat_init(vmm_softc_t *sc)
2791 {
2792 	kstat_t *ksp;
2793 
2794 	ASSERT3P(sc->vmm_vm, !=, NULL);
2795 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2796 
2797 	ksp = sc->vmm_kstat_vm;
2798 	vmm_kstats_t *vk = ksp->ks_data;
2799 	ksp->ks_private = sc->vmm_vm;
2800 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2801 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2802 
2803 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2804 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2805 
2806 		ksp = sc->vmm_kstat_vcpu[i];
2807 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2808 
2809 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2810 		vvk->vvk_vcpu.value.ui32 = i;
2811 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2812 		    KSTAT_DATA_UINT64);
2813 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2814 		    KSTAT_DATA_UINT64);
2815 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2816 		    KSTAT_DATA_UINT64);
2817 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2818 		    KSTAT_DATA_UINT64);
2819 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2820 		    KSTAT_DATA_UINT64);
2821 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2822 		    KSTAT_DATA_UINT64);
2823 		ksp->ks_private = sc->vmm_vm;
2824 		ksp->ks_update = vmm_kstat_update_vcpu;
2825 	}
2826 
2827 	kstat_install(sc->vmm_kstat_vm);
2828 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2829 		kstat_install(sc->vmm_kstat_vcpu[i]);
2830 	}
2831 }
2832 
2833 static void
2834 vmm_kstat_fini(vmm_softc_t *sc)
2835 {
2836 	ASSERT(sc->vmm_kstat_vm != NULL);
2837 
2838 	kstat_delete(sc->vmm_kstat_vm);
2839 	sc->vmm_kstat_vm = NULL;
2840 
2841 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2842 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2843 
2844 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2845 		sc->vmm_kstat_vcpu[i] = NULL;
2846 	}
2847 }
2848 
2849 static int
2850 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2851 {
2852 	minor_t		minor;
2853 	vmm_softc_t	*sc;
2854 
2855 	/*
2856 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2857 	 * verified to be safe.
2858 	 */
2859 	if (curproc->p_model != DATAMODEL_LP64) {
2860 		return (EFBIG);
2861 	}
2862 
2863 	minor = getminor(*devp);
2864 	if (minor == VMM_CTL_MINOR) {
2865 		/*
2866 		 * Master control device must be opened exclusively.
2867 		 */
2868 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2869 			return (EINVAL);
2870 		}
2871 
2872 		return (0);
2873 	}
2874 
2875 	mutex_enter(&vmm_mtx);
2876 	sc = ddi_get_soft_state(vmm_statep, minor);
2877 	if (sc == NULL) {
2878 		mutex_exit(&vmm_mtx);
2879 		return (ENXIO);
2880 	}
2881 
2882 	sc->vmm_flags |= VMM_IS_OPEN;
2883 	mutex_exit(&vmm_mtx);
2884 
2885 	return (0);
2886 }
2887 
2888 static int
2889 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2890 {
2891 	const minor_t minor = getminor(dev);
2892 	vmm_softc_t *sc;
2893 	bool hma_release = false;
2894 
2895 	if (minor == VMM_CTL_MINOR) {
2896 		return (0);
2897 	}
2898 
2899 	mutex_enter(&vmm_mtx);
2900 	sc = ddi_get_soft_state(vmm_statep, minor);
2901 	if (sc == NULL) {
2902 		mutex_exit(&vmm_mtx);
2903 		return (ENXIO);
2904 	}
2905 
2906 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2907 	sc->vmm_flags &= ~VMM_IS_OPEN;
2908 
2909 	/*
2910 	 * If instance was marked for auto-destruction begin that now.  Instance
2911 	 * destruction may have been initated already, so try to make progress
2912 	 * in that case, since closure of the device is one of its requirements.
2913 	 */
2914 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2915 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2916 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2917 	}
2918 	mutex_exit(&vmm_mtx);
2919 
2920 	if (hma_release) {
2921 		vmm_hma_release();
2922 	}
2923 
2924 	return (0);
2925 }
2926 
2927 static int
2928 vmm_is_supported(intptr_t arg)
2929 {
2930 	int r;
2931 	const char *msg;
2932 
2933 	if (vmm_is_intel()) {
2934 		r = vmx_x86_supported(&msg);
2935 	} else if (vmm_is_svm()) {
2936 		/*
2937 		 * HMA already ensured that the features necessary for SVM
2938 		 * operation were present and online during vmm_attach().
2939 		 */
2940 		r = 0;
2941 	} else {
2942 		r = ENXIO;
2943 		msg = "Unsupported CPU vendor";
2944 	}
2945 
2946 	if (r != 0 && arg != (intptr_t)NULL) {
2947 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2948 			return (EFAULT);
2949 	}
2950 	return (r);
2951 }
2952 
2953 static int
2954 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2955 {
2956 	void *argp = (void *)arg;
2957 
2958 	switch (cmd) {
2959 	case VMM_CREATE_VM: {
2960 		struct vm_create_req req;
2961 
2962 		if ((md & FWRITE) == 0) {
2963 			return (EPERM);
2964 		}
2965 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2966 			return (EFAULT);
2967 		}
2968 		return (vmmdev_do_vm_create(&req, cr));
2969 	}
2970 	case VMM_DESTROY_VM: {
2971 		struct vm_destroy_req req;
2972 
2973 		if ((md & FWRITE) == 0) {
2974 			return (EPERM);
2975 		}
2976 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2977 			return (EFAULT);
2978 		}
2979 		return (vmmdev_do_vm_destroy(&req, cr));
2980 	}
2981 	case VMM_VM_SUPPORTED:
2982 		return (vmm_is_supported(arg));
2983 	case VMM_CHECK_IOMMU:
2984 		if (!vmm_check_iommu()) {
2985 			return (ENXIO);
2986 		}
2987 		return (0);
2988 	case VMM_RESV_QUERY:
2989 	case VMM_RESV_SET_TARGET:
2990 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2991 	default:
2992 		break;
2993 	}
2994 	/* No other actions are legal on ctl device */
2995 	return (ENOTTY);
2996 }
2997 
2998 static int
2999 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3000     int *rvalp)
3001 {
3002 	vmm_softc_t	*sc;
3003 	minor_t		minor;
3004 
3005 	/*
3006 	 * Forbid running bhyve in a 32-bit process until it has been tested and
3007 	 * verified to be safe.
3008 	 */
3009 	if (curproc->p_model != DATAMODEL_LP64) {
3010 		return (EFBIG);
3011 	}
3012 
3013 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
3014 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3015 		return (ENOTSUP);
3016 	}
3017 
3018 	/*
3019 	 * Regardless of minor (vmmctl or instance), we respond to queries of
3020 	 * the interface version.
3021 	 */
3022 	if (cmd == VMM_INTERFACE_VERSION) {
3023 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
3024 		return (0);
3025 	}
3026 
3027 	minor = getminor(dev);
3028 
3029 	if (minor == VMM_CTL_MINOR) {
3030 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3031 	}
3032 
3033 	sc = ddi_get_soft_state(vmm_statep, minor);
3034 	ASSERT(sc != NULL);
3035 
3036 	/*
3037 	 * Turn away any ioctls against an instance when it is being destroyed.
3038 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
3039 	 */
3040 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3041 		if (cmd == VM_DESTROY_PENDING) {
3042 			*rvalp = 1;
3043 			return (0);
3044 		}
3045 		return (ENXIO);
3046 	}
3047 
3048 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3049 }
3050 
3051 static int
3052 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3053     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3054 {
3055 	vmm_softc_t *sc;
3056 	const minor_t minor = getminor(dev);
3057 	int err;
3058 
3059 	if (minor == VMM_CTL_MINOR) {
3060 		return (ENODEV);
3061 	}
3062 	if (off < 0 || (off + len) <= 0) {
3063 		return (EINVAL);
3064 	}
3065 	if ((prot & PROT_USER) == 0) {
3066 		return (EACCES);
3067 	}
3068 
3069 	sc = ddi_get_soft_state(vmm_statep, minor);
3070 	ASSERT(sc);
3071 
3072 	if (sc->vmm_flags & VMM_DESTROY)
3073 		return (ENXIO);
3074 
3075 	/* Grab read lock on the VM to prevent any changes to the memory map */
3076 	vmm_read_lock(sc);
3077 
3078 	if (off >= VM_DEVMEM_START) {
3079 		int segid;
3080 		off_t segoff;
3081 
3082 		/* Mapping a devmem "device" */
3083 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3084 			err = ENODEV;
3085 		} else {
3086 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3087 			    addrp, prot, maxprot, flags);
3088 		}
3089 	} else {
3090 		/* Mapping a part of the guest physical space */
3091 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3092 		    maxprot, flags);
3093 	}
3094 
3095 	vmm_read_unlock(sc);
3096 	return (err);
3097 }
3098 
3099 static sdev_plugin_validate_t
3100 vmm_sdev_validate(sdev_ctx_t ctx)
3101 {
3102 	const char *name = sdev_ctx_name(ctx);
3103 	vmm_softc_t *sc;
3104 	sdev_plugin_validate_t ret;
3105 	minor_t minor;
3106 
3107 	if (sdev_ctx_vtype(ctx) != VCHR)
3108 		return (SDEV_VTOR_INVALID);
3109 
3110 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3111 
3112 	mutex_enter(&vmm_mtx);
3113 	if ((sc = vmm_lookup(name)) == NULL)
3114 		ret = SDEV_VTOR_INVALID;
3115 	else if (sc->vmm_minor != minor)
3116 		ret = SDEV_VTOR_STALE;
3117 	else
3118 		ret = SDEV_VTOR_VALID;
3119 	mutex_exit(&vmm_mtx);
3120 
3121 	return (ret);
3122 }
3123 
3124 static int
3125 vmm_sdev_filldir(sdev_ctx_t ctx)
3126 {
3127 	vmm_softc_t *sc;
3128 	int ret;
3129 
3130 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3131 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3132 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3133 		return (EINVAL);
3134 	}
3135 
3136 	mutex_enter(&vmm_mtx);
3137 	ASSERT(vmmdev_dip != NULL);
3138 	for (sc = list_head(&vmm_list); sc != NULL;
3139 	    sc = list_next(&vmm_list, sc)) {
3140 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3141 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3142 			    S_IFCHR | 0600,
3143 			    makedevice(ddi_driver_major(vmmdev_dip),
3144 			    sc->vmm_minor));
3145 		} else {
3146 			continue;
3147 		}
3148 		if (ret != 0 && ret != EEXIST)
3149 			goto out;
3150 	}
3151 
3152 	ret = 0;
3153 
3154 out:
3155 	mutex_exit(&vmm_mtx);
3156 	return (ret);
3157 }
3158 
/*
 * sdev plugin "inactive" callback (installed as spo_inactive in vmm_sdev_ops).
 * It intentionally does nothing.
 */
/* ARGSUSED */
static void
vmm_sdev_inactive(sdev_ctx_t ctx)
{
}
3164 
3165 static sdev_plugin_ops_t vmm_sdev_ops = {
3166 	.spo_version = SDEV_PLUGIN_VERSION,
3167 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3168 	.spo_validate = vmm_sdev_validate,
3169 	.spo_filldir = vmm_sdev_filldir,
3170 	.spo_inactive = vmm_sdev_inactive
3171 };
3172 
3173 /* ARGSUSED */
3174 static int
3175 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3176 {
3177 	int error;
3178 
3179 	switch (cmd) {
3180 	case DDI_INFO_DEVT2DEVINFO:
3181 		*result = (void *)vmmdev_dip;
3182 		error = DDI_SUCCESS;
3183 		break;
3184 	case DDI_INFO_DEVT2INSTANCE:
3185 		*result = (void *)0;
3186 		error = DDI_SUCCESS;
3187 		break;
3188 	default:
3189 		error = DDI_FAILURE;
3190 		break;
3191 	}
3192 	return (error);
3193 }
3194 
3195 static int
3196 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3197 {
3198 	sdev_plugin_hdl_t sph;
3199 	hma_reg_t *reg = NULL;
3200 	boolean_t vmm_loaded = B_FALSE;
3201 
3202 	if (cmd != DDI_ATTACH) {
3203 		return (DDI_FAILURE);
3204 	}
3205 
3206 	mutex_enter(&vmmdev_mtx);
3207 	/* Ensure we are not already attached. */
3208 	if (vmmdev_dip != NULL) {
3209 		mutex_exit(&vmmdev_mtx);
3210 		return (DDI_FAILURE);
3211 	}
3212 
3213 	vmm_sol_glue_init();
3214 
3215 	/*
3216 	 * Perform temporary HMA registration to determine if the system
3217 	 * is capable.
3218 	 */
3219 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3220 		goto fail;
3221 	} else if (vmm_mod_load() != 0) {
3222 		goto fail;
3223 	}
3224 	vmm_loaded = B_TRUE;
3225 	hma_unregister(reg);
3226 	reg = NULL;
3227 
3228 	/* Create control node.  Other nodes will be created on demand. */
3229 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3230 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3231 		goto fail;
3232 	}
3233 
3234 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3235 	if (sph == (sdev_plugin_hdl_t)NULL) {
3236 		ddi_remove_minor_node(dip, NULL);
3237 		goto fail;
3238 	}
3239 
3240 	ddi_report_dev(dip);
3241 	vmmdev_sdev_hdl = sph;
3242 	vmmdev_dip = dip;
3243 	mutex_exit(&vmmdev_mtx);
3244 	return (DDI_SUCCESS);
3245 
3246 fail:
3247 	if (vmm_loaded) {
3248 		VERIFY0(vmm_mod_unload());
3249 	}
3250 	if (reg != NULL) {
3251 		hma_unregister(reg);
3252 	}
3253 	vmm_sol_glue_cleanup();
3254 	mutex_exit(&vmmdev_mtx);
3255 	return (DDI_FAILURE);
3256 }
3257 
3258 static int
3259 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3260 {
3261 	if (cmd != DDI_DETACH) {
3262 		return (DDI_FAILURE);
3263 	}
3264 
3265 	/*
3266 	 * Ensure that all resources have been cleaned up.
3267 	 *
3268 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3269 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3270 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3271 	 * devinfo, including our own, while holding vmmdev_mtx.
3272 	 */
3273 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3274 		return (DDI_FAILURE);
3275 
3276 	mutex_enter(&vmm_mtx);
3277 	if (!list_is_empty(&vmm_list)) {
3278 		mutex_exit(&vmm_mtx);
3279 		mutex_exit(&vmmdev_mtx);
3280 		return (DDI_FAILURE);
3281 	}
3282 	mutex_exit(&vmm_mtx);
3283 
3284 	if (!vmmr_is_empty()) {
3285 		mutex_exit(&vmmdev_mtx);
3286 		return (DDI_FAILURE);
3287 	}
3288 
3289 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3290 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3291 		mutex_exit(&vmmdev_mtx);
3292 		return (DDI_FAILURE);
3293 	}
3294 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3295 
3296 	/* Remove the control node. */
3297 	ddi_remove_minor_node(dip, "ctl");
3298 	vmmdev_dip = NULL;
3299 
3300 	VERIFY0(vmm_mod_unload());
3301 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3302 	vmm_sol_glue_cleanup();
3303 
3304 	mutex_exit(&vmmdev_mtx);
3305 
3306 	return (DDI_SUCCESS);
3307 }
3308 
3309 static struct cb_ops vmm_cb_ops = {
3310 	vmm_open,
3311 	vmm_close,
3312 	nodev,		/* strategy */
3313 	nodev,		/* print */
3314 	nodev,		/* dump */
3315 	nodev,		/* read */
3316 	nodev,		/* write */
3317 	vmm_ioctl,
3318 	nodev,		/* devmap */
3319 	nodev,		/* mmap */
3320 	vmm_segmap,
3321 	nochpoll,	/* poll */
3322 	ddi_prop_op,
3323 	NULL,
3324 	D_NEW | D_MP | D_DEVMAP
3325 };
3326 
3327 static struct dev_ops vmm_ops = {
3328 	DEVO_REV,
3329 	0,
3330 	vmm_info,
3331 	nulldev,	/* identify */
3332 	nulldev,	/* probe */
3333 	vmm_attach,
3334 	vmm_detach,
3335 	nodev,		/* reset */
3336 	&vmm_cb_ops,
3337 	(struct bus_ops *)NULL
3338 };
3339 
3340 static struct modldrv modldrv = {
3341 	&mod_driverops,
3342 	"bhyve vmm",
3343 	&vmm_ops
3344 };
3345 
3346 static struct modlinkage modlinkage = {
3347 	MODREV_1,
3348 	&modldrv,
3349 	NULL
3350 };
3351 
3352 int
3353 _init(void)
3354 {
3355 	int	error;
3356 
3357 	sysinit();
3358 
3359 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3360 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3361 	list_create(&vmm_list, sizeof (vmm_softc_t),
3362 	    offsetof(vmm_softc_t, vmm_node));
3363 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3364 
3365 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3366 	if (error) {
3367 		return (error);
3368 	}
3369 
3370 	error = vmmr_init();
3371 	if (error) {
3372 		ddi_soft_state_fini(&vmm_statep);
3373 		return (error);
3374 	}
3375 
3376 	vmm_zsd_init();
3377 
3378 	error = mod_install(&modlinkage);
3379 	if (error) {
3380 		ddi_soft_state_fini(&vmm_statep);
3381 		vmm_zsd_fini();
3382 		vmmr_fini();
3383 	}
3384 
3385 	return (error);
3386 }
3387 
3388 int
3389 _fini(void)
3390 {
3391 	int	error;
3392 
3393 	error = mod_remove(&modlinkage);
3394 	if (error) {
3395 		return (error);
3396 	}
3397 
3398 	vmm_zsd_fini();
3399 	vmmr_fini();
3400 
3401 	ddi_soft_state_fini(&vmm_statep);
3402 
3403 	return (0);
3404 }
3405 
3406 int
3407 _info(struct modinfo *modinfop)
3408 {
3409 	return (mod_info(&modlinkage, modinfop));
3410 }
3411