xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 589f9b6223af8482576c4b68c4acc0626246eb32)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2022 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
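/*
 * Illustrative sketch (not part of the driver): a path which needs both locks
 * is expected to follow the ordering documented above, i.e.
 *
 *	mutex_enter(&vmmdev_mtx);
 *	mutex_enter(&vmm_mtx);
 *	...inspect or update vmmdev_* and vmm_* state...
 *	mutex_exit(&vmm_mtx);
 *	mutex_exit(&vmmdev_mtx);
 *
 * Taking vmm_mtx first and then attempting vmmdev_mtx risks deadlocking
 * against a thread honoring that order.
 */
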
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static list_t		vmm_destroy_list;
81 static id_space_t	*vmm_minors;
82 static void		*vmm_statep;
83 
84 /* temporary safety switch */
85 int		vmm_allow_state_writes;
86 
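/*
 * Operational note (an assumption, not an officially supported interface):
 * for testing the data-write paths guarded by vmm_allow_state_writes above,
 * the flag can be flipped at runtime from a live-kernel debugger, e.g.:
 *
 *	echo 'vmm_allow_state_writes/W 1' | mdb -kw
 */
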
87 static const char *vmmdev_hvm_name = "bhyve";
88 
89 /* For sdev plugin (/dev) */
90 #define	VMM_SDEV_ROOT "/dev/vmm"
91 
92 /* From uts/intel/io/vmm/intel/vmx.c */
93 extern int vmx_x86_supported(const char **);
94 
95 /* Holds and hooks from drivers external to vmm */
96 struct vmm_hold {
97 	list_node_t	vmh_node;
98 	vmm_softc_t	*vmh_sc;
99 	boolean_t	vmh_release_req;
100 	uint_t		vmh_ioport_hook_cnt;
101 };
102 
103 struct vmm_lease {
104 	list_node_t		vml_node;
105 	struct vm		*vml_vm;
106 	vm_client_t		*vml_vmclient;
107 	boolean_t		vml_expired;
108 	boolean_t		vml_break_deferred;
109 	boolean_t		(*vml_expire_func)(void *);
110 	void			*vml_expire_arg;
111 	struct vmm_hold		*vml_hold;
112 };
113 
114 /* Options for vmm_destroy_locked */
115 typedef enum vmm_destroy_opts {
116 	VDO_DEFAULT		= 0,
117 	/*
118 	 * Request that zone-specific data associated with this VM not be
119 	 * cleaned up as part of the destroy.  Skipping ZSD clean-up is
120 	 * necessary when the VM is being destroyed as part of zone
121 	 * destruction, since that ZSD is already being cleaned up.
122 	 */
123 	VDO_NO_CLEAN_ZSD	= (1 << 0),
124 	/*
125 	 * Skip any attempt to wait for vmm_drv consumers when purging them
126 	 * from the instance.  When performing an auto-destruct, it is not
127 	 * desirable to wait, since said consumer might exist in a "higher"
128 	 * file descriptor which has not yet been closed.
129 	 */
130 	VDO_NO_PURGE_WAIT	= (1 << 1),
131 } vmm_destroy_opts_t;
132 
133 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, boolean_t *);
134 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
135 static void vmm_lease_block(vmm_softc_t *);
136 static void vmm_lease_unblock(vmm_softc_t *);
137 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
138 static void vmm_kstat_init(vmm_softc_t *);
139 static void vmm_kstat_fini(vmm_softc_t *);
140 
141 /*
142  * The 'devmem' hack:
143  *
144  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
145  * in the vm which appear with their own name related to the vm under /dev.
146  * Since this would be a hassle from an sdev perspective and would require a
147  * new cdev interface (or complicate the existing one), we choose to implement
148  * this in a different manner.  Direct access to the underlying vm memory
149  * segments is exposed by placing them in a range of offsets beyond the normal
150  * guest memory space.  Userspace can query the appropriate offset to mmap()
151  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
152  */
153 
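/*
 * A hedged sketch of the expected userspace pattern (not part of this file;
 * 'vmfd' and 'seg_len' are illustrative names):
 *
 *	struct vm_devmem_offset vdo = { .segid = segid };
 *
 *	if (ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo) == 0) {
 *		void *base = mmap(NULL, seg_len, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, vmfd, vdo.offset);
 *		...
 *	}
 *
 * where vmfd is an open descriptor for the VM's minor node under /dev/vmm and
 * seg_len matches the length used when the devmem segment was allocated.
 */
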
154 static vmm_devmem_entry_t *
155 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
156 {
157 	vmm_devmem_entry_t *ent = NULL;
158 	list_t *dl = &sc->vmm_devmem_list;
159 
160 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
161 		if (ent->vde_segid == segid) {
162 			return (ent);
163 		}
164 	}
165 	return (NULL);
166 }
167 
168 static int
169 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
170 {
171 	int error;
172 	bool sysmem;
173 
174 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
175 	    NULL);
176 	if (error || mseg->len == 0)
177 		return (error);
178 
179 	if (!sysmem) {
180 		vmm_devmem_entry_t *de;
181 
182 		de = vmmdev_devmem_find(sc, mseg->segid);
183 		if (de != NULL) {
184 			(void) strlcpy(mseg->name, de->vde_name,
185 			    sizeof (mseg->name));
186 		}
187 	} else {
188 		bzero(mseg->name, sizeof (mseg->name));
189 	}
190 
191 	return (error);
192 }
193 
194 static int
195 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
196 {
197 	off_t map_offset;
198 	vmm_devmem_entry_t *entry;
199 
200 	if (list_is_empty(&sc->vmm_devmem_list)) {
201 		map_offset = VM_DEVMEM_START;
202 	} else {
203 		entry = list_tail(&sc->vmm_devmem_list);
204 		map_offset = entry->vde_off + entry->vde_len;
205 		if (map_offset < entry->vde_off) {
206 			/* Do not tolerate overflow */
207 			return (ERANGE);
208 		}
209 		/*
210 		 * XXXJOY: We could choose to search the list for duplicate
211 		 * names and toss an error.  Since we're using the offset
212 		 * method for now, it does not make much of a difference.
213 		 */
214 	}
215 
216 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
217 	entry->vde_segid = mseg->segid;
218 	entry->vde_len = mseg->len;
219 	entry->vde_off = map_offset;
220 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
221 	list_insert_tail(&sc->vmm_devmem_list, entry);
222 
223 	return (0);
224 }
225 
226 static boolean_t
227 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
228     off_t *map_offp)
229 {
230 	list_t *dl = &sc->vmm_devmem_list;
231 	vmm_devmem_entry_t *de = NULL;
232 	const off_t map_end = off + len;
233 
234 	VERIFY(off >= VM_DEVMEM_START);
235 
236 	if (map_end < off) {
237 		/* No match on overflow */
238 		return (B_FALSE);
239 	}
240 
241 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
242 		const off_t item_end = de->vde_off + de->vde_len;
243 
244 		if (de->vde_off <= off && item_end >= map_end) {
245 			*segidp = de->vde_segid;
246 			*map_offp = off - de->vde_off;
247 			return (B_TRUE);
248 		}
249 	}
250 	return (B_FALSE);
251 }
252 
253 static void
254 vmmdev_devmem_purge(vmm_softc_t *sc)
255 {
256 	vmm_devmem_entry_t *entry;
257 
258 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
259 		kmem_free(entry, sizeof (*entry));
260 	}
261 }
262 
263 static int
264 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
265 {
266 	int error;
267 	bool sysmem = true;
268 
269 	if (VM_MEMSEG_NAME(mseg)) {
270 		sysmem = false;
271 	}
272 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
273 
274 	if (error == 0) {
275 		/*
276 		 * Rather than create a whole fresh device from which userspace
277 		 * can mmap this segment, instead make it available at an
278 		 * offset above where the main guest memory resides.
279 		 */
280 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
281 		if (error != 0) {
282 			vm_free_memseg(sc->vmm_vm, mseg->segid);
283 		}
284 	}
285 	return (error);
286 }
287 
288 /*
289  * Resource Locking and Exclusion
290  *
291  * Much of bhyve depends on key portions of VM state, such as the guest memory
292  * map, to remain unchanged while the guest is running.  As ported from
293  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
294  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
295  * performing the work of actually running the guest in VMX/SVM, would lock
296  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
297  * state, all of the vCPUs would be first locked, ensuring that the
298  * operation(s) could complete without any other threads stumbling into
299  * intermediate states.
300  *
301  * This approach is largely effective for bhyve.  Common operations, such as
302  * running the vCPUs, steer clear of lock contention.  The model begins to
303  * break down for operations which do not occur in the context of a specific
304  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
305  * thread in the bhyve process.  In order to properly protect those vCPU-less
306  * operations from encountering invalid states, additional locking is required.
307  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
308  * It does mean that such operations will be serialized on locking that
309  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
310  * undue contention on the VM_MAXCPU-1 vCPU.
311  *
312  * In order to address the shortcomings of this model, the concept of a
313  * read/write lock has been added to bhyve.  Operations which change
314  * fundamental aspects of a VM (such as the memory map) must acquire the write
315  * lock, which also implies locking all of the vCPUs and waiting for all read
316  * lock holders to release.  While it increases the cost and waiting time for
317  * those few operations, it allows most hot-path operations on the VM (which
318  * depend on its configuration remaining stable) to occur with minimal locking.
319  *
320  * Consumers of the Driver API (see below) are a special case when it comes to
321  * this locking, since they may hold a read lock via the drv_lease mechanism
322  * for an extended period of time.  Rather than forcing those consumers to
323  * continuously poll for a write lock attempt, the lease system forces them to
324  * provide a release callback to trigger their clean-up (and potential later
325  * reacquisition) of the read lock.
326  */
327 
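/*
 * For illustration (consumer code lives outside this file): a vmm_drv
 * consumer such as viona, having already obtained a hold, signs a lease and
 * supplies the callback by which it is told to release its read lock:
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		...note the expiration and arrange to stop using the lease...
 *	}
 *
 *	lease = vmm_drv_lease_sign(hold, example_lease_expired, arg);
 *
 * The 'example_' names are hypothetical; the signature matches
 * vmm_drv_lease_sign() below.
 */
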
328 static void
329 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
330 {
331 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
332 
333 	/*
334 	 * Since this state transition uses from_idle=true, it should not
335 	 * fail, but rather block until it can succeed.
336 	 */
337 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
338 }
339 
340 static void
341 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
342 {
343 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
344 
345 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
346 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
347 }
348 
349 static void
350 vmm_read_lock(vmm_softc_t *sc)
351 {
352 	rw_enter(&sc->vmm_rwlock, RW_READER);
353 }
354 
355 static void
356 vmm_read_unlock(vmm_softc_t *sc)
357 {
358 	rw_exit(&sc->vmm_rwlock);
359 }
360 
361 static void
362 vmm_write_lock(vmm_softc_t *sc)
363 {
364 	int maxcpus;
365 
366 	/* First lock all the vCPUs */
367 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
368 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
369 		vcpu_lock_one(sc, vcpu);
370 	}
371 
372 	/*
373 	 * Block vmm_drv leases from being acquired or held while the VM write
374 	 * lock is held.
375 	 */
376 	vmm_lease_block(sc);
377 
378 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
379 	/*
380 	 * For now, the 'maxcpus' value for an instance is fixed at the
381 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
382 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
383 	 * of the write lock will need to be wary of such changes.
384 	 */
385 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
386 }
387 
388 static void
389 vmm_write_unlock(vmm_softc_t *sc)
390 {
391 	int maxcpus;
392 
393 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
394 	vmm_lease_unblock(sc);
395 
396 	/*
397 	 * The VM write lock _must_ be released from the same thread it was
398 	 * acquired in, unlike the read lock.
399 	 */
400 	VERIFY(rw_write_held(&sc->vmm_rwlock));
401 	rw_exit(&sc->vmm_rwlock);
402 
403 	/* Unlock all the vCPUs */
404 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
405 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
406 		vcpu_unlock_one(sc, vcpu);
407 	}
408 }
409 
410 static int
411 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
412     cred_t *credp, int *rvalp)
413 {
414 	int error = 0, vcpu = -1;
415 	void *datap = (void *)arg;
416 	enum vm_lock_type {
417 		LOCK_NONE = 0,
418 		LOCK_VCPU,
419 		LOCK_READ_HOLD,
420 		LOCK_WRITE_HOLD
421 	} lock_type = LOCK_NONE;
422 
423 	/* Acquire any exclusion resources needed for the operation. */
424 	switch (cmd) {
425 	case VM_RUN:
426 	case VM_GET_REGISTER:
427 	case VM_SET_REGISTER:
428 	case VM_GET_SEGMENT_DESCRIPTOR:
429 	case VM_SET_SEGMENT_DESCRIPTOR:
430 	case VM_GET_REGISTER_SET:
431 	case VM_SET_REGISTER_SET:
432 	case VM_INJECT_EXCEPTION:
433 	case VM_GET_CAPABILITY:
434 	case VM_SET_CAPABILITY:
435 	case VM_PPTDEV_MSI:
436 	case VM_PPTDEV_MSIX:
437 	case VM_SET_X2APIC_STATE:
438 	case VM_GLA2GPA:
439 	case VM_GLA2GPA_NOFAULT:
440 	case VM_ACTIVATE_CPU:
441 	case VM_SET_INTINFO:
442 	case VM_GET_INTINFO:
443 	case VM_RESTART_INSTRUCTION:
444 	case VM_SET_KERNEMU_DEV:
445 	case VM_GET_KERNEMU_DEV:
446 	case VM_RESET_CPU:
447 	case VM_GET_RUN_STATE:
448 	case VM_SET_RUN_STATE:
449 	case VM_GET_FPU:
450 	case VM_SET_FPU:
451 	case VM_GET_CPUID:
452 	case VM_SET_CPUID:
453 	case VM_LEGACY_CPUID:
454 		/*
455 		 * Copy in the ID of the vCPU chosen for this operation.
456 		 * Since a nefarious caller could update their struct between
457 		 * this locking and when the rest of the ioctl data is copied
458 		 * in, it is _critical_ that this local 'vcpu' variable be used
459 		 * rather than the in-struct one when performing the ioctl.
460 		 */
461 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
462 			return (EFAULT);
463 		}
464 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
465 			return (EINVAL);
466 		}
467 		vcpu_lock_one(sc, vcpu);
468 		lock_type = LOCK_VCPU;
469 		break;
470 
471 	case VM_REINIT:
472 	case VM_BIND_PPTDEV:
473 	case VM_UNBIND_PPTDEV:
474 	case VM_MAP_PPTDEV_MMIO:
475 	case VM_UNMAP_PPTDEV_MMIO:
476 	case VM_ALLOC_MEMSEG:
477 	case VM_MMAP_MEMSEG:
478 	case VM_MUNMAP_MEMSEG:
479 	case VM_WRLOCK_CYCLE:
480 	case VM_PMTMR_LOCATE:
481 		vmm_write_lock(sc);
482 		lock_type = LOCK_WRITE_HOLD;
483 		break;
484 
485 	case VM_GET_MEMSEG:
486 	case VM_MMAP_GETNEXT:
487 	case VM_LAPIC_IRQ:
488 	case VM_INJECT_NMI:
489 	case VM_IOAPIC_ASSERT_IRQ:
490 	case VM_IOAPIC_DEASSERT_IRQ:
491 	case VM_IOAPIC_PULSE_IRQ:
492 	case VM_LAPIC_MSI:
493 	case VM_LAPIC_LOCAL_IRQ:
494 	case VM_GET_X2APIC_STATE:
495 	case VM_RTC_READ:
496 	case VM_RTC_WRITE:
497 	case VM_RTC_SETTIME:
498 	case VM_RTC_GETTIME:
499 	case VM_PPTDEV_DISABLE_MSIX:
500 	case VM_DEVMEM_GETOFFSET:
501 	case VM_TRACK_DIRTY_PAGES:
502 		vmm_read_lock(sc);
503 		lock_type = LOCK_READ_HOLD;
504 		break;
505 
506 	case VM_DATA_READ:
507 	case VM_DATA_WRITE:
508 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
509 			return (EFAULT);
510 		}
511 		if (vcpu == -1) {
512 			/* Access data for VM-wide devices */
513 			vmm_write_lock(sc);
514 			lock_type = LOCK_WRITE_HOLD;
515 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
516 			/* Access data associated with a specific vCPU */
517 			vcpu_lock_one(sc, vcpu);
518 			lock_type = LOCK_VCPU;
519 		} else {
520 			return (EINVAL);
521 		}
522 		break;
523 
524 	case VM_GET_GPA_PMAP:
525 	case VM_IOAPIC_PINCOUNT:
526 	case VM_SUSPEND:
527 	case VM_DESC_FPU_AREA:
528 	case VM_SET_AUTODESTRUCT:
529 	default:
530 		break;
531 	}
532 
533 	/* Execute the primary logic for the ioctl. */
534 	switch (cmd) {
535 	case VM_RUN: {
536 		struct vm_entry entry;
537 
538 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
539 			error = EFAULT;
540 			break;
541 		}
542 
543 		if (!(curthread->t_schedflag & TS_VCPU))
544 			smt_mark_as_vcpu();
545 
546 		error = vm_run(sc->vmm_vm, vcpu, &entry);
547 
548 		/*
549 		 * Unexpected states in vm_run() are expressed through positive
550 		 * errno-oriented return values.  VM states which expect further
551 		 * processing in userspace (necessary context via exitinfo) are
552 		 * expressed through negative return values.  For the time being
553 		 * a return value of 0 is not expected from vm_run().
554 		 */
555 		ASSERT(error != 0);
556 		if (error < 0) {
557 			const struct vm_exit *vme;
558 			void *outp = entry.exit_data;
559 
560 			error = 0;
561 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
562 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
563 				error = EFAULT;
564 			}
565 		}
566 		break;
567 	}
568 	case VM_SUSPEND: {
569 		struct vm_suspend vmsuspend;
570 
571 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
572 			error = EFAULT;
573 			break;
574 		}
575 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
576 		break;
577 	}
578 	case VM_REINIT: {
579 		struct vm_reinit reinit;
580 
581 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
582 			error = EFAULT;
583 			break;
584 		}
585 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
586 			/*
587 			 * The VM instance should be free of driver-attached
588 			 * hooks during the reinitialization process.
589 			 */
590 			break;
591 		}
592 		error = vm_reinit(sc->vmm_vm, reinit.flags);
593 		(void) vmm_drv_block_hook(sc, B_FALSE);
594 		break;
595 	}
596 	case VM_STAT_DESC: {
597 		struct vm_stat_desc statdesc;
598 
599 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
600 			error = EFAULT;
601 			break;
602 		}
603 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
604 		    sizeof (statdesc.desc));
605 		if (error == 0 &&
606 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
607 			error = EFAULT;
608 			break;
609 		}
610 		break;
611 	}
612 	case VM_STATS_IOC: {
613 		struct vm_stats vmstats;
614 
615 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
616 			error = EFAULT;
617 			break;
618 		}
619 		hrt2tv(gethrtime(), &vmstats.tv);
620 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
621 		    nitems(vmstats.statbuf),
622 		    &vmstats.num_entries, vmstats.statbuf);
623 		if (error == 0 &&
624 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
625 			error = EFAULT;
626 			break;
627 		}
628 		break;
629 	}
630 
631 	case VM_PPTDEV_MSI: {
632 		struct vm_pptdev_msi pptmsi;
633 
634 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
635 			error = EFAULT;
636 			break;
637 		}
638 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
639 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
640 		break;
641 	}
642 	case VM_PPTDEV_MSIX: {
643 		struct vm_pptdev_msix pptmsix;
644 
645 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
646 			error = EFAULT;
647 			break;
648 		}
649 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
650 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
651 		    pptmsix.vector_control);
652 		break;
653 	}
654 	case VM_PPTDEV_DISABLE_MSIX: {
655 		struct vm_pptdev pptdev;
656 
657 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
658 			error = EFAULT;
659 			break;
660 		}
661 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
662 		break;
663 	}
664 	case VM_MAP_PPTDEV_MMIO: {
665 		struct vm_pptdev_mmio pptmmio;
666 
667 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
668 			error = EFAULT;
669 			break;
670 		}
671 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
672 		    pptmmio.len, pptmmio.hpa);
673 		break;
674 	}
675 	case VM_UNMAP_PPTDEV_MMIO: {
676 		struct vm_pptdev_mmio pptmmio;
677 
678 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
679 			error = EFAULT;
680 			break;
681 		}
682 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
683 		    pptmmio.len);
684 		break;
685 	}
686 	case VM_BIND_PPTDEV: {
687 		struct vm_pptdev pptdev;
688 
689 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
690 			error = EFAULT;
691 			break;
692 		}
693 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
694 		break;
695 	}
696 	case VM_UNBIND_PPTDEV: {
697 		struct vm_pptdev pptdev;
698 
699 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
700 			error = EFAULT;
701 			break;
702 		}
703 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
704 		break;
705 	}
706 	case VM_GET_PPTDEV_LIMITS: {
707 		struct vm_pptdev_limits pptlimits;
708 
709 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
710 			error = EFAULT;
711 			break;
712 		}
713 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
714 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
715 		if (error == 0 &&
716 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
717 			error = EFAULT;
718 			break;
719 		}
720 		break;
721 	}
722 	case VM_INJECT_EXCEPTION: {
723 		struct vm_exception vmexc;
724 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
725 			error = EFAULT;
726 			break;
727 		}
728 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
729 		    vmexc.error_code_valid != 0, vmexc.error_code,
730 		    vmexc.restart_instruction != 0);
731 		break;
732 	}
733 	case VM_INJECT_NMI: {
734 		struct vm_nmi vmnmi;
735 
736 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
737 			error = EFAULT;
738 			break;
739 		}
740 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
741 		break;
742 	}
743 	case VM_LAPIC_IRQ: {
744 		struct vm_lapic_irq vmirq;
745 
746 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
747 			error = EFAULT;
748 			break;
749 		}
750 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
751 		break;
752 	}
753 	case VM_LAPIC_LOCAL_IRQ: {
754 		struct vm_lapic_irq vmirq;
755 
756 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
757 			error = EFAULT;
758 			break;
759 		}
760 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
761 		    vmirq.vector);
762 		break;
763 	}
764 	case VM_LAPIC_MSI: {
765 		struct vm_lapic_msi vmmsi;
766 
767 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
768 			error = EFAULT;
769 			break;
770 		}
771 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
772 		break;
773 	}
774 
775 	case VM_IOAPIC_ASSERT_IRQ: {
776 		struct vm_ioapic_irq ioapic_irq;
777 
778 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
779 			error = EFAULT;
780 			break;
781 		}
782 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
783 		break;
784 	}
785 	case VM_IOAPIC_DEASSERT_IRQ: {
786 		struct vm_ioapic_irq ioapic_irq;
787 
788 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
789 			error = EFAULT;
790 			break;
791 		}
792 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
793 		break;
794 	}
795 	case VM_IOAPIC_PULSE_IRQ: {
796 		struct vm_ioapic_irq ioapic_irq;
797 
798 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
799 			error = EFAULT;
800 			break;
801 		}
802 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
803 		break;
804 	}
805 	case VM_IOAPIC_PINCOUNT: {
806 		int pincount;
807 
808 		pincount = vioapic_pincount(sc->vmm_vm);
809 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
810 			error = EFAULT;
811 			break;
812 		}
813 		break;
814 	}
815 	case VM_DESC_FPU_AREA: {
816 		struct vm_fpu_desc desc;
817 		void *buf = NULL;
818 
819 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
820 			error = EFAULT;
821 			break;
822 		}
823 		if (desc.vfd_num_entries > 64) {
824 			error = EINVAL;
825 			break;
826 		}
827 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
828 		    desc.vfd_num_entries;
829 		if (buf_sz != 0) {
830 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
831 		}
832 
833 		/*
834 		 * For now, we are depending on vm_fpu_desc_entry and
835 		 * hma_xsave_state_desc_t having the same format.
836 		 */
837 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
838 		    sizeof (hma_xsave_state_desc_t));
839 
840 		size_t req_size;
841 		const uint_t max_entries = hma_fpu_describe_xsave_state(
842 		    (hma_xsave_state_desc_t *)buf,
843 		    desc.vfd_num_entries,
844 		    &req_size);
845 
846 		desc.vfd_req_size = req_size;
847 		desc.vfd_num_entries = max_entries;
848 		if (buf_sz != 0) {
849 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
850 				error = EFAULT;
851 			}
852 			kmem_free(buf, buf_sz);
853 		}
854 
855 		if (error == 0) {
856 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
857 				error = EFAULT;
858 			}
859 		}
860 		break;
861 	}
862 	case VM_SET_AUTODESTRUCT: {
863 		/*
864 		 * Since this has to do with controlling the lifetime of the
865 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
866 		 * than the vcpu-centric or rwlock exclusion mechanisms.
867 		 */
868 		mutex_enter(&vmm_mtx);
869 		sc->vmm_autodestruct = (arg != 0);
870 		mutex_exit(&vmm_mtx);
871 		break;
872 	}
873 
874 	case VM_ISA_ASSERT_IRQ: {
875 		struct vm_isa_irq isa_irq;
876 
877 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
878 			error = EFAULT;
879 			break;
880 		}
881 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
882 		if (error == 0 && isa_irq.ioapic_irq != -1) {
883 			error = vioapic_assert_irq(sc->vmm_vm,
884 			    isa_irq.ioapic_irq);
885 		}
886 		break;
887 	}
888 	case VM_ISA_DEASSERT_IRQ: {
889 		struct vm_isa_irq isa_irq;
890 
891 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
892 			error = EFAULT;
893 			break;
894 		}
895 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
896 		if (error == 0 && isa_irq.ioapic_irq != -1) {
897 			error = vioapic_deassert_irq(sc->vmm_vm,
898 			    isa_irq.ioapic_irq);
899 		}
900 		break;
901 	}
902 	case VM_ISA_PULSE_IRQ: {
903 		struct vm_isa_irq isa_irq;
904 
905 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
906 			error = EFAULT;
907 			break;
908 		}
909 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
910 		if (error == 0 && isa_irq.ioapic_irq != -1) {
911 			error = vioapic_pulse_irq(sc->vmm_vm,
912 			    isa_irq.ioapic_irq);
913 		}
914 		break;
915 	}
916 	case VM_ISA_SET_IRQ_TRIGGER: {
917 		struct vm_isa_irq_trigger isa_irq_trigger;
918 
919 		if (ddi_copyin(datap, &isa_irq_trigger,
920 		    sizeof (isa_irq_trigger), md)) {
921 			error = EFAULT;
922 			break;
923 		}
924 		error = vatpic_set_irq_trigger(sc->vmm_vm,
925 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
926 		break;
927 	}
928 
929 	case VM_MMAP_GETNEXT: {
930 		struct vm_memmap mm;
931 
932 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
933 			error = EFAULT;
934 			break;
935 		}
936 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
937 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
938 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
939 			error = EFAULT;
940 			break;
941 		}
942 		break;
943 	}
944 	case VM_MMAP_MEMSEG: {
945 		struct vm_memmap mm;
946 
947 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
948 			error = EFAULT;
949 			break;
950 		}
951 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
952 		    mm.len, mm.prot, mm.flags);
953 		break;
954 	}
955 	case VM_MUNMAP_MEMSEG: {
956 		struct vm_munmap mu;
957 
958 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
959 			error = EFAULT;
960 			break;
961 		}
962 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
963 		break;
964 	}
965 	case VM_ALLOC_MEMSEG: {
966 		struct vm_memseg vmseg;
967 
968 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
969 			error = EFAULT;
970 			break;
971 		}
972 		error = vmmdev_alloc_memseg(sc, &vmseg);
973 		break;
974 	}
975 	case VM_GET_MEMSEG: {
976 		struct vm_memseg vmseg;
977 
978 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
979 			error = EFAULT;
980 			break;
981 		}
982 		error = vmmdev_get_memseg(sc, &vmseg);
983 		if (error == 0 &&
984 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
985 			error = EFAULT;
986 			break;
987 		}
988 		break;
989 	}
990 	case VM_GET_REGISTER: {
991 		struct vm_register vmreg;
992 
993 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
994 			error = EFAULT;
995 			break;
996 		}
997 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
998 		    &vmreg.regval);
999 		if (error == 0 &&
1000 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1001 			error = EFAULT;
1002 			break;
1003 		}
1004 		break;
1005 	}
1006 	case VM_SET_REGISTER: {
1007 		struct vm_register vmreg;
1008 
1009 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1010 			error = EFAULT;
1011 			break;
1012 		}
1013 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1014 		    vmreg.regval);
1015 		break;
1016 	}
1017 	case VM_SET_SEGMENT_DESCRIPTOR: {
1018 		struct vm_seg_desc vmsegd;
1019 
1020 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1021 			error = EFAULT;
1022 			break;
1023 		}
1024 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1025 		    &vmsegd.desc);
1026 		break;
1027 	}
1028 	case VM_GET_SEGMENT_DESCRIPTOR: {
1029 		struct vm_seg_desc vmsegd;
1030 
1031 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1032 			error = EFAULT;
1033 			break;
1034 		}
1035 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1036 		    &vmsegd.desc);
1037 		if (error == 0 &&
1038 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1039 			error = EFAULT;
1040 			break;
1041 		}
1042 		break;
1043 	}
1044 	case VM_GET_REGISTER_SET: {
1045 		struct vm_register_set vrs;
1046 		int regnums[VM_REG_LAST];
1047 		uint64_t regvals[VM_REG_LAST];
1048 
1049 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1050 			error = EFAULT;
1051 			break;
1052 		}
1053 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1054 			error = EINVAL;
1055 			break;
1056 		}
1057 		if (ddi_copyin(vrs.regnums, regnums,
1058 		    sizeof (int) * vrs.count, md)) {
1059 			error = EFAULT;
1060 			break;
1061 		}
1062 
1063 		error = 0;
1064 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1065 			if (regnums[i] < 0) {
1066 				error = EINVAL;
1067 				break;
1068 			}
1069 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1070 			    &regvals[i]);
1071 		}
1072 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1073 		    sizeof (uint64_t) * vrs.count, md)) {
1074 			error = EFAULT;
1075 		}
1076 		break;
1077 	}
1078 	case VM_SET_REGISTER_SET: {
1079 		struct vm_register_set vrs;
1080 		int regnums[VM_REG_LAST];
1081 		uint64_t regvals[VM_REG_LAST];
1082 
1083 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1084 			error = EFAULT;
1085 			break;
1086 		}
1087 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1088 			error = EINVAL;
1089 			break;
1090 		}
1091 		if (ddi_copyin(vrs.regnums, regnums,
1092 		    sizeof (int) * vrs.count, md)) {
1093 			error = EFAULT;
1094 			break;
1095 		}
1096 		if (ddi_copyin(vrs.regvals, regvals,
1097 		    sizeof (uint64_t) * vrs.count, md)) {
1098 			error = EFAULT;
1099 			break;
1100 		}
1101 
1102 		error = 0;
1103 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1104 			/*
1105 			 * Setting registers in a set is not atomic, since a
1106 			 * failure in the middle of the set will cause a
1107 			 * bail-out and inconsistent register state.  Callers
1108 			 * should be wary of this.
1109 			 */
1110 			if (regnums[i] < 0) {
1111 				error = EINVAL;
1112 				break;
1113 			}
1114 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1115 			    regvals[i]);
1116 		}
1117 		break;
1118 	}
1119 	case VM_RESET_CPU: {
1120 		struct vm_vcpu_reset vvr;
1121 
1122 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1123 			error = EFAULT;
1124 			break;
1125 		}
1126 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1127 			error = EINVAL;
1128 			break;
1129 		}
1130 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1131 		break;
1132 	}
1133 	case VM_GET_RUN_STATE: {
1134 		struct vm_run_state vrs;
1135 
1136 		bzero(&vrs, sizeof (vrs));
1137 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1138 		    &vrs.sipi_vector);
1139 		if (error == 0) {
1140 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1141 				error = EFAULT;
1142 				break;
1143 			}
1144 		}
1145 		break;
1146 	}
1147 	case VM_SET_RUN_STATE: {
1148 		struct vm_run_state vrs;
1149 
1150 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1151 			error = EFAULT;
1152 			break;
1153 		}
1154 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1155 		    vrs.sipi_vector);
1156 		break;
1157 	}
1158 	case VM_GET_FPU: {
1159 		struct vm_fpu_state req;
1160 		const size_t max_len = (PAGESIZE * 2);
1161 		void *kbuf;
1162 
1163 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1164 			error = EFAULT;
1165 			break;
1166 		}
1167 		if (req.len > max_len || req.len == 0) {
1168 			error = EINVAL;
1169 			break;
1170 		}
1171 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1172 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1173 		if (error == 0) {
1174 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1175 				error = EFAULT;
1176 			}
1177 		}
1178 		kmem_free(kbuf, req.len);
1179 		break;
1180 	}
1181 	case VM_SET_FPU: {
1182 		struct vm_fpu_state req;
1183 		const size_t max_len = (PAGESIZE * 2);
1184 		void *kbuf;
1185 
1186 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1187 			error = EFAULT;
1188 			break;
1189 		}
1190 		if (req.len > max_len || req.len == 0) {
1191 			error = EINVAL;
1192 			break;
1193 		}
1194 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1195 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1196 			error = EFAULT;
1197 		} else {
1198 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1199 		}
1200 		kmem_free(kbuf, req.len);
1201 		break;
1202 	}
1203 	case VM_GET_CPUID: {
1204 		struct vm_vcpu_cpuid_config cfg;
1205 		struct vcpu_cpuid_entry *entries = NULL;
1206 
1207 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1208 			error = EFAULT;
1209 			break;
1210 		}
1211 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1212 			error = EINVAL;
1213 			break;
1214 		}
1215 
1216 		const size_t entries_size =
1217 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1218 		if (entries_size != 0) {
1219 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1220 		}
1221 
1222 		vcpu_cpuid_config_t vm_cfg = {
1223 			.vcc_nent = cfg.vvcc_nent,
1224 			.vcc_entries = entries,
1225 		};
1226 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1227 
1228 		/*
1229 		 * Only attempt to copy out the resultant entries if we were
1230 		 * able to query them from the instance.  The flags and number
1231 		 * of entries are emitted regardless.
1232 		 */
1233 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1234 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1235 		if (entries != NULL) {
1236 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1237 			    entries_size, md) != 0) {
1238 				error = EFAULT;
1239 			}
1240 
1241 			kmem_free(entries, entries_size);
1242 		}
1243 
1244 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1245 			error = EFAULT;
1246 		}
1247 		break;
1248 	}
1249 	case VM_SET_CPUID: {
1250 		struct vm_vcpu_cpuid_config cfg;
1251 		struct vcpu_cpuid_entry *entries = NULL;
1252 		size_t entries_size = 0;
1253 
1254 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1255 			error = EFAULT;
1256 			break;
1257 		}
1258 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1259 			error = EFBIG;
1260 			break;
1261 		}
1262 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1263 			/*
1264 			 * If we are being instructed to use "legacy" handling,
1265 			 * then no entries should be provided, since the static
1266 			 * in-kernel masking will be used.
1267 			 */
1268 			if (cfg.vvcc_nent != 0) {
1269 				error = EINVAL;
1270 				break;
1271 			}
1272 		} else if (cfg.vvcc_nent != 0) {
1273 			entries_size =
1274 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1275 			entries = kmem_alloc(entries_size, KM_SLEEP);
1276 
1277 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1278 			    md) != 0) {
1279 				error = EFAULT;
1280 				kmem_free(entries, entries_size);
1281 				break;
1282 			}
1283 		}
1284 
1285 		vcpu_cpuid_config_t vm_cfg = {
1286 			.vcc_flags = cfg.vvcc_flags,
1287 			.vcc_nent = cfg.vvcc_nent,
1288 			.vcc_entries = entries,
1289 		};
1290 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1291 
1292 		if (entries != NULL) {
1293 			kmem_free(entries, entries_size);
1294 		}
1295 		break;
1296 	}
1297 	case VM_LEGACY_CPUID: {
1298 		struct vm_legacy_cpuid vlc;
1299 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1300 			error = EFAULT;
1301 			break;
1302 		}
1303 		vlc.vlc_vcpuid = vcpu;
1304 
1305 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1306 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1307 
1308 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1309 			error = EFAULT;
1310 			break;
1311 		}
1312 		break;
1313 	}
1314 
1315 	case VM_SET_KERNEMU_DEV:
1316 	case VM_GET_KERNEMU_DEV: {
1317 		struct vm_readwrite_kernemu_device kemu;
1318 		size_t size = 0;
1319 
1320 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1321 			error = EFAULT;
1322 			break;
1323 		}
1324 
1325 		if (kemu.access_width > 3) {
1326 			error = EINVAL;
1327 			break;
1328 		}
1329 		size = (1 << kemu.access_width);
1330 		ASSERT(size >= 1 && size <= 8);
1331 
1332 		if (cmd == VM_SET_KERNEMU_DEV) {
1333 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1334 			    kemu.gpa, kemu.value, size);
1335 		} else {
1336 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1337 			    kemu.gpa, &kemu.value, size);
1338 		}
1339 
1340 		if (error == 0) {
1341 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1342 				error = EFAULT;
1343 				break;
1344 			}
1345 		}
1346 		break;
1347 	}
1348 
1349 	case VM_GET_CAPABILITY: {
1350 		struct vm_capability vmcap;
1351 
1352 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1353 			error = EFAULT;
1354 			break;
1355 		}
1356 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1357 		    &vmcap.capval);
1358 		if (error == 0 &&
1359 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1360 			error = EFAULT;
1361 			break;
1362 		}
1363 		break;
1364 	}
1365 	case VM_SET_CAPABILITY: {
1366 		struct vm_capability vmcap;
1367 
1368 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1369 			error = EFAULT;
1370 			break;
1371 		}
1372 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1373 		    vmcap.capval);
1374 		break;
1375 	}
1376 	case VM_SET_X2APIC_STATE: {
1377 		struct vm_x2apic x2apic;
1378 
1379 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1380 			error = EFAULT;
1381 			break;
1382 		}
1383 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1384 		break;
1385 	}
1386 	case VM_GET_X2APIC_STATE: {
1387 		struct vm_x2apic x2apic;
1388 
1389 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1390 			error = EFAULT;
1391 			break;
1392 		}
1393 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1394 		    &x2apic.state);
1395 		if (error == 0 &&
1396 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1397 			error = EFAULT;
1398 			break;
1399 		}
1400 		break;
1401 	}
1402 	case VM_GET_GPA_PMAP: {
1403 		/*
1404 		 * Until there is a necessity to leak EPT/RVI PTE values to
1405 		 * userspace, this will remain unimplemented.
1406 		 */
1407 		error = EINVAL;
1408 		break;
1409 	}
1410 	case VM_GET_HPET_CAPABILITIES: {
1411 		struct vm_hpet_cap hpetcap;
1412 
1413 		error = vhpet_getcap(&hpetcap);
1414 		if (error == 0 &&
1415 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1416 			error = EFAULT;
1417 			break;
1418 		}
1419 		break;
1420 	}
1421 	case VM_GLA2GPA: {
1422 		struct vm_gla2gpa gg;
1423 
1424 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1425 			error = EFAULT;
1426 			break;
1427 		}
1428 		gg.vcpuid = vcpu;
1429 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1430 		    gg.prot, &gg.gpa, &gg.fault);
1431 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1432 			error = EFAULT;
1433 			break;
1434 		}
1435 		break;
1436 	}
1437 	case VM_GLA2GPA_NOFAULT: {
1438 		struct vm_gla2gpa gg;
1439 
1440 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1441 			error = EFAULT;
1442 			break;
1443 		}
1444 		gg.vcpuid = vcpu;
1445 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1446 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1447 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1448 			error = EFAULT;
1449 			break;
1450 		}
1451 		break;
1452 	}
1453 
1454 	case VM_ACTIVATE_CPU:
1455 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1456 		break;
1457 
1458 	case VM_SUSPEND_CPU:
1459 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1460 			error = EFAULT;
1461 		} else {
1462 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1463 		}
1464 		break;
1465 
1466 	case VM_RESUME_CPU:
1467 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1468 			error = EFAULT;
1469 		} else {
1470 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1471 		}
1472 		break;
1473 
1474 	case VM_GET_CPUS: {
1475 		struct vm_cpuset vm_cpuset;
1476 		cpuset_t tempset;
1477 		void *srcp = &tempset;
1478 		int size;
1479 
1480 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1481 			error = EFAULT;
1482 			break;
1483 		}
1484 
1485 		/* Be more generous about sizing since our cpuset_t is large. */
1486 		size = vm_cpuset.cpusetsize;
1487 		if (size <= 0 || size > sizeof (cpuset_t)) {
1488 			error = ERANGE;
1489 		}
1490 		/*
1491 		 * If they want a ulong_t or less, make sure they receive the
1492 		 * low bits with all the useful information.
1493 		 */
1494 		if (size <= sizeof (tempset.cpub[0])) {
1495 			srcp = &tempset.cpub[0];
1496 		}
1497 
1498 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1499 			tempset = vm_active_cpus(sc->vmm_vm);
1500 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1501 			tempset = vm_suspended_cpus(sc->vmm_vm);
1502 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1503 			tempset = vm_debug_cpus(sc->vmm_vm);
1504 		} else {
1505 			error = EINVAL;
1506 		}
1507 
1508 		ASSERT(size > 0 && size <= sizeof (tempset));
1509 		if (error == 0 &&
1510 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1511 			error = EFAULT;
1512 			break;
1513 		}
1514 		break;
1515 	}
1516 	case VM_SET_INTINFO: {
1517 		struct vm_intinfo vmii;
1518 
1519 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1520 			error = EFAULT;
1521 			break;
1522 		}
1523 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1524 		break;
1525 	}
1526 	case VM_GET_INTINFO: {
1527 		struct vm_intinfo vmii;
1528 
1529 		vmii.vcpuid = vcpu;
1530 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1531 		    &vmii.info2);
1532 		if (error == 0 &&
1533 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1534 			error = EFAULT;
1535 			break;
1536 		}
1537 		break;
1538 	}
1539 	case VM_RTC_WRITE: {
1540 		struct vm_rtc_data rtcdata;
1541 
1542 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1543 			error = EFAULT;
1544 			break;
1545 		}
1546 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1547 		    rtcdata.value);
1548 		break;
1549 	}
1550 	case VM_RTC_READ: {
1551 		struct vm_rtc_data rtcdata;
1552 
1553 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1554 			error = EFAULT;
1555 			break;
1556 		}
1557 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1558 		    &rtcdata.value);
1559 		if (error == 0 &&
1560 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1561 			error = EFAULT;
1562 			break;
1563 		}
1564 		break;
1565 	}
1566 	case VM_RTC_SETTIME: {
1567 		struct vm_rtc_time rtctime;
1568 
1569 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1570 			error = EFAULT;
1571 			break;
1572 		}
1573 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1574 		break;
1575 	}
1576 	case VM_RTC_GETTIME: {
1577 		struct vm_rtc_time rtctime;
1578 
1579 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1580 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1581 			error = EFAULT;
1582 			break;
1583 		}
1584 		break;
1585 	}
1586 
1587 	case VM_PMTMR_LOCATE: {
1588 		uint16_t port = arg;
1589 		error = vpmtmr_set_location(sc->vmm_vm, port);
1590 		break;
1591 	}
1592 
1593 	case VM_RESTART_INSTRUCTION:
1594 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1595 		break;
1596 
1597 	case VM_SET_TOPOLOGY: {
1598 		struct vm_cpu_topology topo;
1599 
1600 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1601 			error = EFAULT;
1602 			break;
1603 		}
1604 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1605 		    topo.threads, topo.maxcpus);
1606 		break;
1607 	}
1608 	case VM_GET_TOPOLOGY: {
1609 		struct vm_cpu_topology topo;
1610 
1611 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1612 		    &topo.threads, &topo.maxcpus);
1613 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1614 			error = EFAULT;
1615 			break;
1616 		}
1617 		break;
1618 	}
1619 	case VM_DEVMEM_GETOFFSET: {
1620 		struct vm_devmem_offset vdo;
1621 		vmm_devmem_entry_t *de;
1622 
1623 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1624 			error = EFAULT;
1625 			break;
1626 		}
1627 
1628 		de = vmmdev_devmem_find(sc, vdo.segid);
1629 		if (de != NULL) {
1630 			vdo.offset = de->vde_off;
1631 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1632 				error = EFAULT;
1633 			}
1634 		} else {
1635 			error = ENOENT;
1636 		}
1637 		break;
1638 	}
1639 	case VM_TRACK_DIRTY_PAGES: {
1640 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1641 		struct vmm_dirty_tracker tracker;
1642 		uint8_t *bitmap;
1643 		size_t len;
1644 
1645 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1646 			error = EFAULT;
1647 			break;
1648 		}
1649 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1650 			error = EINVAL;
1651 			break;
1652 		}
1653 		if (tracker.vdt_len == 0) {
1654 			break;
1655 		}
1656 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1657 			error = EINVAL;
1658 			break;
1659 		}
1660 		if (tracker.vdt_len > max_track_region_len) {
1661 			error = EINVAL;
1662 			break;
1663 		}
1664 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1665 		bitmap = kmem_zalloc(len, KM_SLEEP);
1666 		vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1667 		    tracker.vdt_len, bitmap);
1668 		if (ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1669 			error = EFAULT;
1670 		}
1671 		kmem_free(bitmap, len);
1672 
1673 		break;
1674 	}
1675 	case VM_WRLOCK_CYCLE: {
1676 		/*
1677 		 * Present a test mechanism to acquire/release the write lock
1678 		 * on the VM without any other effects.
1679 		 */
1680 		break;
1681 	}
1682 	case VM_DATA_READ: {
1683 		struct vm_data_xfer vdx;
1684 
1685 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1686 			error = EFAULT;
1687 			break;
1688 		}
1689 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1690 			error = EINVAL;
1691 			break;
1692 		}
1693 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1694 			error = EFBIG;
1695 			break;
1696 		}
1697 
1698 		const size_t len = vdx.vdx_len;
1699 		void *buf = NULL;
1700 		if (len != 0) {
1701 			buf = kmem_alloc(len, KM_SLEEP);
1702 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) != 0 &&
1703 			    ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1704 				kmem_free(buf, len);
1705 				error = EFAULT;
1706 				break;
1707 			} else {
1708 				bzero(buf, len);
1709 			}
1710 		}
1711 
1712 		vdx.vdx_result_len = 0;
1713 		vmm_data_req_t req = {
1714 			.vdr_class = vdx.vdx_class,
1715 			.vdr_version = vdx.vdx_version,
1716 			.vdr_flags = vdx.vdx_flags,
1717 			.vdr_len = len,
1718 			.vdr_data = buf,
1719 			.vdr_result_len = &vdx.vdx_result_len,
1720 		};
1721 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1722 
1723 		if (error == 0 && buf != NULL) {
1724 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1725 				error = EFAULT;
1726 			}
1727 		}
1728 
1729 		/*
1730 		 * Copy out the transfer request so that the value of
1731 		 * vdx_result_len can be made available, regardless of any
1732 		 * error(s) which may have occurred.
1733 		 */
1734 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1735 			error = (error != 0) ? error : EFAULT;
1736 		}
1737 
1738 		if (buf != NULL) {
1739 			kmem_free(buf, len);
1740 		}
1741 		break;
1742 	}
1743 	case VM_DATA_WRITE: {
1744 		struct vm_data_xfer vdx;
1745 
1746 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1747 			error = EFAULT;
1748 			break;
1749 		}
1750 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1751 			error = EINVAL;
1752 			break;
1753 		}
1754 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1755 			error = EFBIG;
1756 			break;
1757 		}
1758 
1759 		const size_t len = vdx.vdx_len;
1760 		void *buf = NULL;
1761 		if (len != 0) {
1762 			buf = kmem_alloc(len, KM_SLEEP);
1763 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1764 				kmem_free(buf, len);
1765 				error = EFAULT;
1766 				break;
1767 			}
1768 		}
1769 
1770 		vdx.vdx_result_len = 0;
1771 		vmm_data_req_t req = {
1772 			.vdr_class = vdx.vdx_class,
1773 			.vdr_version = vdx.vdx_version,
1774 			.vdr_flags = vdx.vdx_flags,
1775 			.vdr_len = len,
1776 			.vdr_data = buf,
1777 			.vdr_result_len = &vdx.vdx_result_len,
1778 		};
1779 		if (vmm_allow_state_writes == 0) {
1780 			/* XXX: Play it safe for now */
1781 			error = EPERM;
1782 		} else {
1783 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1784 			    &req);
1785 		}
1786 
1787 		if (error == 0 && buf != NULL &&
1788 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1789 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1790 				error = EFAULT;
1791 			}
1792 		}
1793 
1794 		/*
1795 		 * Copy out the transfer request so that the value of
1796 		 * vdx_result_len can be made available, regardless of any
1797 		 * error(s) which may have occurred.
1798 		 */
1799 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1800 			error = (error != 0) ? error : EFAULT;
1801 		}
1802 
1803 		if (buf != NULL) {
1804 			kmem_free(buf, len);
1805 		}
1806 		break;
1807 	}
1808 
1809 	default:
1810 		error = ENOTTY;
1811 		break;
1812 	}
1813 
1814 	/* Release exclusion resources */
1815 	switch (lock_type) {
1816 	case LOCK_NONE:
1817 		break;
1818 	case LOCK_VCPU:
1819 		vcpu_unlock_one(sc, vcpu);
1820 		break;
1821 	case LOCK_READ_HOLD:
1822 		vmm_read_unlock(sc);
1823 		break;
1824 	case LOCK_WRITE_HOLD:
1825 		vmm_write_unlock(sc);
1826 		break;
1827 	default:
1828 		panic("unexpected lock type");
1829 		break;
1830 	}
1831 
1832 	return (error);
1833 }
1834 
1835 static vmm_softc_t *
1836 vmm_lookup(const char *name)
1837 {
1838 	list_t *vml = &vmm_list;
1839 	vmm_softc_t *sc;
1840 
1841 	ASSERT(MUTEX_HELD(&vmm_mtx));
1842 
1843 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1844 		if (strcmp(sc->vmm_name, name) == 0) {
1845 			break;
1846 		}
1847 	}
1848 
1849 	return (sc);
1850 }
1851 
1852 /*
1853  * Acquire an HMA registration if not already held.
1854  */
1855 static boolean_t
1856 vmm_hma_acquire(void)
1857 {
1858 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1859 
1860 	mutex_enter(&vmmdev_mtx);
1861 
1862 	if (vmmdev_hma_reg == NULL) {
1863 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1864 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1865 		if (vmmdev_hma_reg == NULL) {
1866 			cmn_err(CE_WARN, "%s HMA registration failed.",
1867 			    vmmdev_hvm_name);
1868 			mutex_exit(&vmmdev_mtx);
1869 			return (B_FALSE);
1870 		}
1871 	}
1872 
1873 	vmmdev_hma_ref++;
1874 
1875 	mutex_exit(&vmmdev_mtx);
1876 
1877 	return (B_TRUE);
1878 }
1879 
1880 /*
1881  * Release the HMA registration if held and there are no remaining VMs.
1882  */
1883 static void
1884 vmm_hma_release(void)
1885 {
1886 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1887 
1888 	mutex_enter(&vmmdev_mtx);
1889 
1890 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1891 
1892 	vmmdev_hma_ref--;
1893 
1894 	if (vmmdev_hma_ref == 0) {
1895 		VERIFY(vmmdev_hma_reg != NULL);
1896 		hma_unregister(vmmdev_hma_reg);
1897 		vmmdev_hma_reg = NULL;
1898 	}
1899 	mutex_exit(&vmmdev_mtx);
1900 }
1901 
1902 static int
1903 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1904 {
1905 	vmm_softc_t	*sc = NULL;
1906 	minor_t		minor;
1907 	int		error = ENOMEM;
1908 	size_t		len;
1909 	const char	*name = req->name;
1910 
1911 	len = strnlen(name, VM_MAX_NAMELEN);
1912 	if (len == 0) {
1913 		return (EINVAL);
1914 	}
1915 	if (len >= VM_MAX_NAMELEN) {
1916 		return (ENAMETOOLONG);
1917 	}
1918 	if (strchr(name, '/') != NULL) {
1919 		return (EINVAL);
1920 	}
1921 
1922 	if (!vmm_hma_acquire())
1923 		return (ENXIO);
1924 
1925 	mutex_enter(&vmm_mtx);
1926 
1927 	/* Look for duplicate names */
1928 	if (vmm_lookup(name) != NULL) {
1929 		mutex_exit(&vmm_mtx);
1930 		vmm_hma_release();
1931 		return (EEXIST);
1932 	}
1933 
1934 	/* Allow only one instance per non-global zone. */
1935 	if (!INGLOBALZONE(curproc)) {
1936 		for (sc = list_head(&vmm_list); sc != NULL;
1937 		    sc = list_next(&vmm_list, sc)) {
1938 			if (sc->vmm_zone == curzone) {
1939 				mutex_exit(&vmm_mtx);
1940 				vmm_hma_release();
1941 				return (EINVAL);
1942 			}
1943 		}
1944 	}
1945 
1946 	minor = id_alloc(vmm_minors);
1947 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1948 		goto fail;
1949 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
1950 		ddi_soft_state_free(vmm_statep, minor);
1951 		goto fail;
1952 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
1953 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
1954 		goto fail;
1955 	}
1956 
1957 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
1958 		goto fail;
1959 	}
1960 
1961 	error = vm_create(req->flags, &sc->vmm_vm);
1962 	if (error == 0) {
1963 		/* Complete VM initialization and report success. */
1964 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
1965 		sc->vmm_minor = minor;
1966 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
1967 		    offsetof(vmm_devmem_entry_t, vde_node));
1968 
1969 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
1970 		    offsetof(vmm_hold_t, vmh_node));
1971 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
1972 
1973 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
1974 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
1975 		    offsetof(vmm_lease_t, vml_node));
1976 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
1977 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
1978 
1979 		sc->vmm_zone = crgetzone(cr);
1980 		zone_hold(sc->vmm_zone);
1981 		vmm_zsd_add_vm(sc);
1982 		vmm_kstat_init(sc);
1983 
1984 		list_insert_tail(&vmm_list, sc);
1985 		mutex_exit(&vmm_mtx);
1986 		return (0);
1987 	}
1988 
1989 	vmm_kstat_fini(sc);
1990 	ddi_remove_minor_node(vmmdev_dip, name);
1991 fail:
1992 	id_free(vmm_minors, minor);
1993 	if (sc != NULL) {
1994 		ddi_soft_state_free(vmm_statep, minor);
1995 	}
1996 	mutex_exit(&vmm_mtx);
1997 	vmm_hma_release();
1998 
1999 	return (error);
2000 }
2001 
2002 /*
2003  * Bhyve 'Driver' Interface
2004  *
2005  * While many devices are emulated in the bhyve userspace process, there are
2006  * others with performance constraints which require that they run mostly or
2007  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2008  * needed so they can query/manipulate the portions of VM state needed to
2009  * fulfill their purpose.
2010  *
2011  * This includes:
2012  * - Translating guest-physical addresses to host-virtual pointers
2013  * - Injecting MSIs
2014  * - Hooking IO port addresses
2015  *
2016  * The vmm_drv interface exists to provide that functionality to its consumers.
2017  * (At this time, 'viona' is the only user)
2018  */
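
/*
 * A minimal, illustrative sketch (not part of the driver itself) of how such
 * a consumer might use this interface, assuming it has been handed a file_t
 * for an open vmm instance through its own ioctl.  The expire callback,
 * 'arg', 'gpa', 'fp', and 'credp' below are hypothetical placeholders.
 *
 *	static boolean_t
 *	example_lease_expired(void *arg)
 *	{
 *		... quiesce use of the lease; return B_TRUE only if it can be
 *		    broken synchronously from this context ...
 *	}
 *
 *	vmm_hold_t *hold;
 *	vmm_lease_t *lease;
 *	vmm_page_t *page;
 *
 *	if (vmm_drv_hold(fp, credp, &hold) != 0)
 *		return (ENXIO);
 *	if ((lease = vmm_drv_lease_sign(hold, example_lease_expired,
 *	    arg)) == NULL) {
 *		vmm_drv_rele(hold);
 *		return (EBUSY);
 *	}
 *	page = vmm_drv_page_hold(lease, gpa, PROT_READ | PROT_WRITE);
 *	... access the page via vmm_drv_page_readable()/_writable() ...
 *	vmm_drv_page_release(page);
 *	vmm_drv_lease_break(hold, lease);
 *	vmm_drv_rele(hold);
 */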
2019 int
2020 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2021 {
2022 	vnode_t *vp = fp->f_vnode;
2023 	const dev_t dev = vp->v_rdev;
2024 	vmm_softc_t *sc;
2025 	vmm_hold_t *hold;
2026 	int err = 0;
2027 
2028 	if (vp->v_type != VCHR) {
2029 		return (ENXIO);
2030 	}
2031 	const major_t major = getmajor(dev);
2032 	const minor_t minor = getminor(dev);
2033 
2034 	mutex_enter(&vmmdev_mtx);
2035 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2036 		mutex_exit(&vmmdev_mtx);
2037 		return (ENOENT);
2038 	}
2039 	mutex_enter(&vmm_mtx);
2040 	mutex_exit(&vmmdev_mtx);
2041 
2042 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2043 		err = ENOENT;
2044 		goto out;
2045 	}
2046 	/* XXXJOY: check cred permissions against instance */
2047 
2048 	if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) {
2049 		err = EBUSY;
2050 		goto out;
2051 	}
2052 
2053 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2054 	hold->vmh_sc = sc;
2055 	hold->vmh_release_req = B_FALSE;
2056 
2057 	list_insert_tail(&sc->vmm_holds, hold);
2058 	sc->vmm_flags |= VMM_HELD;
2059 	*holdp = hold;
2060 
2061 out:
2062 	mutex_exit(&vmm_mtx);
2063 	return (err);
2064 }
2065 
2066 void
2067 vmm_drv_rele(vmm_hold_t *hold)
2068 {
2069 	vmm_softc_t *sc;
2070 	boolean_t hma_release = B_FALSE;
2071 
2072 	ASSERT(hold != NULL);
2073 	ASSERT(hold->vmh_sc != NULL);
2074 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2075 
2076 	mutex_enter(&vmm_mtx);
2077 	sc = hold->vmh_sc;
2078 	list_remove(&sc->vmm_holds, hold);
2079 	if (list_is_empty(&sc->vmm_holds)) {
2080 		sc->vmm_flags &= ~VMM_HELD;
2081 		cv_broadcast(&sc->vmm_cv);
2082 
2083 		/*
2084 		 * If pending hold(s) had prevented an auto-destruct of the
2085 		 * instance when it was closed, finish that clean-up now.
2086 		 */
2087 		if (sc->vmm_autodestruct && !sc->vmm_is_open) {
2088 			int err = vmm_destroy_locked(sc,
2089 			    VDO_NO_PURGE_WAIT, &hma_release);
2090 
2091 			VERIFY0(err);
2092 			VERIFY(hma_release);
2093 		}
2094 	}
2095 	mutex_exit(&vmm_mtx);
2096 	kmem_free(hold, sizeof (*hold));
2097 
2098 	if (hma_release) {
2099 		vmm_hma_release();
2100 	}
2101 }
2102 
2103 boolean_t
2104 vmm_drv_release_reqd(vmm_hold_t *hold)
2105 {
2106 	ASSERT(hold != NULL);
2107 
2108 	return (hold->vmh_release_req);
2109 }
2110 
2111 vmm_lease_t *
2112 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2113 {
2114 	vmm_softc_t *sc = hold->vmh_sc;
2115 	vmm_lease_t *lease;
2116 
2117 	ASSERT3P(expiref, !=, NULL);
2118 
2119 	if (hold->vmh_release_req) {
2120 		return (NULL);
2121 	}
2122 
2123 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2124 	list_link_init(&lease->vml_node);
2125 	lease->vml_expire_func = expiref;
2126 	lease->vml_expire_arg = arg;
2127 	lease->vml_expired = B_FALSE;
2128 	lease->vml_break_deferred = B_FALSE;
2129 	lease->vml_hold = hold;
2130 	/* cache the VM pointer for one less pointer chase */
2131 	lease->vml_vm = sc->vmm_vm;
2132 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2133 
2134 	mutex_enter(&sc->vmm_lease_lock);
2135 	while (sc->vmm_lease_blocker != 0) {
2136 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2137 	}
2138 	list_insert_tail(&sc->vmm_lease_list, lease);
2139 	vmm_read_lock(sc);
2140 	mutex_exit(&sc->vmm_lease_lock);
2141 
2142 	return (lease);
2143 }
2144 
2145 static void
2146 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2147 {
2148 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2149 
2150 	list_remove(&sc->vmm_lease_list, lease);
2151 	vmm_read_unlock(sc);
2152 	vmc_destroy(lease->vml_vmclient);
2153 	kmem_free(lease, sizeof (*lease));
2154 }
2155 
2156 static void
2157 vmm_lease_block(vmm_softc_t *sc)
2158 {
2159 	mutex_enter(&sc->vmm_lease_lock);
2160 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2161 	sc->vmm_lease_blocker++;
2162 	if (sc->vmm_lease_blocker == 1) {
2163 		list_t *list = &sc->vmm_lease_list;
2164 		vmm_lease_t *lease = list_head(list);
2165 
2166 		while (lease != NULL) {
2167 			void *arg = lease->vml_expire_arg;
2168 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2169 			boolean_t sync_break = B_FALSE;
2170 
2171 			/*
2172 			 * Since the lease expiration notification may
2173 			 * need to take locks which would deadlock with
2174 			 * vmm_lease_lock, drop it across the call.
2175 			 *
2176 			 * We are the only thread allowed to manipulate
2177 			 * vmm_lease_list right now, so it is safe to
2178 			 * continue iterating through it after
2179 			 * reacquiring the lock.
2180 			 */
2181 			lease->vml_expired = B_TRUE;
2182 			mutex_exit(&sc->vmm_lease_lock);
2183 			sync_break = expiref(arg);
2184 			mutex_enter(&sc->vmm_lease_lock);
2185 
2186 			if (sync_break) {
2187 				vmm_lease_t *next;
2188 
2189 				/*
2190 				 * Leases which are broken synchronously
2191 				 * result in vmm_read_unlock() calls from a
2192 				 * different thread than the corresponding
2193 				 * vmm_read_lock().  This is acceptable, given
2194 				 * that the rwlock underpinning the whole
2195 				 * mechanism tolerates the behavior.  This
2196 				 * flexibility is _only_ afforded to VM read
2197 				 * lock (RW_READER) holders.
2198 				 */
2199 				next = list_next(list, lease);
2200 				vmm_lease_break_locked(sc, lease);
2201 				lease = next;
2202 			} else {
2203 				lease = list_next(list, lease);
2204 			}
2205 		}
2206 
2207 		/* Process leases which were not broken synchronously. */
2208 		while (!list_is_empty(list)) {
2209 			/*
2210 			 * Although the nested loops are quadratic, the number
2211 			 * of leases is small.
2212 			 */
2213 			lease = list_head(list);
2214 			while (lease != NULL) {
2215 				vmm_lease_t *next = list_next(list, lease);
2216 				if (lease->vml_break_deferred) {
2217 					vmm_lease_break_locked(sc, lease);
2218 				}
2219 				lease = next;
2220 			}
2221 			if (list_is_empty(list)) {
2222 				break;
2223 			}
2224 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2225 		}
2226 		/* Wake anyone else waiting for the lease list to be empty. */
2227 		cv_broadcast(&sc->vmm_lease_cv);
2228 	} else {
2229 		list_t *list = &sc->vmm_lease_list;
2230 
2231 		/*
2232 		 * Some other thread beat us to the duty of lease cleanup.
2233 		 * Wait until that is complete.
2234 		 */
2235 		while (!list_is_empty(list)) {
2236 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2237 		}
2238 	}
2239 	mutex_exit(&sc->vmm_lease_lock);
2240 }
2241 
2242 static void
2243 vmm_lease_unblock(vmm_softc_t *sc)
2244 {
2245 	mutex_enter(&sc->vmm_lease_lock);
2246 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2247 	sc->vmm_lease_blocker--;
2248 	if (sc->vmm_lease_blocker == 0) {
2249 		cv_broadcast(&sc->vmm_lease_cv);
2250 	}
2251 	mutex_exit(&sc->vmm_lease_lock);
2252 }
2253 
2254 void
2255 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2256 {
2257 	vmm_softc_t *sc = hold->vmh_sc;
2258 
2259 	VERIFY3P(hold, ==, lease->vml_hold);
2260 	VERIFY(!lease->vml_break_deferred);
2261 
2262 	mutex_enter(&sc->vmm_lease_lock);
2263 	if (sc->vmm_lease_blocker == 0) {
2264 		vmm_lease_break_locked(sc, lease);
2265 	} else {
2266 		/*
2267 		 * Defer the lease-breaking to whichever thread is currently
2268 		 * cleaning up all leases as part of a vmm_lease_block() call.
2269 		 */
2270 		lease->vml_break_deferred = B_TRUE;
2271 		cv_broadcast(&sc->vmm_lease_cv);
2272 	}
2273 	mutex_exit(&sc->vmm_lease_lock);
2274 }
2275 
2276 boolean_t
2277 vmm_drv_lease_expired(vmm_lease_t *lease)
2278 {
2279 	return (lease->vml_expired);
2280 }
2281 
2282 vmm_page_t *
2283 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2284 {
2285 	ASSERT(lease != NULL);
2286 	ASSERT0(gpa & PAGEOFFSET);
2287 
2288 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2289 }
2290 
2291 void
2292 vmm_drv_page_release(vmm_page_t *vmmp)
2293 {
2294 	(void) vmp_release((vm_page_t *)vmmp);
2295 }
2296 
2297 void
2298 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2299 {
2300 	(void) vmp_release_chain((vm_page_t *)vmmp);
2301 }
2302 
2303 const void *
2304 vmm_drv_page_readable(const vmm_page_t *vmmp)
2305 {
2306 	return (vmp_get_readable((const vm_page_t *)vmmp));
2307 }
2308 
2309 void *
2310 vmm_drv_page_writable(const vmm_page_t *vmmp)
2311 {
2312 	return (vmp_get_writable((const vm_page_t *)vmmp));
2313 }
2314 
2315 void
2316 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2317 {
2318 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2319 }
2320 
2321 vmm_page_t *
2322 vmm_drv_page_next(const vmm_page_t *vmmp)
2323 {
2324 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2325 }
2326 
2327 int
2328 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2329 {
2330 	ASSERT(lease != NULL);
2331 
2332 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2333 }
2334 
2335 int
2336 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2337     void *arg, void **cookie)
2338 {
2339 	vmm_softc_t *sc;
2340 	int err;
2341 
2342 	ASSERT(hold != NULL);
2343 	ASSERT(cookie != NULL);
2344 
2345 	sc = hold->vmh_sc;
2346 	mutex_enter(&vmm_mtx);
2347 	/* Confirm that hook installation is not blocked */
2348 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2349 		mutex_exit(&vmm_mtx);
2350 		return (EBUSY);
2351 	}
2352 	/*
2353 	 * Optimistically record an installed hook which will prevent a block
2354 	 * from being asserted while the mutex is dropped.
2355 	 */
2356 	hold->vmh_ioport_hook_cnt++;
2357 	mutex_exit(&vmm_mtx);
2358 
2359 	vmm_write_lock(sc);
2360 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2361 	    arg, cookie);
2362 	vmm_write_unlock(sc);
2363 
2364 	if (err != 0) {
2365 		mutex_enter(&vmm_mtx);
2366 		/* Walk back optimism about the hook installation */
2367 		hold->vmh_ioport_hook_cnt--;
2368 		mutex_exit(&vmm_mtx);
2369 	}
2370 	return (err);
2371 }
2372 
2373 void
2374 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2375 {
2376 	vmm_softc_t *sc;
2377 
2378 	ASSERT(hold != NULL);
2379 	ASSERT(cookie != NULL);
2380 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2381 
2382 	sc = hold->vmh_sc;
2383 	vmm_write_lock(sc);
2384 	vm_ioport_unhook(sc->vmm_vm, cookie);
2385 	vmm_write_unlock(sc);
2386 
2387 	mutex_enter(&vmm_mtx);
2388 	hold->vmh_ioport_hook_cnt--;
2389 	mutex_exit(&vmm_mtx);
2390 }
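
/*
 * Illustrative pairing of the two hook interfaces above; 'my_ioport_handler',
 * 'arg', and the port number are hypothetical:
 *
 *	void *cookie;
 *
 *	if (vmm_drv_ioport_hook(hold, 0x3f8, my_ioport_handler, arg,
 *	    &cookie) == 0) {
 *		... handler active for the port ...
 *		vmm_drv_ioport_unhook(hold, &cookie);
 *	}
 */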
2391 
2392 static int
2393 vmm_drv_purge(vmm_softc_t *sc, boolean_t no_wait)
2394 {
2395 	ASSERT(MUTEX_HELD(&vmm_mtx));
2396 
2397 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2398 		vmm_hold_t *hold;
2399 
2400 		sc->vmm_flags |= VMM_CLEANUP;
2401 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2402 		    hold = list_next(&sc->vmm_holds, hold)) {
2403 			hold->vmh_release_req = B_TRUE;
2404 		}
2405 
2406 		/*
2407 		 * Require that all leases on the instance be broken, now that
2408 		 * all associated holds have been marked as needing release.
2409 		 *
2410 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2411 		 * lessees are slow to respond, it would be nice to leave it
2412 		 * available for other parties.
2413 		 */
2414 		mutex_exit(&vmm_mtx);
2415 		vmm_lease_block(sc);
2416 		vmm_lease_unblock(sc);
2417 		mutex_enter(&vmm_mtx);
2418 
2419 		/*
2420 		 * With all of the leases broken, we can proceed in an orderly
2421 		 * fashion to waiting for any lingering holds to be dropped.
2422 		 */
2423 		while ((sc->vmm_flags & VMM_HELD) != 0) {
2424 			/*
2425 			 * Some holds remain, so wait (if acceptable) for them
2426 			 * to be cleaned up.
2427 			 */
2428 			if (no_wait ||
2429 			    cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2430 				sc->vmm_flags &= ~VMM_CLEANUP;
2431 				return (EINTR);
2432 			}
2433 		}
2434 		sc->vmm_flags &= ~VMM_CLEANUP;
2435 	}
2436 
2437 	VERIFY(list_is_empty(&sc->vmm_holds));
2438 	sc->vmm_flags |= VMM_PURGED;
2439 	return (0);
2440 }
2441 
2442 static int
2443 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2444 {
2445 	int err = 0;
2446 
2447 	mutex_enter(&vmm_mtx);
2448 	if (!enable_block) {
2449 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2450 
2451 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2452 		goto done;
2453 	}
2454 
2455 	/* If any holds have hooks installed, the block attempt fails */
2456 	if (!list_is_empty(&sc->vmm_holds)) {
2457 		vmm_hold_t *hold;
2458 
2459 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2460 		    hold = list_next(&sc->vmm_holds, hold)) {
2461 			if (hold->vmh_ioport_hook_cnt != 0) {
2462 				err = EBUSY;
2463 				goto done;
2464 			}
2465 		}
2466 	}
2467 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2468 
2469 done:
2470 	mutex_exit(&vmm_mtx);
2471 	return (err);
2472 }
2473 
2474 static int
2475 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2476     boolean_t *hma_release)
2477 {
2478 	dev_info_t	*pdip = ddi_get_parent(vmmdev_dip);
2479 	minor_t		minor;
2480 
2481 	ASSERT(MUTEX_HELD(&vmm_mtx));
2482 
2483 	*hma_release = B_FALSE;
2484 
2485 	if (vmm_drv_purge(sc, (opts & VDO_NO_PURGE_WAIT) != 0) != 0) {
2486 		return (EINTR);
2487 	}
2488 
2489 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2490 		vmm_zsd_rem_vm(sc);
2491 	}
2492 
2493 	/* Clean up devmem entries */
2494 	vmmdev_devmem_purge(sc);
2495 
2496 	list_remove(&vmm_list, sc);
2497 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2498 	minor = sc->vmm_minor;
2499 	zone_rele(sc->vmm_zone);
2500 	if (sc->vmm_is_open) {
2501 		list_insert_tail(&vmm_destroy_list, sc);
2502 		sc->vmm_flags |= VMM_DESTROY;
2503 	} else {
2504 		vmm_kstat_fini(sc);
2505 		vm_destroy(sc->vmm_vm);
2506 		ddi_soft_state_free(vmm_statep, minor);
2507 		id_free(vmm_minors, minor);
2508 		*hma_release = B_TRUE;
2509 	}
2510 	(void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
2511 
2512 	return (0);
2513 }
2514 
2515 int
2516 vmm_zone_vm_destroy(vmm_softc_t *sc)
2517 {
2518 	boolean_t	hma_release = B_FALSE;
2519 	int		err;
2520 
2521 	mutex_enter(&vmm_mtx);
2522 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2523 	mutex_exit(&vmm_mtx);
2524 
2525 	if (hma_release)
2526 		vmm_hma_release();
2527 
2528 	return (err);
2529 }
2530 
2531 /* ARGSUSED */
2532 static int
2533 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2534 {
2535 	boolean_t	hma_release = B_FALSE;
2536 	vmm_softc_t	*sc;
2537 	int		err;
2538 
2539 	if (crgetuid(cr) != 0)
2540 		return (EPERM);
2541 
2542 	mutex_enter(&vmm_mtx);
2543 
2544 	if ((sc = vmm_lookup(req->name)) == NULL) {
2545 		mutex_exit(&vmm_mtx);
2546 		return (ENOENT);
2547 	}
2548 	/*
2549 	 * We don't check this in vmm_lookup() since that function is also used
2550 	 * for validation during create, and currently vmm names must be unique.
2551 	 */
2552 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2553 		mutex_exit(&vmm_mtx);
2554 		return (EPERM);
2555 	}
2556 	err = vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release);
2557 
2558 	mutex_exit(&vmm_mtx);
2559 
2560 	if (hma_release)
2561 		vmm_hma_release();
2562 
2563 	return (err);
2564 }
2565 
2566 #define	VCPU_NAME_BUFLEN	32
2567 
2568 static int
2569 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2570 {
2571 	zoneid_t zid = crgetzoneid(cr);
2572 	int instance = minor;
2573 	kstat_t *ksp;
2574 
2575 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2576 
2577 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2578 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2579 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2580 
2581 	if (ksp == NULL) {
2582 		return (-1);
2583 	}
2584 	sc->vmm_kstat_vm = ksp;
2585 
2586 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2587 		char namebuf[VCPU_NAME_BUFLEN];
2588 
2589 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2590 
2591 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2592 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2593 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2594 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2595 		    0, zid);
2596 		if (ksp == NULL) {
2597 			goto fail;
2598 		}
2599 
2600 		sc->vmm_kstat_vcpu[i] = ksp;
2601 	}
2602 
2603 	/*
2604 	 * If this instance is associated with a non-global zone, make its
2605 	 * kstats visible from the GZ.
2606 	 */
2607 	if (zid != GLOBAL_ZONEID) {
2608 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2609 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2610 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2611 		}
2612 	}
2613 
2614 	return (0);
2615 
2616 fail:
2617 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2618 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2619 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2620 			sc->vmm_kstat_vcpu[i] = NULL;
2621 		} else {
2622 			break;
2623 		}
2624 	}
2625 	kstat_delete(sc->vmm_kstat_vm);
2626 	sc->vmm_kstat_vm = NULL;
2627 	return (-1);
2628 }
2629 
2630 static void
2631 vmm_kstat_init(vmm_softc_t *sc)
2632 {
2633 	kstat_t *ksp;
2634 
2635 	ASSERT3P(sc->vmm_vm, !=, NULL);
2636 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2637 
2638 	ksp = sc->vmm_kstat_vm;
2639 	vmm_kstats_t *vk = ksp->ks_data;
2640 	ksp->ks_private = sc->vmm_vm;
2641 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2642 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2643 
2644 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2645 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2646 
2647 		ksp = sc->vmm_kstat_vcpu[i];
2648 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2649 
2650 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2651 		vvk->vvk_vcpu.value.ui32 = i;
2652 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2653 		    KSTAT_DATA_UINT64);
2654 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2655 		    KSTAT_DATA_UINT64);
2656 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2657 		    KSTAT_DATA_UINT64);
2658 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2659 		    KSTAT_DATA_UINT64);
2660 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2661 		    KSTAT_DATA_UINT64);
2662 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2663 		    KSTAT_DATA_UINT64);
2664 		ksp->ks_private = sc->vmm_vm;
2665 		ksp->ks_update = vmm_kstat_update_vcpu;
2666 	}
2667 
2668 	kstat_install(sc->vmm_kstat_vm);
2669 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2670 		kstat_install(sc->vmm_kstat_vcpu[i]);
2671 	}
2672 }
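
/*
 * Usage note (illustrative): assuming VMM_MODULE_NAME expands to "vmm", the
 * kstats installed above can be read with the kstat command from the zone
 * owning the instance, or from the GZ, e.g.:
 *
 *	kstat -m vmm -n vm -s vm_name
 *	kstat -m vmm -n vcpu0
 */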
2673 
2674 static void
2675 vmm_kstat_fini(vmm_softc_t *sc)
2676 {
2677 	ASSERT(sc->vmm_kstat_vm != NULL);
2678 
2679 	kstat_delete(sc->vmm_kstat_vm);
2680 	sc->vmm_kstat_vm = NULL;
2681 
2682 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2683 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2684 
2685 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2686 		sc->vmm_kstat_vcpu[i] = NULL;
2687 	}
2688 }
2689 
2690 static int
2691 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2692 {
2693 	minor_t		minor;
2694 	vmm_softc_t	*sc;
2695 
2696 	/*
2697 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2698 	 * verified to be safe.
2699 	 */
2700 	if (curproc->p_model != DATAMODEL_LP64) {
2701 		return (EFBIG);
2702 	}
2703 
2704 	minor = getminor(*devp);
2705 	if (minor == VMM_CTL_MINOR) {
2706 		/*
2707 		 * Master control device must be opened exclusively.
2708 		 */
2709 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2710 			return (EINVAL);
2711 		}
2712 
2713 		return (0);
2714 	}
2715 
2716 	mutex_enter(&vmm_mtx);
2717 	sc = ddi_get_soft_state(vmm_statep, minor);
2718 	if (sc == NULL) {
2719 		mutex_exit(&vmm_mtx);
2720 		return (ENXIO);
2721 	}
2722 
2723 	sc->vmm_is_open = B_TRUE;
2724 	mutex_exit(&vmm_mtx);
2725 
2726 	return (0);
2727 }
2728 
2729 static int
2730 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2731 {
2732 	minor_t		minor;
2733 	vmm_softc_t	*sc;
2734 	boolean_t	hma_release = B_FALSE;
2735 
2736 	minor = getminor(dev);
2737 	if (minor == VMM_CTL_MINOR)
2738 		return (0);
2739 
2740 	mutex_enter(&vmm_mtx);
2741 	sc = ddi_get_soft_state(vmm_statep, minor);
2742 	if (sc == NULL) {
2743 		mutex_exit(&vmm_mtx);
2744 		return (ENXIO);
2745 	}
2746 
2747 	VERIFY(sc->vmm_is_open);
2748 	sc->vmm_is_open = B_FALSE;
2749 
2750 	/*
2751 	 * If this VM was destroyed while the vmm device was open, then
2752 	 * clean it up now that it is closed.
2753 	 */
2754 	if (sc->vmm_flags & VMM_DESTROY) {
2755 		list_remove(&vmm_destroy_list, sc);
2756 		vmm_kstat_fini(sc);
2757 		vm_destroy(sc->vmm_vm);
2758 		ddi_soft_state_free(vmm_statep, minor);
2759 		id_free(vmm_minors, minor);
2760 		hma_release = B_TRUE;
2761 	} else if (sc->vmm_autodestruct) {
2762 		 * Attempt auto-destruct on the instance if requested.
2763 		 * Attempt auto-destruct on instance if requested.
2764 		 *
2765 		 * Do not wait for existing holds to be purged from the
2766 		 * instance, since there is no guarantee that will happen in a
2767 		 * timely manner.  Auto-destruction will resume when the last
2768 		 * hold is released. (See: vmm_drv_rele)
2769 		 */
2770 		(void) vmm_destroy_locked(sc, VDO_NO_PURGE_WAIT, &hma_release);
2771 	}
2772 	mutex_exit(&vmm_mtx);
2773 
2774 	if (hma_release)
2775 		vmm_hma_release();
2776 
2777 	return (0);
2778 }
2779 
2780 static int
2781 vmm_is_supported(intptr_t arg)
2782 {
2783 	int r;
2784 	const char *msg;
2785 
2786 	if (vmm_is_intel()) {
2787 		r = vmx_x86_supported(&msg);
2788 	} else if (vmm_is_svm()) {
2789 		/*
2790 		 * HMA already ensured that the features necessary for SVM
2791 		 * operation were present and online during vmm_attach().
2792 		 */
2793 		r = 0;
2794 	} else {
2795 		r = ENXIO;
2796 		msg = "Unsupported CPU vendor";
2797 	}
2798 
2799 	if (r != 0 && arg != (intptr_t)NULL) {
2800 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2801 			return (EFAULT);
2802 	}
2803 	return (r);
2804 }
2805 
2806 static int
2807 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2808 {
2809 	void *argp = (void *)arg;
2810 
2811 	switch (cmd) {
2812 	case VMM_CREATE_VM: {
2813 		struct vm_create_req req;
2814 
2815 		if ((md & FWRITE) == 0) {
2816 			return (EPERM);
2817 		}
2818 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2819 			return (EFAULT);
2820 		}
2821 		return (vmmdev_do_vm_create(&req, cr));
2822 	}
2823 	case VMM_DESTROY_VM: {
2824 		struct vm_destroy_req req;
2825 
2826 		if ((md & FWRITE) == 0) {
2827 			return (EPERM);
2828 		}
2829 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2830 			return (EFAULT);
2831 		}
2832 		return (vmmdev_do_vm_destroy(&req, cr));
2833 	}
2834 	case VMM_VM_SUPPORTED:
2835 		return (vmm_is_supported(arg));
2836 	case VMM_INTERFACE_VERSION:
2837 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
2838 		return (0);
2839 	case VMM_CHECK_IOMMU:
2840 		if (!vmm_check_iommu()) {
2841 			return (ENXIO);
2842 		}
2843 		return (0);
2844 	case VMM_RESV_QUERY:
2845 	case VMM_RESV_ADD:
2846 	case VMM_RESV_REMOVE:
2847 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2848 	default:
2849 		break;
2850 	}
2851 	/* No other actions are legal on the ctl device */
2852 	return (ENOTTY);
2853 }
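
/*
 * A minimal userland sketch of exercising VMM_CREATE_VM against the control
 * node; the device path and error handling are illustrative assumptions.  Per
 * vmm_open() and the FWRITE check above, the node must be opened exclusively
 * and writable:
 *
 *	struct vm_create_req req = { 0 };
 *	int ctl = open("/dev/vmmctl", O_RDWR | O_EXCL);
 *
 *	(void) strlcpy(req.name, "example-vm", sizeof (req.name));
 *	if (ctl < 0 || ioctl(ctl, VMM_CREATE_VM, &req) != 0)
 *		err(EXIT_FAILURE, "VMM_CREATE_VM");
 */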
2854 
2855 static int
2856 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2857     int *rvalp)
2858 {
2859 	vmm_softc_t	*sc;
2860 	minor_t		minor;
2861 
2862 	/*
2863 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2864 	 * verified to be safe.
2865 	 */
2866 	if (curproc->p_model != DATAMODEL_LP64) {
2867 		return (EFBIG);
2868 	}
2869 
2870 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
2871 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
2872 		return (ENOTSUP);
2873 	}
2874 
2875 	minor = getminor(dev);
2876 
2877 	if (minor == VMM_CTL_MINOR) {
2878 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
2879 	}
2880 
2881 	sc = ddi_get_soft_state(vmm_statep, minor);
2882 	ASSERT(sc);
2883 
2884 	if (sc->vmm_flags & VMM_DESTROY)
2885 		return (ENXIO);
2886 
2887 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
2888 }
2889 
2890 static int
2891 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
2892     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
2893 {
2894 	vmm_softc_t *sc;
2895 	const minor_t minor = getminor(dev);
2896 	int err;
2897 
2898 	if (minor == VMM_CTL_MINOR) {
2899 		return (ENODEV);
2900 	}
2901 	if (off < 0 || (off + len) <= 0) {
2902 		return (EINVAL);
2903 	}
2904 	if ((prot & PROT_USER) == 0) {
2905 		return (EACCES);
2906 	}
2907 
2908 	sc = ddi_get_soft_state(vmm_statep, minor);
2909 	ASSERT(sc);
2910 
2911 	if (sc->vmm_flags & VMM_DESTROY)
2912 		return (ENXIO);
2913 
2914 	/* Grab read lock on the VM to prevent any changes to the memory map */
2915 	vmm_read_lock(sc);
2916 
2917 	if (off >= VM_DEVMEM_START) {
2918 		int segid;
2919 		off_t segoff;
2920 
2921 		/* Mapping a devmem "device" */
2922 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
2923 			err = ENODEV;
2924 		} else {
2925 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
2926 			    addrp, prot, maxprot, flags);
2927 		}
2928 	} else {
2929 		/* Mapping a part of the guest physical space */
2930 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
2931 		    maxprot, flags);
2932 	}
2933 
2934 	vmm_read_unlock(sc);
2935 	return (err);
2936 }
2937 
2938 static sdev_plugin_validate_t
2939 vmm_sdev_validate(sdev_ctx_t ctx)
2940 {
2941 	const char *name = sdev_ctx_name(ctx);
2942 	vmm_softc_t *sc;
2943 	sdev_plugin_validate_t ret;
2944 	minor_t minor;
2945 
2946 	if (sdev_ctx_vtype(ctx) != VCHR)
2947 		return (SDEV_VTOR_INVALID);
2948 
2949 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
2950 
2951 	mutex_enter(&vmm_mtx);
2952 	if ((sc = vmm_lookup(name)) == NULL)
2953 		ret = SDEV_VTOR_INVALID;
2954 	else if (sc->vmm_minor != minor)
2955 		ret = SDEV_VTOR_STALE;
2956 	else
2957 		ret = SDEV_VTOR_VALID;
2958 	mutex_exit(&vmm_mtx);
2959 
2960 	return (ret);
2961 }
2962 
2963 static int
2964 vmm_sdev_filldir(sdev_ctx_t ctx)
2965 {
2966 	vmm_softc_t *sc;
2967 	int ret;
2968 
2969 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
2970 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
2971 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
2972 		return (EINVAL);
2973 	}
2974 
2975 	mutex_enter(&vmm_mtx);
2976 	ASSERT(vmmdev_dip != NULL);
2977 	for (sc = list_head(&vmm_list); sc != NULL;
2978 	    sc = list_next(&vmm_list, sc)) {
2979 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
2980 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
2981 			    S_IFCHR | 0600,
2982 			    makedevice(ddi_driver_major(vmmdev_dip),
2983 			    sc->vmm_minor));
2984 		} else {
2985 			continue;
2986 		}
2987 		if (ret != 0 && ret != EEXIST)
2988 			goto out;
2989 	}
2990 
2991 	ret = 0;
2992 
2993 out:
2994 	mutex_exit(&vmm_mtx);
2995 	return (ret);
2996 }
2997 
2998 /* ARGSUSED */
2999 static void
3000 vmm_sdev_inactive(sdev_ctx_t ctx)
3001 {
3002 }
3003 
3004 static sdev_plugin_ops_t vmm_sdev_ops = {
3005 	.spo_version = SDEV_PLUGIN_VERSION,
3006 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3007 	.spo_validate = vmm_sdev_validate,
3008 	.spo_filldir = vmm_sdev_filldir,
3009 	.spo_inactive = vmm_sdev_inactive
3010 };
3011 
3012 /* ARGSUSED */
3013 static int
3014 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3015 {
3016 	int error;
3017 
3018 	switch (cmd) {
3019 	case DDI_INFO_DEVT2DEVINFO:
3020 		*result = (void *)vmmdev_dip;
3021 		error = DDI_SUCCESS;
3022 		break;
3023 	case DDI_INFO_DEVT2INSTANCE:
3024 		*result = (void *)0;
3025 		error = DDI_SUCCESS;
3026 		break;
3027 	default:
3028 		error = DDI_FAILURE;
3029 		break;
3030 	}
3031 	return (error);
3032 }
3033 
3034 static int
3035 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3036 {
3037 	sdev_plugin_hdl_t sph;
3038 	hma_reg_t *reg = NULL;
3039 	boolean_t vmm_loaded = B_FALSE;
3040 
3041 	if (cmd != DDI_ATTACH) {
3042 		return (DDI_FAILURE);
3043 	}
3044 
3045 	mutex_enter(&vmmdev_mtx);
3046 	/* Ensure we are not already attached. */
3047 	if (vmmdev_dip != NULL) {
3048 		mutex_exit(&vmmdev_mtx);
3049 		return (DDI_FAILURE);
3050 	}
3051 
3052 	vmm_sol_glue_init();
3053 
3054 	/*
3055 	 * Perform temporary HMA registration to determine if the system
3056 	 * is capable of supporting bhyve.
3057 	 */
3058 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3059 		goto fail;
3060 	} else if (vmm_mod_load() != 0) {
3061 		goto fail;
3062 	}
3063 	vmm_loaded = B_TRUE;
3064 	hma_unregister(reg);
3065 	reg = NULL;
3066 
3067 	/* Create control node.  Other nodes will be created on demand. */
3068 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3069 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3070 		goto fail;
3071 	}
3072 
3073 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3074 	if (sph == (sdev_plugin_hdl_t)NULL) {
3075 		ddi_remove_minor_node(dip, NULL);
3076 		goto fail;
3077 	}
3078 
3079 	ddi_report_dev(dip);
3080 	vmmdev_sdev_hdl = sph;
3081 	vmmdev_dip = dip;
3082 	mutex_exit(&vmmdev_mtx);
3083 	return (DDI_SUCCESS);
3084 
3085 fail:
3086 	if (vmm_loaded) {
3087 		VERIFY0(vmm_mod_unload());
3088 	}
3089 	if (reg != NULL) {
3090 		hma_unregister(reg);
3091 	}
3092 	vmm_sol_glue_cleanup();
3093 	mutex_exit(&vmmdev_mtx);
3094 	return (DDI_FAILURE);
3095 }
3096 
3097 static int
3098 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3099 {
3100 	if (cmd != DDI_DETACH) {
3101 		return (DDI_FAILURE);
3102 	}
3103 
3104 	/*
3105 	 * Ensure that all resources have been cleaned up.
3106 	 *
3107 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3108 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3109 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3110 	 * devinfo, including our own, while holding vmmdev_mtx.
3111 	 */
3112 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3113 		return (DDI_FAILURE);
3114 
3115 	mutex_enter(&vmm_mtx);
3116 	if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) {
3117 		mutex_exit(&vmm_mtx);
3118 		mutex_exit(&vmmdev_mtx);
3119 		return (DDI_FAILURE);
3120 	}
3121 	mutex_exit(&vmm_mtx);
3122 
3123 	if (!vmmr_is_empty()) {
3124 		mutex_exit(&vmmdev_mtx);
3125 		return (DDI_FAILURE);
3126 	}
3127 
3128 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3129 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3130 		mutex_exit(&vmmdev_mtx);
3131 		return (DDI_FAILURE);
3132 	}
3133 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3134 
3135 	/* Remove the control node. */
3136 	ddi_remove_minor_node(dip, "ctl");
3137 	vmmdev_dip = NULL;
3138 
3139 	VERIFY0(vmm_mod_unload());
3140 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3141 	vmm_sol_glue_cleanup();
3142 
3143 	mutex_exit(&vmmdev_mtx);
3144 
3145 	return (DDI_SUCCESS);
3146 }
3147 
3148 static struct cb_ops vmm_cb_ops = {
3149 	vmm_open,
3150 	vmm_close,
3151 	nodev,		/* strategy */
3152 	nodev,		/* print */
3153 	nodev,		/* dump */
3154 	nodev,		/* read */
3155 	nodev,		/* write */
3156 	vmm_ioctl,
3157 	nodev,		/* devmap */
3158 	nodev,		/* mmap */
3159 	vmm_segmap,
3160 	nochpoll,	/* poll */
3161 	ddi_prop_op,
3162 	NULL,
3163 	D_NEW | D_MP | D_DEVMAP
3164 };
3165 
3166 static struct dev_ops vmm_ops = {
3167 	DEVO_REV,
3168 	0,
3169 	vmm_info,
3170 	nulldev,	/* identify */
3171 	nulldev,	/* probe */
3172 	vmm_attach,
3173 	vmm_detach,
3174 	nodev,		/* reset */
3175 	&vmm_cb_ops,
3176 	(struct bus_ops *)NULL
3177 };
3178 
3179 static struct modldrv modldrv = {
3180 	&mod_driverops,
3181 	"bhyve vmm",
3182 	&vmm_ops
3183 };
3184 
3185 static struct modlinkage modlinkage = {
3186 	MODREV_1,
3187 	&modldrv,
3188 	NULL
3189 };
3190 
3191 int
3192 _init(void)
3193 {
3194 	int	error;
3195 
3196 	sysinit();
3197 
3198 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3199 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3200 	list_create(&vmm_list, sizeof (vmm_softc_t),
3201 	    offsetof(vmm_softc_t, vmm_node));
3202 	list_create(&vmm_destroy_list, sizeof (vmm_softc_t),
3203 	    offsetof(vmm_softc_t, vmm_node));
3204 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3205 
3206 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3207 	if (error) {
3208 		return (error);
3209 	}
3210 
3211 	vmm_zsd_init();
3212 	vmmr_init();
3213 
3214 	error = mod_install(&modlinkage);
3215 	if (error) {
3216 		ddi_soft_state_fini(&vmm_statep);
3217 		vmm_zsd_fini();
3218 		vmmr_fini();
3219 	}
3220 
3221 	return (error);
3222 }
3223 
3224 int
3225 _fini(void)
3226 {
3227 	int	error;
3228 
3229 	error = mod_remove(&modlinkage);
3230 	if (error) {
3231 		return (error);
3232 	}
3233 
3234 	vmm_zsd_fini();
3235 	vmmr_fini();
3236 
3237 	ddi_soft_state_fini(&vmm_statep);
3238 
3239 	return (0);
3240 }
3241 
3242 int
3243 _info(struct modinfo *modinfop)
3244 {
3245 	return (mod_info(&modlinkage, modinfop));
3246 }
3247