xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 6fa29843813e354e472ca1ef80590ab80e2362b7)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2023 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
62 /*
63  * Locking details:
64  *
65  * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is
66  * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
67  * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
68  * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
69  * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
70  */
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
83 /*
84  * Until device emulation in bhyve had been adequately scrutinized and tested,
85  * there was (justified) concern that unusual or corrupt device state payloads
86  * could crash the host when loaded via the vmm-data interface.
87  *
88  * Now that those concerns have been mitigated, this protection is loosened to
89  * default-allow, but the switch is left in place, in case there is a need to
90  * once again clamp down on vmm-data writes.
91  */
92 int		vmm_allow_state_writes = 1;
93 
94 static const char *vmmdev_hvm_name = "bhyve";
95 
96 /* For sdev plugin (/dev) */
97 #define	VMM_SDEV_ROOT "/dev/vmm"
98 
99 /* From uts/intel/io/vmm/intel/vmx.c */
100 extern int vmx_x86_supported(const char **);
101 
102 /* Holds and hooks from drivers external to vmm */
103 struct vmm_hold {
104 	list_node_t	vmh_node;
105 	vmm_softc_t	*vmh_sc;
106 	boolean_t	vmh_release_req;
107 	uint_t		vmh_ioport_hook_cnt;
108 };
109 
110 struct vmm_lease {
111 	list_node_t		vml_node;
112 	struct vm		*vml_vm;
113 	vm_client_t		*vml_vmclient;
114 	boolean_t		vml_expired;
115 	boolean_t		vml_break_deferred;
116 	boolean_t		(*vml_expire_func)(void *);
117 	void			*vml_expire_arg;
118 	struct vmm_hold		*vml_hold;
119 };
120 
121 /* Options for vmm_destroy_locked */
122 typedef enum vmm_destroy_opts {
123 	VDO_DEFAULT		= 0,
124 	/*
125 	 * Indicate that zone-specific-data associated with this VM not be
126 	 * cleaned up as part of the destroy.  Skipping ZSD clean-up is
127 	 * necessary when VM is being destroyed as part of zone destruction,
128 	 * when said ZSD is already being cleaned up.
129 	 */
130 	VDO_NO_CLEAN_ZSD	= (1 << 0),
131 	/*
132 	 * Attempt to wait for VM destruction to complete.  This is opt-in,
133 	 * since there are many normal conditions which could lead to
134 	 * destruction being stalled pending other clean-up.
135 	 */
136 	VDO_ATTEMPT_WAIT	= (1 << 1),
137 } vmm_destroy_opts_t;
138 
139 static void vmm_hma_release(void);
140 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
141 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
142 static void vmm_lease_block(vmm_softc_t *);
143 static void vmm_lease_unblock(vmm_softc_t *);
144 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
145 static void vmm_kstat_init(vmm_softc_t *);
146 static void vmm_kstat_fini(vmm_softc_t *);
147 
148 /*
149  * The 'devmem' hack:
150  *
151  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
152  * in the vm which appear with their own name related to the vm under /dev.
153  * Since this would be a hassle from an sdev perspective and would require a
154  * new cdev interface (or complicate the existing one), we choose to implement
155  * this in a different manner.  Direct access to the underlying vm memory
156  * segments is exposed by placing them in a range of offsets beyond the normal
157  * guest memory space.  Userspace can query the appropriate offset to mmap()
158  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
159  */
160 
161 static vmm_devmem_entry_t *
162 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
163 {
164 	vmm_devmem_entry_t *ent = NULL;
165 	list_t *dl = &sc->vmm_devmem_list;
166 
167 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
168 		if (ent->vde_segid == segid) {
169 			return (ent);
170 		}
171 	}
172 	return (NULL);
173 }
174 
175 static int
176 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
177 {
178 	int error;
179 	bool sysmem;
180 
181 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
182 	    NULL);
183 	if (error || mseg->len == 0)
184 		return (error);
185 
186 	if (!sysmem) {
187 		vmm_devmem_entry_t *de;
188 
189 		de = vmmdev_devmem_find(sc, mseg->segid);
190 		if (de != NULL) {
191 			(void) strlcpy(mseg->name, de->vde_name,
192 			    sizeof (mseg->name));
193 		}
194 	} else {
195 		bzero(mseg->name, sizeof (mseg->name));
196 	}
197 
198 	return (error);
199 }
200 
201 static int
202 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
203 {
204 	off_t map_offset;
205 	vmm_devmem_entry_t *entry;
206 
207 	if (list_is_empty(&sc->vmm_devmem_list)) {
208 		map_offset = VM_DEVMEM_START;
209 	} else {
210 		entry = list_tail(&sc->vmm_devmem_list);
211 		map_offset = entry->vde_off + entry->vde_len;
212 		if (map_offset < entry->vde_off) {
213 			/* Do not tolerate overflow */
214 			return (ERANGE);
215 		}
216 		/*
217 		 * XXXJOY: We could choose to search the list for duplicate
218 		 * names and toss an error.  Since we're using the offset
219 		 * method for now, it does not make much of a difference.
220 		 */
221 	}
222 
223 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
224 	entry->vde_segid = mseg->segid;
225 	entry->vde_len = mseg->len;
226 	entry->vde_off = map_offset;
227 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
228 	list_insert_tail(&sc->vmm_devmem_list, entry);
229 
230 	return (0);
231 }
232 
233 static boolean_t
234 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
235     off_t *map_offp)
236 {
237 	list_t *dl = &sc->vmm_devmem_list;
238 	vmm_devmem_entry_t *de = NULL;
239 	const off_t map_end = off + len;
240 
241 	VERIFY(off >= VM_DEVMEM_START);
242 
243 	if (map_end < off) {
244 		/* No match on overflow */
245 		return (B_FALSE);
246 	}
247 
248 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
249 		const off_t item_end = de->vde_off + de->vde_len;
250 
251 		if (de->vde_off <= off && item_end >= map_end) {
252 			*segidp = de->vde_segid;
253 			*map_offp = off - de->vde_off;
254 			return (B_TRUE);
255 		}
256 	}
257 	return (B_FALSE);
258 }
259 
260 /*
261  * When an instance is being destroyed, the devmem list of named memory objects
262  * can be torn down, as no new mappings are allowed.
263  */
264 static void
265 vmmdev_devmem_purge(vmm_softc_t *sc)
266 {
267 	vmm_devmem_entry_t *entry;
268 
269 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
270 		kmem_free(entry, sizeof (*entry));
271 	}
272 }
273 
274 static int
275 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
276 {
277 	int error;
278 	bool sysmem = true;
279 
280 	if (VM_MEMSEG_NAME(mseg)) {
281 		sysmem = false;
282 	}
283 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
284 
285 	if (error == 0) {
286 		/*
287 		 * Rather than create a whole fresh device from which userspace
288 		 * can mmap this segment, instead make it available at an
289 		 * offset above where the main guest memory resides.
290 		 */
291 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
292 		if (error != 0) {
293 			vm_free_memseg(sc->vmm_vm, mseg->segid);
294 		}
295 	}
296 	return (error);
297 }
298 
299 /*
300  * Resource Locking and Exclusion
301  *
302  * Much of bhyve depends on key portions of VM state, such as the guest memory
303  * map, to remain unchanged while the guest is running.  As ported from
304  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
305  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
306  * performing the work of actually running the guest in VMX/SVM, would lock
307  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
308  * state, all of the vCPUs would be first locked, ensuring that the
309  * operation(s) could complete without any other threads stumbling into
310  * intermediate states.
311  *
312  * This approach is largely effective for bhyve.  Common operations, such as
313  * running the vCPUs, steer clear of lock contention.  The model begins to
314  * break down for operations which do not occur in the context of a specific
315  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
316  * thread in the bhyve process.  In order to properly protect those vCPU-less
317  * operations from encountering invalid states, additional locking is required.
318  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
319  * It does mean that class of operations will be serialized on locking the
320  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
321  * undue contention on the VM_MAXCPU-1 vCPU.
322  *
323  * In order to address the shortcomings of this model, the concept of a
324  * read/write lock has been added to bhyve.  Operations which change
325  * fundamental aspects of a VM (such as the memory map) must acquire the write
326  * lock, which also implies locking all of the vCPUs and waiting for all read
327  * lock holders to release.  While it increases the cost and waiting time for
328  * those few operations, it allows most hot-path operations on the VM (which
329  * depend on its configuration remaining stable) to occur with minimal locking.
330  *
331  * Consumers of the Driver API (see below) are a special case when it comes to
332  * this locking, since they may hold a read lock via the drv_lease mechanism
333  * for an extended period of time.  Rather than forcing those consumers to
334  * continuously poll for a write lock attempt, the lease system forces them to
335  * provide a release callback to trigger their clean-up (and potential later
336  * reacquisition) of the read lock.
337  */
338 
339 static void
340 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
341 {
342 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
343 
344 	/*
345 	 * Since this state transition is utilizing from_idle=true, it should
346 	 * not fail, but rather block until it can be successful.
347 	 */
348 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
349 }
350 
351 static void
352 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
353 {
354 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
355 
356 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
357 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
358 }
359 
360 static void
361 vmm_read_lock(vmm_softc_t *sc)
362 {
363 	rw_enter(&sc->vmm_rwlock, RW_READER);
364 }
365 
366 static void
367 vmm_read_unlock(vmm_softc_t *sc)
368 {
369 	rw_exit(&sc->vmm_rwlock);
370 }
371 
372 static void
373 vmm_write_lock(vmm_softc_t *sc)
374 {
375 	int maxcpus;
376 
377 	/* First lock all the vCPUs */
378 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
379 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
380 		vcpu_lock_one(sc, vcpu);
381 	}
382 
383 	/*
384 	 * Block vmm_drv leases from being acquired or held while the VM write
385 	 * lock is held.
386 	 */
387 	vmm_lease_block(sc);
388 
389 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
390 	/*
391 	 * For now, the 'maxcpus' value for an instance is fixed at the
392 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
393 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
394 	 * of the write lock will need to be wary of such changes.
395 	 */
396 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
397 }
398 
399 static void
400 vmm_write_unlock(vmm_softc_t *sc)
401 {
402 	int maxcpus;
403 
404 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
405 	vmm_lease_unblock(sc);
406 
407 	/*
408 	 * The VM write lock _must_ be released from the same thread it was
409 	 * acquired in, unlike the read lock.
410 	 */
411 	VERIFY(rw_write_held(&sc->vmm_rwlock));
412 	rw_exit(&sc->vmm_rwlock);
413 
414 	/* Unlock all the vCPUs */
415 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
416 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
417 		vcpu_unlock_one(sc, vcpu);
418 	}
419 }
420 
421 static int
422 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
423     cred_t *credp, int *rvalp)
424 {
425 	int error = 0, vcpu = -1;
426 	void *datap = (void *)arg;
427 	enum vm_lock_type {
428 		LOCK_NONE = 0,
429 		LOCK_VCPU,
430 		LOCK_READ_HOLD,
431 		LOCK_WRITE_HOLD
432 	} lock_type = LOCK_NONE;
433 
434 	/* Acquire any exclusion resources needed for the operation. */
435 	switch (cmd) {
436 	case VM_RUN:
437 	case VM_GET_REGISTER:
438 	case VM_SET_REGISTER:
439 	case VM_GET_SEGMENT_DESCRIPTOR:
440 	case VM_SET_SEGMENT_DESCRIPTOR:
441 	case VM_GET_REGISTER_SET:
442 	case VM_SET_REGISTER_SET:
443 	case VM_INJECT_EXCEPTION:
444 	case VM_GET_CAPABILITY:
445 	case VM_SET_CAPABILITY:
446 	case VM_PPTDEV_MSI:
447 	case VM_PPTDEV_MSIX:
448 	case VM_SET_X2APIC_STATE:
449 	case VM_GLA2GPA:
450 	case VM_GLA2GPA_NOFAULT:
451 	case VM_ACTIVATE_CPU:
452 	case VM_SET_INTINFO:
453 	case VM_GET_INTINFO:
454 	case VM_RESTART_INSTRUCTION:
455 	case VM_SET_KERNEMU_DEV:
456 	case VM_GET_KERNEMU_DEV:
457 	case VM_RESET_CPU:
458 	case VM_GET_RUN_STATE:
459 	case VM_SET_RUN_STATE:
460 	case VM_GET_FPU:
461 	case VM_SET_FPU:
462 	case VM_GET_CPUID:
463 	case VM_SET_CPUID:
464 	case VM_LEGACY_CPUID:
465 		/*
466 		 * Copy in the ID of the vCPU chosen for this operation.
467 		 * Since a nefarious caller could update their struct between
468 		 * this locking and when the rest of the ioctl data is copied
469 		 * in, it is _critical_ that this local 'vcpu' variable be used
470 		 * rather than the in-struct one when performing the ioctl.
471 		 */
472 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
473 			return (EFAULT);
474 		}
475 		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
476 			return (EINVAL);
477 		}
478 		vcpu_lock_one(sc, vcpu);
479 		lock_type = LOCK_VCPU;
480 		break;
481 
482 	case VM_REINIT:
483 	case VM_BIND_PPTDEV:
484 	case VM_UNBIND_PPTDEV:
485 	case VM_MAP_PPTDEV_MMIO:
486 	case VM_UNMAP_PPTDEV_MMIO:
487 	case VM_ALLOC_MEMSEG:
488 	case VM_MMAP_MEMSEG:
489 	case VM_MUNMAP_MEMSEG:
490 	case VM_WRLOCK_CYCLE:
491 	case VM_PMTMR_LOCATE:
492 	case VM_PAUSE:
493 	case VM_RESUME:
494 		vmm_write_lock(sc);
495 		lock_type = LOCK_WRITE_HOLD;
496 		break;
497 
498 	case VM_GET_MEMSEG:
499 	case VM_MMAP_GETNEXT:
500 	case VM_LAPIC_IRQ:
501 	case VM_INJECT_NMI:
502 	case VM_IOAPIC_ASSERT_IRQ:
503 	case VM_IOAPIC_DEASSERT_IRQ:
504 	case VM_IOAPIC_PULSE_IRQ:
505 	case VM_LAPIC_MSI:
506 	case VM_LAPIC_LOCAL_IRQ:
507 	case VM_GET_X2APIC_STATE:
508 	case VM_RTC_READ:
509 	case VM_RTC_WRITE:
510 	case VM_RTC_SETTIME:
511 	case VM_RTC_GETTIME:
512 	case VM_PPTDEV_DISABLE_MSIX:
513 	case VM_DEVMEM_GETOFFSET:
514 	case VM_TRACK_DIRTY_PAGES:
515 		vmm_read_lock(sc);
516 		lock_type = LOCK_READ_HOLD;
517 		break;
518 
519 	case VM_DATA_READ:
520 	case VM_DATA_WRITE:
521 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
522 			return (EFAULT);
523 		}
524 		if (vcpu == -1) {
525 			/* Access data for VM-wide devices */
526 			vmm_write_lock(sc);
527 			lock_type = LOCK_WRITE_HOLD;
528 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
529 			/* Access data associated with a specific vCPU */
530 			vcpu_lock_one(sc, vcpu);
531 			lock_type = LOCK_VCPU;
532 		} else {
533 			return (EINVAL);
534 		}
535 		break;
536 
537 	case VM_GET_GPA_PMAP:
538 	case VM_IOAPIC_PINCOUNT:
539 	case VM_SUSPEND:
540 	case VM_DESC_FPU_AREA:
541 	case VM_SET_AUTODESTRUCT:
542 	case VM_DESTROY_SELF:
543 	case VM_DESTROY_PENDING:
544 	case VM_VCPU_BARRIER:
545 	default:
546 		break;
547 	}
548 
549 	/* Execute the primary logic for the ioctl. */
550 	switch (cmd) {
551 	case VM_RUN: {
552 		struct vm_entry entry;
553 
554 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
555 			error = EFAULT;
556 			break;
557 		}
558 
559 		if (!(curthread->t_schedflag & TS_VCPU))
560 			smt_mark_as_vcpu();
561 
562 		error = vm_run(sc->vmm_vm, vcpu, &entry);
563 
564 		/*
565 		 * Unexpected states in vm_run() are expressed through positive
566 		 * errno-oriented return values.  VM states which expect further
567 		 * processing in userspace (necessary context via exitinfo) are
568 		 * expressed through negative return values.  For the time being
569 		 * a return value of 0 is not expected from vm_run().
570 		 */
571 		ASSERT(error != 0);
572 		if (error < 0) {
573 			const struct vm_exit *vme;
574 			void *outp = entry.exit_data;
575 
576 			error = 0;
577 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
578 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
579 				error = EFAULT;
580 			}
581 		}
582 		break;
583 	}
584 	case VM_SUSPEND: {
585 		struct vm_suspend vmsuspend;
586 
587 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
588 			error = EFAULT;
589 			break;
590 		}
591 		error = vm_suspend(sc->vmm_vm, vmsuspend.how, vmsuspend.source);
592 		break;
593 	}
594 	case VM_REINIT: {
595 		struct vm_reinit reinit;
596 
597 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
598 			error = EFAULT;
599 			break;
600 		}
601 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
602 			/*
603 			 * The VM instance should be free of driver-attached
604 			 * hooks during the reinitialization process.
605 			 */
606 			break;
607 		}
608 		error = vm_reinit(sc->vmm_vm, reinit.flags);
609 		(void) vmm_drv_block_hook(sc, B_FALSE);
610 		break;
611 	}
612 	case VM_STAT_DESC: {
613 		struct vm_stat_desc statdesc;
614 
615 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
616 			error = EFAULT;
617 			break;
618 		}
619 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
620 		    sizeof (statdesc.desc));
621 		if (error == 0 &&
622 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
623 			error = EFAULT;
624 			break;
625 		}
626 		break;
627 	}
628 	case VM_STATS_IOC: {
629 		struct vm_stats vmstats;
630 
631 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
632 			error = EFAULT;
633 			break;
634 		}
635 		hrt2tv(gethrtime(), &vmstats.tv);
636 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
637 		    nitems(vmstats.statbuf),
638 		    &vmstats.num_entries, vmstats.statbuf);
639 		if (error == 0 &&
640 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
641 			error = EFAULT;
642 			break;
643 		}
644 		break;
645 	}
646 
647 	case VM_PPTDEV_MSI: {
648 		struct vm_pptdev_msi pptmsi;
649 
650 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
651 			error = EFAULT;
652 			break;
653 		}
654 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
655 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
656 		break;
657 	}
658 	case VM_PPTDEV_MSIX: {
659 		struct vm_pptdev_msix pptmsix;
660 
661 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
662 			error = EFAULT;
663 			break;
664 		}
665 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
666 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
667 		    pptmsix.vector_control);
668 		break;
669 	}
670 	case VM_PPTDEV_DISABLE_MSIX: {
671 		struct vm_pptdev pptdev;
672 
673 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
674 			error = EFAULT;
675 			break;
676 		}
677 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
678 		break;
679 	}
680 	case VM_MAP_PPTDEV_MMIO: {
681 		struct vm_pptdev_mmio pptmmio;
682 
683 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
684 			error = EFAULT;
685 			break;
686 		}
687 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
688 		    pptmmio.len, pptmmio.hpa);
689 		break;
690 	}
691 	case VM_UNMAP_PPTDEV_MMIO: {
692 		struct vm_pptdev_mmio pptmmio;
693 
694 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
695 			error = EFAULT;
696 			break;
697 		}
698 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
699 		    pptmmio.len);
700 		break;
701 	}
702 	case VM_BIND_PPTDEV: {
703 		struct vm_pptdev pptdev;
704 
705 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
706 			error = EFAULT;
707 			break;
708 		}
709 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
710 		break;
711 	}
712 	case VM_UNBIND_PPTDEV: {
713 		struct vm_pptdev pptdev;
714 
715 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
716 			error = EFAULT;
717 			break;
718 		}
719 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
720 		break;
721 	}
722 	case VM_GET_PPTDEV_LIMITS: {
723 		struct vm_pptdev_limits pptlimits;
724 
725 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
726 			error = EFAULT;
727 			break;
728 		}
729 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
730 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
731 		if (error == 0 &&
732 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
733 			error = EFAULT;
734 			break;
735 		}
736 		break;
737 	}
738 	case VM_INJECT_EXCEPTION: {
739 		struct vm_exception vmexc;
740 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
741 			error = EFAULT;
742 			break;
743 		}
744 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
745 		    vmexc.error_code_valid != 0, vmexc.error_code,
746 		    vmexc.restart_instruction != 0);
747 		break;
748 	}
749 	case VM_INJECT_NMI: {
750 		struct vm_nmi vmnmi;
751 
752 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
753 			error = EFAULT;
754 			break;
755 		}
756 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
757 		break;
758 	}
759 	case VM_LAPIC_IRQ: {
760 		struct vm_lapic_irq vmirq;
761 
762 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
763 			error = EFAULT;
764 			break;
765 		}
766 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
767 		break;
768 	}
769 	case VM_LAPIC_LOCAL_IRQ: {
770 		struct vm_lapic_irq vmirq;
771 
772 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
773 			error = EFAULT;
774 			break;
775 		}
776 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
777 		    vmirq.vector);
778 		break;
779 	}
780 	case VM_LAPIC_MSI: {
781 		struct vm_lapic_msi vmmsi;
782 
783 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
784 			error = EFAULT;
785 			break;
786 		}
787 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
788 		break;
789 	}
790 
791 	case VM_IOAPIC_ASSERT_IRQ: {
792 		struct vm_ioapic_irq ioapic_irq;
793 
794 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
795 			error = EFAULT;
796 			break;
797 		}
798 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
799 		break;
800 	}
801 	case VM_IOAPIC_DEASSERT_IRQ: {
802 		struct vm_ioapic_irq ioapic_irq;
803 
804 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
805 			error = EFAULT;
806 			break;
807 		}
808 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
809 		break;
810 	}
811 	case VM_IOAPIC_PULSE_IRQ: {
812 		struct vm_ioapic_irq ioapic_irq;
813 
814 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
815 			error = EFAULT;
816 			break;
817 		}
818 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
819 		break;
820 	}
821 	case VM_IOAPIC_PINCOUNT: {
822 		int pincount;
823 
824 		pincount = vioapic_pincount(sc->vmm_vm);
825 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
826 			error = EFAULT;
827 			break;
828 		}
829 		break;
830 	}
831 	case VM_DESC_FPU_AREA: {
832 		struct vm_fpu_desc desc;
833 		void *buf = NULL;
834 
835 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
836 			error = EFAULT;
837 			break;
838 		}
839 		if (desc.vfd_num_entries > 64) {
840 			error = EINVAL;
841 			break;
842 		}
843 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
844 		    desc.vfd_num_entries;
845 		if (buf_sz != 0) {
846 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
847 		}
848 
849 		/*
850 		 * For now, we are depending on vm_fpu_desc_entry and
851 		 * hma_xsave_state_desc_t having the same format.
852 		 */
853 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
854 		    sizeof (hma_xsave_state_desc_t));
855 
856 		size_t req_size;
857 		const uint_t max_entries = hma_fpu_describe_xsave_state(
858 		    (hma_xsave_state_desc_t *)buf,
859 		    desc.vfd_num_entries,
860 		    &req_size);
861 
862 		desc.vfd_req_size = req_size;
863 		desc.vfd_num_entries = max_entries;
864 		if (buf_sz != 0) {
865 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
866 				error = EFAULT;
867 			}
868 			kmem_free(buf, buf_sz);
869 		}
870 
871 		if (error == 0) {
872 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
873 				error = EFAULT;
874 			}
875 		}
876 		break;
877 	}
878 	case VM_SET_AUTODESTRUCT: {
879 		/*
880 		 * Since this has to do with controlling the lifetime of the
881 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
882 		 * than the vcpu-centric or rwlock exclusion mechanisms.
883 		 */
884 		mutex_enter(&vmm_mtx);
885 		if (arg != 0) {
886 			sc->vmm_flags |= VMM_AUTODESTROY;
887 		} else {
888 			sc->vmm_flags &= ~VMM_AUTODESTROY;
889 		}
890 		mutex_exit(&vmm_mtx);
891 		break;
892 	}
893 	case VM_DESTROY_SELF: {
894 		bool hma_release = false;
895 
896 		/*
897 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
898 		 * itself, rather than having to perform a racy name lookup as
899 		 * part of the destroy process.
900 		 *
901 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
902 		 * order to kick the vCPUs out of guest context as part of any
903 		 * destruction, we do not need to worry about it ourself using
904 		 * the `lock_type` logic here.
905 		 */
906 		mutex_enter(&vmm_mtx);
907 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
908 		mutex_exit(&vmm_mtx);
909 		if (hma_release) {
910 			vmm_hma_release();
911 		}
912 		break;
913 	}
914 	case VM_DESTROY_PENDING: {
915 		/*
916 		 * If we have made it this far, then destruction of the instance
917 		 * has not been initiated.
918 		 */
919 		*rvalp = 0;
920 		break;
921 	}
922 
923 	case VM_ISA_ASSERT_IRQ: {
924 		struct vm_isa_irq isa_irq;
925 
926 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
927 			error = EFAULT;
928 			break;
929 		}
930 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
931 		if (error == 0 && isa_irq.ioapic_irq != -1) {
932 			error = vioapic_assert_irq(sc->vmm_vm,
933 			    isa_irq.ioapic_irq);
934 		}
935 		break;
936 	}
937 	case VM_ISA_DEASSERT_IRQ: {
938 		struct vm_isa_irq isa_irq;
939 
940 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
941 			error = EFAULT;
942 			break;
943 		}
944 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
945 		if (error == 0 && isa_irq.ioapic_irq != -1) {
946 			error = vioapic_deassert_irq(sc->vmm_vm,
947 			    isa_irq.ioapic_irq);
948 		}
949 		break;
950 	}
951 	case VM_ISA_PULSE_IRQ: {
952 		struct vm_isa_irq isa_irq;
953 
954 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
955 			error = EFAULT;
956 			break;
957 		}
958 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
959 		if (error == 0 && isa_irq.ioapic_irq != -1) {
960 			error = vioapic_pulse_irq(sc->vmm_vm,
961 			    isa_irq.ioapic_irq);
962 		}
963 		break;
964 	}
965 	case VM_ISA_SET_IRQ_TRIGGER: {
966 		struct vm_isa_irq_trigger isa_irq_trigger;
967 
968 		if (ddi_copyin(datap, &isa_irq_trigger,
969 		    sizeof (isa_irq_trigger), md)) {
970 			error = EFAULT;
971 			break;
972 		}
973 		error = vatpic_set_irq_trigger(sc->vmm_vm,
974 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
975 		break;
976 	}
977 
978 	case VM_MMAP_GETNEXT: {
979 		struct vm_memmap mm;
980 
981 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
982 			error = EFAULT;
983 			break;
984 		}
985 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
986 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
987 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
988 			error = EFAULT;
989 			break;
990 		}
991 		break;
992 	}
993 	case VM_MMAP_MEMSEG: {
994 		struct vm_memmap mm;
995 
996 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
997 			error = EFAULT;
998 			break;
999 		}
1000 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
1001 		    mm.len, mm.prot, mm.flags);
1002 		break;
1003 	}
1004 	case VM_MUNMAP_MEMSEG: {
1005 		struct vm_munmap mu;
1006 
1007 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
1008 			error = EFAULT;
1009 			break;
1010 		}
1011 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1012 		break;
1013 	}
1014 	case VM_ALLOC_MEMSEG: {
1015 		struct vm_memseg vmseg;
1016 
1017 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1018 			error = EFAULT;
1019 			break;
1020 		}
1021 		error = vmmdev_alloc_memseg(sc, &vmseg);
1022 		break;
1023 	}
1024 	case VM_GET_MEMSEG: {
1025 		struct vm_memseg vmseg;
1026 
1027 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1028 			error = EFAULT;
1029 			break;
1030 		}
1031 		error = vmmdev_get_memseg(sc, &vmseg);
1032 		if (error == 0 &&
1033 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1034 			error = EFAULT;
1035 			break;
1036 		}
1037 		break;
1038 	}
1039 	case VM_GET_REGISTER: {
1040 		struct vm_register vmreg;
1041 
1042 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1043 			error = EFAULT;
1044 			break;
1045 		}
1046 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1047 		    &vmreg.regval);
1048 		if (error == 0 &&
1049 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1050 			error = EFAULT;
1051 			break;
1052 		}
1053 		break;
1054 	}
1055 	case VM_SET_REGISTER: {
1056 		struct vm_register vmreg;
1057 
1058 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1059 			error = EFAULT;
1060 			break;
1061 		}
1062 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1063 		    vmreg.regval);
1064 		break;
1065 	}
1066 	case VM_SET_SEGMENT_DESCRIPTOR: {
1067 		struct vm_seg_desc vmsegd;
1068 
1069 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1070 			error = EFAULT;
1071 			break;
1072 		}
1073 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1074 		    &vmsegd.desc);
1075 		break;
1076 	}
1077 	case VM_GET_SEGMENT_DESCRIPTOR: {
1078 		struct vm_seg_desc vmsegd;
1079 
1080 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1081 			error = EFAULT;
1082 			break;
1083 		}
1084 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1085 		    &vmsegd.desc);
1086 		if (error == 0 &&
1087 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1088 			error = EFAULT;
1089 			break;
1090 		}
1091 		break;
1092 	}
1093 	case VM_GET_REGISTER_SET: {
1094 		struct vm_register_set vrs;
1095 		int regnums[VM_REG_LAST];
1096 		uint64_t regvals[VM_REG_LAST];
1097 
1098 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1099 			error = EFAULT;
1100 			break;
1101 		}
1102 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1103 			error = EINVAL;
1104 			break;
1105 		}
1106 		if (ddi_copyin(vrs.regnums, regnums,
1107 		    sizeof (int) * vrs.count, md)) {
1108 			error = EFAULT;
1109 			break;
1110 		}
1111 
1112 		error = 0;
1113 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1114 			if (regnums[i] < 0) {
1115 				error = EINVAL;
1116 				break;
1117 			}
1118 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1119 			    &regvals[i]);
1120 		}
1121 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1122 		    sizeof (uint64_t) * vrs.count, md)) {
1123 			error = EFAULT;
1124 		}
1125 		break;
1126 	}
1127 	case VM_SET_REGISTER_SET: {
1128 		struct vm_register_set vrs;
1129 		int regnums[VM_REG_LAST];
1130 		uint64_t regvals[VM_REG_LAST];
1131 
1132 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1133 			error = EFAULT;
1134 			break;
1135 		}
1136 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1137 			error = EINVAL;
1138 			break;
1139 		}
1140 		if (ddi_copyin(vrs.regnums, regnums,
1141 		    sizeof (int) * vrs.count, md)) {
1142 			error = EFAULT;
1143 			break;
1144 		}
1145 		if (ddi_copyin(vrs.regvals, regvals,
1146 		    sizeof (uint64_t) * vrs.count, md)) {
1147 			error = EFAULT;
1148 			break;
1149 		}
1150 
1151 		error = 0;
1152 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1153 			/*
1154 			 * Setting registers in a set is not atomic, since a
1155 			 * failure in the middle of the set will cause a
1156 			 * bail-out and inconsistent register state.  Callers
1157 			 * should be wary of this.
1158 			 */
1159 			if (regnums[i] < 0) {
1160 				error = EINVAL;
1161 				break;
1162 			}
1163 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1164 			    regvals[i]);
1165 		}
1166 		break;
1167 	}
1168 	case VM_RESET_CPU: {
1169 		struct vm_vcpu_reset vvr;
1170 
1171 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1172 			error = EFAULT;
1173 			break;
1174 		}
1175 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1176 			error = EINVAL;
1177 		}
1178 
1179 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1180 		break;
1181 	}
1182 	case VM_GET_RUN_STATE: {
1183 		struct vm_run_state vrs;
1184 
1185 		bzero(&vrs, sizeof (vrs));
1186 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1187 		    &vrs.sipi_vector);
1188 		if (error == 0) {
1189 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1190 				error = EFAULT;
1191 				break;
1192 			}
1193 		}
1194 		break;
1195 	}
1196 	case VM_SET_RUN_STATE: {
1197 		struct vm_run_state vrs;
1198 
1199 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1200 			error = EFAULT;
1201 			break;
1202 		}
1203 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1204 		    vrs.sipi_vector);
1205 		break;
1206 	}
1207 	case VM_GET_FPU: {
1208 		struct vm_fpu_state req;
1209 		const size_t max_len = (PAGESIZE * 2);
1210 		void *kbuf;
1211 
1212 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1213 			error = EFAULT;
1214 			break;
1215 		}
1216 		if (req.len > max_len || req.len == 0) {
1217 			error = EINVAL;
1218 			break;
1219 		}
1220 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1221 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1222 		if (error == 0) {
1223 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1224 				error = EFAULT;
1225 			}
1226 		}
1227 		kmem_free(kbuf, req.len);
1228 		break;
1229 	}
1230 	case VM_SET_FPU: {
1231 		struct vm_fpu_state req;
1232 		const size_t max_len = (PAGESIZE * 2);
1233 		void *kbuf;
1234 
1235 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1236 			error = EFAULT;
1237 			break;
1238 		}
1239 		if (req.len > max_len || req.len == 0) {
1240 			error = EINVAL;
1241 			break;
1242 		}
1243 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1244 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1245 			error = EFAULT;
1246 		} else {
1247 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1248 		}
1249 		kmem_free(kbuf, req.len);
1250 		break;
1251 	}
1252 	case VM_GET_CPUID: {
1253 		struct vm_vcpu_cpuid_config cfg;
1254 		struct vcpu_cpuid_entry *entries = NULL;
1255 
1256 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1257 			error = EFAULT;
1258 			break;
1259 		}
1260 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1261 			error = EINVAL;
1262 			break;
1263 		}
1264 
1265 		const size_t entries_size =
1266 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1267 		if (entries_size != 0) {
1268 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1269 		}
1270 
1271 		vcpu_cpuid_config_t vm_cfg = {
1272 			.vcc_nent = cfg.vvcc_nent,
1273 			.vcc_entries = entries,
1274 		};
1275 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1276 
1277 		/*
1278 		 * Only attempt to copy out the resultant entries if we were
1279 		 * able to query them from the instance.  The flags and number
1280 		 * of entries are emitted regardless.
1281 		 */
1282 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1283 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1284 		if (entries != NULL) {
1285 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1286 			    entries_size, md) != 0) {
1287 				error = EFAULT;
1288 			}
1289 
1290 			kmem_free(entries, entries_size);
1291 		}
1292 
1293 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1294 			error = EFAULT;
1295 		}
1296 		break;
1297 	}
1298 	case VM_SET_CPUID: {
1299 		struct vm_vcpu_cpuid_config cfg;
1300 		struct vcpu_cpuid_entry *entries = NULL;
1301 		size_t entries_size = 0;
1302 
1303 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1304 			error = EFAULT;
1305 			break;
1306 		}
1307 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1308 			error = EFBIG;
1309 			break;
1310 		}
1311 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1312 			/*
1313 			 * If we are being instructed to use "legacy" handling,
1314 			 * then no entries should be provided, since the static
1315 			 * in-kernel masking will be used.
1316 			 */
1317 			if (cfg.vvcc_nent != 0) {
1318 				error = EINVAL;
1319 				break;
1320 			}
1321 		} else if (cfg.vvcc_nent != 0) {
1322 			entries_size =
1323 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1324 			entries = kmem_alloc(entries_size, KM_SLEEP);
1325 
1326 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1327 			    md) != 0) {
1328 				error = EFAULT;
1329 				kmem_free(entries, entries_size);
1330 				break;
1331 			}
1332 		}
1333 
1334 		vcpu_cpuid_config_t vm_cfg = {
1335 			.vcc_flags = cfg.vvcc_flags,
1336 			.vcc_nent = cfg.vvcc_nent,
1337 			.vcc_entries = entries,
1338 		};
1339 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1340 
1341 		if (entries != NULL) {
1342 			kmem_free(entries, entries_size);
1343 		}
1344 		break;
1345 	}
1346 	case VM_LEGACY_CPUID: {
1347 		struct vm_legacy_cpuid vlc;
1348 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1349 			error = EFAULT;
1350 			break;
1351 		}
1352 		vlc.vlc_vcpuid = vcpu;
1353 
1354 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1355 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1356 
1357 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1358 			error = EFAULT;
1359 			break;
1360 		}
1361 		break;
1362 	}
1363 
1364 	case VM_SET_KERNEMU_DEV:
1365 	case VM_GET_KERNEMU_DEV: {
1366 		struct vm_readwrite_kernemu_device kemu;
1367 		size_t size = 0;
1368 
1369 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1370 			error = EFAULT;
1371 			break;
1372 		}
1373 
1374 		if (kemu.access_width > 3) {
1375 			error = EINVAL;
1376 			break;
1377 		}
1378 		size = (1 << kemu.access_width);
1379 		ASSERT(size >= 1 && size <= 8);
1380 
1381 		if (cmd == VM_SET_KERNEMU_DEV) {
1382 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1383 			    kemu.gpa, kemu.value, size);
1384 		} else {
1385 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1386 			    kemu.gpa, &kemu.value, size);
1387 		}
1388 
1389 		if (error == 0) {
1390 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1391 				error = EFAULT;
1392 				break;
1393 			}
1394 		}
1395 		break;
1396 	}
1397 
1398 	case VM_GET_CAPABILITY: {
1399 		struct vm_capability vmcap;
1400 
1401 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1402 			error = EFAULT;
1403 			break;
1404 		}
1405 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1406 		    &vmcap.capval);
1407 		if (error == 0 &&
1408 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1409 			error = EFAULT;
1410 			break;
1411 		}
1412 		break;
1413 	}
1414 	case VM_SET_CAPABILITY: {
1415 		struct vm_capability vmcap;
1416 
1417 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1418 			error = EFAULT;
1419 			break;
1420 		}
1421 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1422 		    vmcap.capval);
1423 		break;
1424 	}
1425 	case VM_SET_X2APIC_STATE: {
1426 		struct vm_x2apic x2apic;
1427 
1428 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1429 			error = EFAULT;
1430 			break;
1431 		}
1432 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1433 		break;
1434 	}
1435 	case VM_GET_X2APIC_STATE: {
1436 		struct vm_x2apic x2apic;
1437 
1438 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1439 			error = EFAULT;
1440 			break;
1441 		}
1442 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1443 		    &x2apic.state);
1444 		if (error == 0 &&
1445 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1446 			error = EFAULT;
1447 			break;
1448 		}
1449 		break;
1450 	}
1451 	case VM_GET_GPA_PMAP: {
1452 		/*
1453 		 * Until there is a necessity to leak EPT/RVI PTE values to
1454 		 * userspace, this will remain unimplemented
1455 		 */
1456 		error = EINVAL;
1457 		break;
1458 	}
1459 	case VM_GET_HPET_CAPABILITIES: {
1460 		struct vm_hpet_cap hpetcap;
1461 
1462 		error = vhpet_getcap(&hpetcap);
1463 		if (error == 0 &&
1464 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1465 			error = EFAULT;
1466 			break;
1467 		}
1468 		break;
1469 	}
1470 	case VM_GLA2GPA: {
1471 		struct vm_gla2gpa gg;
1472 
1473 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1474 			error = EFAULT;
1475 			break;
1476 		}
1477 		gg.vcpuid = vcpu;
1478 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1479 		    gg.prot, &gg.gpa, &gg.fault);
1480 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1481 			error = EFAULT;
1482 			break;
1483 		}
1484 		break;
1485 	}
1486 	case VM_GLA2GPA_NOFAULT: {
1487 		struct vm_gla2gpa gg;
1488 
1489 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1490 			error = EFAULT;
1491 			break;
1492 		}
1493 		gg.vcpuid = vcpu;
1494 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1495 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1496 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1497 			error = EFAULT;
1498 			break;
1499 		}
1500 		break;
1501 	}
1502 
1503 	case VM_ACTIVATE_CPU:
1504 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1505 		break;
1506 
1507 	case VM_SUSPEND_CPU:
1508 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1509 			error = EFAULT;
1510 		} else {
1511 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1512 		}
1513 		break;
1514 
1515 	case VM_RESUME_CPU:
1516 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1517 			error = EFAULT;
1518 		} else {
1519 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1520 		}
1521 		break;
1522 
1523 	case VM_VCPU_BARRIER:
1524 		vcpu = arg;
1525 		error = vm_vcpu_barrier(sc->vmm_vm, vcpu);
1526 		break;
1527 
1528 	case VM_GET_CPUS: {
1529 		struct vm_cpuset vm_cpuset;
1530 		cpuset_t tempset;
1531 		void *srcp = &tempset;
1532 		int size;
1533 
1534 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1535 			error = EFAULT;
1536 			break;
1537 		}
1538 
1539 		/* Be more generous about sizing since our cpuset_t is large. */
1540 		size = vm_cpuset.cpusetsize;
1541 		if (size <= 0 || size > sizeof (cpuset_t)) {
1542 			error = ERANGE;
1543 		}
1544 		/*
1545 		 * If they want a ulong_t or less, make sure they receive the
1546 		 * low bits with all the useful information.
1547 		 */
1548 		if (size <= sizeof (tempset.cpub[0])) {
1549 			srcp = &tempset.cpub[0];
1550 		}
1551 
1552 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1553 			tempset = vm_active_cpus(sc->vmm_vm);
1554 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1555 			tempset = vm_debug_cpus(sc->vmm_vm);
1556 		} else {
1557 			error = EINVAL;
1558 		}
1559 
1560 		ASSERT(size > 0 && size <= sizeof (tempset));
1561 		if (error == 0 &&
1562 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1563 			error = EFAULT;
1564 			break;
1565 		}
1566 		break;
1567 	}
1568 	case VM_SET_INTINFO: {
1569 		struct vm_intinfo vmii;
1570 
1571 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1572 			error = EFAULT;
1573 			break;
1574 		}
1575 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1576 		break;
1577 	}
1578 	case VM_GET_INTINFO: {
1579 		struct vm_intinfo vmii;
1580 
1581 		vmii.vcpuid = vcpu;
1582 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1583 		    &vmii.info2);
1584 		if (error == 0 &&
1585 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1586 			error = EFAULT;
1587 			break;
1588 		}
1589 		break;
1590 	}
1591 	case VM_RTC_WRITE: {
1592 		struct vm_rtc_data rtcdata;
1593 
1594 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1595 			error = EFAULT;
1596 			break;
1597 		}
1598 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1599 		    rtcdata.value);
1600 		break;
1601 	}
1602 	case VM_RTC_READ: {
1603 		struct vm_rtc_data rtcdata;
1604 
1605 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1606 			error = EFAULT;
1607 			break;
1608 		}
1609 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1610 		    &rtcdata.value);
1611 		if (error == 0 &&
1612 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1613 			error = EFAULT;
1614 			break;
1615 		}
1616 		break;
1617 	}
1618 	case VM_RTC_SETTIME: {
1619 		timespec_t ts;
1620 
1621 		if (ddi_copyin(datap, &ts, sizeof (ts), md)) {
1622 			error = EFAULT;
1623 			break;
1624 		}
1625 		error = vrtc_set_time(sc->vmm_vm, &ts);
1626 		break;
1627 	}
1628 	case VM_RTC_GETTIME: {
1629 		timespec_t ts;
1630 
1631 		vrtc_get_time(sc->vmm_vm, &ts);
1632 		if (ddi_copyout(&ts, datap, sizeof (ts), md)) {
1633 			error = EFAULT;
1634 			break;
1635 		}
1636 		break;
1637 	}
1638 
1639 	case VM_PMTMR_LOCATE: {
1640 		uint16_t port = arg;
1641 		error = vpmtmr_set_location(sc->vmm_vm, port);
1642 		break;
1643 	}
1644 
1645 	case VM_RESTART_INSTRUCTION:
1646 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1647 		break;
1648 
1649 	case VM_SET_TOPOLOGY: {
1650 		struct vm_cpu_topology topo;
1651 
1652 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1653 			error = EFAULT;
1654 			break;
1655 		}
1656 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1657 		    topo.threads, topo.maxcpus);
1658 		break;
1659 	}
1660 	case VM_GET_TOPOLOGY: {
1661 		struct vm_cpu_topology topo;
1662 
1663 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1664 		    &topo.threads, &topo.maxcpus);
1665 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1666 			error = EFAULT;
1667 			break;
1668 		}
1669 		break;
1670 	}
1671 	case VM_DEVMEM_GETOFFSET: {
1672 		struct vm_devmem_offset vdo;
1673 		vmm_devmem_entry_t *de;
1674 
1675 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1676 			error = EFAULT;
1677 			break;
1678 		}
1679 
1680 		de = vmmdev_devmem_find(sc, vdo.segid);
1681 		if (de != NULL) {
1682 			vdo.offset = de->vde_off;
1683 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1684 				error = EFAULT;
1685 			}
1686 		} else {
1687 			error = ENOENT;
1688 		}
1689 		break;
1690 	}
1691 	case VM_TRACK_DIRTY_PAGES: {
1692 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1693 		struct vmm_dirty_tracker tracker;
1694 		uint8_t *bitmap;
1695 		size_t len;
1696 
1697 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1698 			error = EFAULT;
1699 			break;
1700 		}
1701 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1702 			error = EINVAL;
1703 			break;
1704 		}
1705 		if (tracker.vdt_len == 0) {
1706 			break;
1707 		}
1708 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1709 			error = EINVAL;
1710 			break;
1711 		}
1712 		if (tracker.vdt_len > max_track_region_len) {
1713 			error = EINVAL;
1714 			break;
1715 		}
1716 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1717 		bitmap = kmem_zalloc(len, KM_SLEEP);
1718 		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1719 		    tracker.vdt_len, bitmap);
1720 		if (error == 0 &&
1721 		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1722 			error = EFAULT;
1723 		}
1724 		kmem_free(bitmap, len);
1725 
1726 		break;
1727 	}
1728 	case VM_WRLOCK_CYCLE: {
1729 		/*
1730 		 * Present a test mechanism to acquire/release the write lock
1731 		 * on the VM without any other effects.
1732 		 */
1733 		break;
1734 	}
1735 	case VM_DATA_READ: {
1736 		struct vm_data_xfer vdx;
1737 
1738 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1739 			error = EFAULT;
1740 			break;
1741 		}
1742 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1743 			error = EINVAL;
1744 			break;
1745 		}
1746 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1747 			error = EFBIG;
1748 			break;
1749 		}
1750 
1751 		const size_t len = vdx.vdx_len;
1752 		void *buf = NULL;
1753 		if (len != 0) {
1754 			const void *udata = vdx.vdx_data;
1755 
1756 			buf = kmem_alloc(len, KM_SLEEP);
1757 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1758 				bzero(buf, len);
1759 			} else if (ddi_copyin(udata, buf, len, md) != 0) {
1760 				kmem_free(buf, len);
1761 				error = EFAULT;
1762 				break;
1763 			}
1764 		}
1765 
1766 		vdx.vdx_result_len = 0;
1767 		vmm_data_req_t req = {
1768 			.vdr_class = vdx.vdx_class,
1769 			.vdr_version = vdx.vdx_version,
1770 			.vdr_flags = vdx.vdx_flags,
1771 			.vdr_len = len,
1772 			.vdr_data = buf,
1773 			.vdr_result_len = &vdx.vdx_result_len,
1774 		};
1775 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1776 
1777 		if (error == 0 && buf != NULL) {
1778 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1779 				error = EFAULT;
1780 			}
1781 		}
1782 
1783 		/*
1784 		 * Copy out the transfer request so that the value of
1785 		 * vdx_result_len can be made available, regardless of any
1786 		 * error(s) which may have occurred.
1787 		 */
1788 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1789 			error = (error != 0) ? error : EFAULT;
1790 		}
1791 
1792 		if (buf != NULL) {
1793 			kmem_free(buf, len);
1794 		}
1795 		break;
1796 	}
1797 	case VM_DATA_WRITE: {
1798 		struct vm_data_xfer vdx;
1799 
1800 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1801 			error = EFAULT;
1802 			break;
1803 		}
1804 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1805 			error = EINVAL;
1806 			break;
1807 		}
1808 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1809 			error = EFBIG;
1810 			break;
1811 		}
1812 
1813 		const size_t len = vdx.vdx_len;
1814 		void *buf = NULL;
1815 		if (len != 0) {
1816 			buf = kmem_alloc(len, KM_SLEEP);
1817 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1818 				kmem_free(buf, len);
1819 				error = EFAULT;
1820 				break;
1821 			}
1822 		}
1823 
1824 		vdx.vdx_result_len = 0;
1825 		vmm_data_req_t req = {
1826 			.vdr_class = vdx.vdx_class,
1827 			.vdr_version = vdx.vdx_version,
1828 			.vdr_flags = vdx.vdx_flags,
1829 			.vdr_len = len,
1830 			.vdr_data = buf,
1831 			.vdr_result_len = &vdx.vdx_result_len,
1832 		};
1833 		if (vmm_allow_state_writes != 0) {
1834 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1835 			    &req);
1836 		} else {
1837 			/*
1838 			 * Reject the write if somone has thrown the switch back
1839 			 * into the "disallow" position.
1840 			 */
1841 			error = EPERM;
1842 		}
1843 
1844 		if (error == 0 && buf != NULL &&
1845 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1846 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1847 				error = EFAULT;
1848 			}
1849 		}
1850 
1851 		/*
1852 		 * Copy out the transfer request so that the value of
1853 		 * vdx_result_len can be made available, regardless of any
1854 		 * error(s) which may have occurred.
1855 		 */
1856 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1857 			error = (error != 0) ? error : EFAULT;
1858 		}
1859 
1860 		if (buf != NULL) {
1861 			kmem_free(buf, len);
1862 		}
1863 		break;
1864 	}
1865 
1866 	case VM_PAUSE: {
1867 		error = vm_pause_instance(sc->vmm_vm);
1868 		break;
1869 	}
1870 	case VM_RESUME: {
1871 		error = vm_resume_instance(sc->vmm_vm);
1872 		break;
1873 	}
1874 
1875 	default:
1876 		error = ENOTTY;
1877 		break;
1878 	}
1879 
1880 	/* Release exclusion resources */
1881 	switch (lock_type) {
1882 	case LOCK_NONE:
1883 		break;
1884 	case LOCK_VCPU:
1885 		vcpu_unlock_one(sc, vcpu);
1886 		break;
1887 	case LOCK_READ_HOLD:
1888 		vmm_read_unlock(sc);
1889 		break;
1890 	case LOCK_WRITE_HOLD:
1891 		vmm_write_unlock(sc);
1892 		break;
1893 	default:
1894 		panic("unexpected lock type");
1895 		break;
1896 	}
1897 
1898 	return (error);
1899 }
1900 
1901 static vmm_softc_t *
1902 vmm_lookup(const char *name)
1903 {
1904 	list_t *vml = &vmm_list;
1905 	vmm_softc_t *sc;
1906 
1907 	ASSERT(MUTEX_HELD(&vmm_mtx));
1908 
1909 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1910 		if (strcmp(sc->vmm_name, name) == 0) {
1911 			break;
1912 		}
1913 	}
1914 
1915 	return (sc);
1916 }
1917 
1918 /*
1919  * Acquire an HMA registration if not already held.
1920  */
1921 static boolean_t
1922 vmm_hma_acquire(void)
1923 {
1924 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1925 
1926 	mutex_enter(&vmmdev_mtx);
1927 
1928 	if (vmmdev_hma_reg == NULL) {
1929 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1930 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1931 		if (vmmdev_hma_reg == NULL) {
1932 			cmn_err(CE_WARN, "%s HMA registration failed.",
1933 			    vmmdev_hvm_name);
1934 			mutex_exit(&vmmdev_mtx);
1935 			return (B_FALSE);
1936 		}
1937 	}
1938 
1939 	vmmdev_hma_ref++;
1940 
1941 	mutex_exit(&vmmdev_mtx);
1942 
1943 	return (B_TRUE);
1944 }
1945 
1946 /*
1947  * Release the HMA registration if held and there are no remaining VMs.
1948  */
1949 static void
1950 vmm_hma_release(void)
1951 {
1952 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1953 
1954 	mutex_enter(&vmmdev_mtx);
1955 
1956 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1957 
1958 	vmmdev_hma_ref--;
1959 
1960 	if (vmmdev_hma_ref == 0) {
1961 		VERIFY(vmmdev_hma_reg != NULL);
1962 		hma_unregister(vmmdev_hma_reg);
1963 		vmmdev_hma_reg = NULL;
1964 	}
1965 	mutex_exit(&vmmdev_mtx);
1966 }
1967 
1968 static int
1969 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1970 {
1971 	vmm_softc_t	*sc = NULL;
1972 	minor_t		minor;
1973 	int		error = ENOMEM;
1974 	size_t		len;
1975 	const char	*name = req->name;
1976 
1977 	len = strnlen(name, VM_MAX_NAMELEN);
1978 	if (len == 0) {
1979 		return (EINVAL);
1980 	}
1981 	if (len >= VM_MAX_NAMELEN) {
1982 		return (ENAMETOOLONG);
1983 	}
1984 	if (strchr(name, '/') != NULL) {
1985 		return (EINVAL);
1986 	}
1987 
1988 	if (!vmm_hma_acquire())
1989 		return (ENXIO);
1990 
1991 	mutex_enter(&vmm_mtx);
1992 
1993 	/* Look for duplicate names */
1994 	if (vmm_lookup(name) != NULL) {
1995 		mutex_exit(&vmm_mtx);
1996 		vmm_hma_release();
1997 		return (EEXIST);
1998 	}
1999 
2000 	/* Allow only one instance per non-global zone. */
2001 	if (!INGLOBALZONE(curproc)) {
2002 		for (sc = list_head(&vmm_list); sc != NULL;
2003 		    sc = list_next(&vmm_list, sc)) {
2004 			if (sc->vmm_zone == curzone) {
2005 				mutex_exit(&vmm_mtx);
2006 				vmm_hma_release();
2007 				return (EINVAL);
2008 			}
2009 		}
2010 	}
2011 
2012 	minor = id_alloc(vmm_minors);
2013 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
2014 		goto fail;
2015 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2016 		ddi_soft_state_free(vmm_statep, minor);
2017 		goto fail;
2018 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2019 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
2020 		goto fail;
2021 	}
2022 
2023 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2024 		goto fail;
2025 	}
2026 
2027 	error = vm_create(req->flags, &sc->vmm_vm);
2028 	if (error == 0) {
2029 		/* Complete VM intialization and report success. */
2030 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2031 		sc->vmm_minor = minor;
2032 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2033 		    offsetof(vmm_devmem_entry_t, vde_node));
2034 
2035 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2036 		    offsetof(vmm_hold_t, vmh_node));
2037 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2038 
2039 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2040 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2041 		    offsetof(vmm_lease_t, vml_node));
2042 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2043 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2044 
2045 		sc->vmm_zone = crgetzone(cr);
2046 		zone_hold(sc->vmm_zone);
2047 		vmm_zsd_add_vm(sc);
2048 		vmm_kstat_init(sc);
2049 
2050 		list_insert_tail(&vmm_list, sc);
2051 		mutex_exit(&vmm_mtx);
2052 		return (0);
2053 	}
2054 
2055 	vmm_kstat_fini(sc);
2056 	ddi_remove_minor_node(vmmdev_dip, name);
2057 fail:
2058 	id_free(vmm_minors, minor);
2059 	if (sc != NULL) {
2060 		ddi_soft_state_free(vmm_statep, minor);
2061 	}
2062 	mutex_exit(&vmm_mtx);
2063 	vmm_hma_release();
2064 
2065 	return (error);
2066 }
2067 
2068 /*
2069  * Bhyve 'Driver' Interface
2070  *
2071  * While many devices are emulated in the bhyve userspace process, there are
2072  * others with performance constraints which require that they run mostly or
2073  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2074  * needed so they can query/manipulate the portions of VM state needed to
2075  * fulfill their purpose.
2076  *
2077  * This includes:
2078  * - Translating guest-physical addresses to host-virtual pointers
2079  * - Injecting MSIs
2080  * - Hooking IO port addresses
2081  *
2082  * The vmm_drv interface exists to provide that functionality to its consumers.
2083  * (At this time, 'viona' is the only user)
2084  */
2085 int
2086 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2087 {
2088 	vnode_t *vp = fp->f_vnode;
2089 	const dev_t dev = vp->v_rdev;
2090 	vmm_softc_t *sc;
2091 	vmm_hold_t *hold;
2092 	int err = 0;
2093 
2094 	if (vp->v_type != VCHR) {
2095 		return (ENXIO);
2096 	}
2097 	const major_t major = getmajor(dev);
2098 	const minor_t minor = getminor(dev);
2099 
2100 	mutex_enter(&vmmdev_mtx);
2101 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2102 		mutex_exit(&vmmdev_mtx);
2103 		return (ENOENT);
2104 	}
2105 	mutex_enter(&vmm_mtx);
2106 	mutex_exit(&vmmdev_mtx);
2107 
2108 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2109 		err = ENOENT;
2110 		goto out;
2111 	}
2112 	/* XXXJOY: check cred permissions against instance */
2113 
2114 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2115 		err = EBUSY;
2116 		goto out;
2117 	}
2118 
2119 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2120 	hold->vmh_sc = sc;
2121 	hold->vmh_release_req = B_FALSE;
2122 
2123 	list_insert_tail(&sc->vmm_holds, hold);
2124 	sc->vmm_flags |= VMM_HELD;
2125 	*holdp = hold;
2126 
2127 out:
2128 	mutex_exit(&vmm_mtx);
2129 	return (err);
2130 }
2131 
2132 void
2133 vmm_drv_rele(vmm_hold_t *hold)
2134 {
2135 	vmm_softc_t *sc;
2136 	bool hma_release = false;
2137 
2138 	ASSERT(hold != NULL);
2139 	ASSERT(hold->vmh_sc != NULL);
2140 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2141 
2142 	mutex_enter(&vmm_mtx);
2143 	sc = hold->vmh_sc;
2144 	list_remove(&sc->vmm_holds, hold);
2145 	kmem_free(hold, sizeof (*hold));
2146 
2147 	if (list_is_empty(&sc->vmm_holds)) {
2148 		sc->vmm_flags &= ~VMM_HELD;
2149 
2150 		/*
2151 		 * Since outstanding holds would prevent instance destruction
2152 		 * from completing, attempt to finish it now if it was already
2153 		 * set in motion.
2154 		 */
2155 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2156 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2157 			    &hma_release));
2158 		}
2159 	}
2160 	mutex_exit(&vmm_mtx);
2161 
2162 	if (hma_release) {
2163 		vmm_hma_release();
2164 	}
2165 }
2166 
2167 boolean_t
2168 vmm_drv_release_reqd(vmm_hold_t *hold)
2169 {
2170 	ASSERT(hold != NULL);
2171 
2172 	return (hold->vmh_release_req);
2173 }
2174 
2175 vmm_lease_t *
2176 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2177 {
2178 	vmm_softc_t *sc = hold->vmh_sc;
2179 	vmm_lease_t *lease;
2180 
2181 	ASSERT3P(expiref, !=, NULL);
2182 
2183 	if (hold->vmh_release_req) {
2184 		return (NULL);
2185 	}
2186 
2187 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2188 	list_link_init(&lease->vml_node);
2189 	lease->vml_expire_func = expiref;
2190 	lease->vml_expire_arg = arg;
2191 	lease->vml_expired = B_FALSE;
2192 	lease->vml_break_deferred = B_FALSE;
2193 	lease->vml_hold = hold;
2194 	/* cache the VM pointer for one less pointer chase */
2195 	lease->vml_vm = sc->vmm_vm;
2196 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2197 
2198 	mutex_enter(&sc->vmm_lease_lock);
2199 	while (sc->vmm_lease_blocker != 0) {
2200 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2201 	}
2202 	list_insert_tail(&sc->vmm_lease_list, lease);
2203 	vmm_read_lock(sc);
2204 	mutex_exit(&sc->vmm_lease_lock);
2205 
2206 	return (lease);
2207 }
2208 
2209 static void
2210 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2211 {
2212 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2213 
2214 	list_remove(&sc->vmm_lease_list, lease);
2215 	vmm_read_unlock(sc);
2216 	vmc_destroy(lease->vml_vmclient);
2217 	kmem_free(lease, sizeof (*lease));
2218 }
2219 
2220 static void
2221 vmm_lease_block(vmm_softc_t *sc)
2222 {
2223 	mutex_enter(&sc->vmm_lease_lock);
2224 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2225 	sc->vmm_lease_blocker++;
2226 	if (sc->vmm_lease_blocker == 1) {
2227 		list_t *list = &sc->vmm_lease_list;
2228 		vmm_lease_t *lease = list_head(list);
2229 
2230 		while (lease != NULL) {
2231 			void *arg = lease->vml_expire_arg;
2232 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2233 			boolean_t sync_break = B_FALSE;
2234 
2235 			/*
2236 			 * Since the lease expiration notification may
2237 			 * need to take locks which would deadlock with
2238 			 * vmm_lease_lock, drop it across the call.
2239 			 *
2240 			 * We are the only one allowed to manipulate
2241 			 * vmm_lease_list right now, so it is safe to
2242 			 * continue iterating through it after
2243 			 * reacquiring the lock.
2244 			 */
2245 			lease->vml_expired = B_TRUE;
2246 			mutex_exit(&sc->vmm_lease_lock);
2247 			sync_break = expiref(arg);
2248 			mutex_enter(&sc->vmm_lease_lock);
2249 
2250 			if (sync_break) {
2251 				vmm_lease_t *next;
2252 
2253 				/*
2254 				 * These leases which are synchronously broken
2255 				 * result in vmm_read_unlock() calls from a
2256 				 * different thread than the corresponding
2257 				 * vmm_read_lock().  This is acceptable, given
2258 				 * that the rwlock underpinning the whole
2259 				 * mechanism tolerates the behavior.  This
2260 				 * flexibility is _only_ afforded to VM read
2261 				 * lock (RW_READER) holders.
2262 				 */
2263 				next = list_next(list, lease);
2264 				vmm_lease_break_locked(sc, lease);
2265 				lease = next;
2266 			} else {
2267 				lease = list_next(list, lease);
2268 			}
2269 		}
2270 
2271 		/* Process leases which were not broken synchronously. */
2272 		while (!list_is_empty(list)) {
2273 			/*
2274 			 * Although the nested loops are quadratic, the number
2275 			 * of leases is small.
2276 			 */
2277 			lease = list_head(list);
2278 			while (lease != NULL) {
2279 				vmm_lease_t *next = list_next(list, lease);
2280 				if (lease->vml_break_deferred) {
2281 					vmm_lease_break_locked(sc, lease);
2282 				}
2283 				lease = next;
2284 			}
2285 			if (list_is_empty(list)) {
2286 				break;
2287 			}
2288 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2289 		}
2290 		/* Wake anyone else waiting for the lease list to be empty  */
2291 		cv_broadcast(&sc->vmm_lease_cv);
2292 	} else {
2293 		list_t *list = &sc->vmm_lease_list;
2294 
2295 		/*
2296 		 * Some other thread beat us to the duty of lease cleanup.
2297 		 * Wait until that is complete.
2298 		 */
2299 		while (!list_is_empty(list)) {
2300 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2301 		}
2302 	}
2303 	mutex_exit(&sc->vmm_lease_lock);
2304 }
2305 
2306 static void
2307 vmm_lease_unblock(vmm_softc_t *sc)
2308 {
2309 	mutex_enter(&sc->vmm_lease_lock);
2310 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2311 	sc->vmm_lease_blocker--;
2312 	if (sc->vmm_lease_blocker == 0) {
2313 		cv_broadcast(&sc->vmm_lease_cv);
2314 	}
2315 	mutex_exit(&sc->vmm_lease_lock);
2316 }
2317 
2318 void
2319 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2320 {
2321 	vmm_softc_t *sc = hold->vmh_sc;
2322 
2323 	VERIFY3P(hold, ==, lease->vml_hold);
2324 	VERIFY(!lease->vml_break_deferred);
2325 
2326 	mutex_enter(&sc->vmm_lease_lock);
2327 	if (sc->vmm_lease_blocker == 0) {
2328 		vmm_lease_break_locked(sc, lease);
2329 	} else {
2330 		/*
2331 		 * Defer the lease-breaking to whichever thread is currently
2332 		 * cleaning up all leases as part of a vmm_lease_block() call.
2333 		 */
2334 		lease->vml_break_deferred = B_TRUE;
2335 		cv_broadcast(&sc->vmm_lease_cv);
2336 	}
2337 	mutex_exit(&sc->vmm_lease_lock);
2338 }
2339 
2340 boolean_t
2341 vmm_drv_lease_expired(vmm_lease_t *lease)
2342 {
2343 	return (lease->vml_expired);
2344 }
2345 
2346 vmm_page_t *
2347 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2348 {
2349 	ASSERT(lease != NULL);
2350 	ASSERT0(gpa & PAGEOFFSET);
2351 
2352 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2353 }
2354 
2355 
2356 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2357 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2358 
2359 vmm_page_t *
2360 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2361 {
2362 	ASSERT(lease != NULL);
2363 	ASSERT0(gpa & PAGEOFFSET);
2364 
2365 	vmm_page_t *page =
2366 	    (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2367 	return (page);
2368 }
2369 
2370 void
2371 vmm_drv_page_release(vmm_page_t *vmmp)
2372 {
2373 	(void) vmp_release((vm_page_t *)vmmp);
2374 }
2375 
2376 void
2377 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2378 {
2379 	(void) vmp_release_chain((vm_page_t *)vmmp);
2380 }
2381 
2382 const void *
2383 vmm_drv_page_readable(const vmm_page_t *vmmp)
2384 {
2385 	return (vmp_get_readable((const vm_page_t *)vmmp));
2386 }
2387 
2388 void *
2389 vmm_drv_page_writable(const vmm_page_t *vmmp)
2390 {
2391 	return (vmp_get_writable((const vm_page_t *)vmmp));
2392 }
2393 
2394 void
2395 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2396 {
2397 	return (vmp_mark_dirty((vm_page_t *)vmmp));
2398 }
2399 
2400 void
2401 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2402 {
2403 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2404 }
2405 
2406 vmm_page_t *
2407 vmm_drv_page_next(const vmm_page_t *vmmp)
2408 {
2409 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2410 }
2411 
2412 int
2413 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2414 {
2415 	ASSERT(lease != NULL);
2416 
2417 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2418 }
2419 
2420 int
2421 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2422     void *arg, void **cookie)
2423 {
2424 	vmm_softc_t *sc;
2425 	int err;
2426 
2427 	ASSERT(hold != NULL);
2428 	ASSERT(cookie != NULL);
2429 
2430 	sc = hold->vmh_sc;
2431 	mutex_enter(&vmm_mtx);
2432 	/* Confirm that hook installation is not blocked */
2433 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2434 		mutex_exit(&vmm_mtx);
2435 		return (EBUSY);
2436 	}
2437 	/*
2438 	 * Optimistically record an installed hook which will prevent a block
2439 	 * from being asserted while the mutex is dropped.
2440 	 */
2441 	hold->vmh_ioport_hook_cnt++;
2442 	mutex_exit(&vmm_mtx);
2443 
2444 	vmm_write_lock(sc);
2445 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2446 	    arg, cookie);
2447 	vmm_write_unlock(sc);
2448 
2449 	if (err != 0) {
2450 		mutex_enter(&vmm_mtx);
2451 		/* Walk back optimism about the hook installation */
2452 		hold->vmh_ioport_hook_cnt--;
2453 		mutex_exit(&vmm_mtx);
2454 	}
2455 	return (err);
2456 }
2457 
2458 void
2459 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2460 {
2461 	vmm_softc_t *sc;
2462 
2463 	ASSERT(hold != NULL);
2464 	ASSERT(cookie != NULL);
2465 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2466 
2467 	sc = hold->vmh_sc;
2468 	vmm_write_lock(sc);
2469 	vm_ioport_unhook(sc->vmm_vm, cookie);
2470 	vmm_write_unlock(sc);
2471 
2472 	mutex_enter(&vmm_mtx);
2473 	hold->vmh_ioport_hook_cnt--;
2474 	mutex_exit(&vmm_mtx);
2475 }
2476 
2477 static void
2478 vmm_drv_purge(vmm_softc_t *sc)
2479 {
2480 	ASSERT(MUTEX_HELD(&vmm_mtx));
2481 
2482 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2483 		vmm_hold_t *hold;
2484 
2485 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2486 		    hold = list_next(&sc->vmm_holds, hold)) {
2487 			hold->vmh_release_req = B_TRUE;
2488 		}
2489 
2490 		/*
2491 		 * Require that all leases on the instance be broken, now that
2492 		 * all associated holds have been marked as needing release.
2493 		 *
2494 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2495 		 * lessees are slow to respond, it would be nice to leave it
2496 		 * available for other parties.
2497 		 */
2498 		mutex_exit(&vmm_mtx);
2499 		vmm_lease_block(sc);
2500 		vmm_lease_unblock(sc);
2501 		mutex_enter(&vmm_mtx);
2502 	}
2503 }
2504 
2505 static int
2506 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2507 {
2508 	int err = 0;
2509 
2510 	mutex_enter(&vmm_mtx);
2511 	if (!enable_block) {
2512 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2513 
2514 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2515 		goto done;
2516 	}
2517 
2518 	/* If any holds have hooks installed, the block is a failure */
2519 	if (!list_is_empty(&sc->vmm_holds)) {
2520 		vmm_hold_t *hold;
2521 
2522 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2523 		    hold = list_next(&sc->vmm_holds, hold)) {
2524 			if (hold->vmh_ioport_hook_cnt != 0) {
2525 				err = EBUSY;
2526 				goto done;
2527 			}
2528 		}
2529 	}
2530 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2531 
2532 done:
2533 	mutex_exit(&vmm_mtx);
2534 	return (err);
2535 }
2536 
2537 
2538 static void
2539 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2540 {
2541 	ASSERT(MUTEX_HELD(&vmm_mtx));
2542 	ASSERT0(sc->vmm_flags & VMM_DESTROY);
2543 
2544 	sc->vmm_flags |= VMM_DESTROY;
2545 
2546 	/*
2547 	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2548 	 * of guest context, being unable to return now that the instance is
2549 	 * marked for destruction.
2550 	 */
2551 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2552 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2553 		vcpu_lock_one(sc, vcpu);
2554 		vcpu_unlock_one(sc, vcpu);
2555 	}
2556 
2557 	vmmdev_devmem_purge(sc);
2558 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2559 		/*
2560 		 * The ZSD should be cleaned up now, unless destruction of the
2561 		 * instance was initated by destruction of the containing zone,
2562 		 * in which case the ZSD has already been removed.
2563 		 */
2564 		vmm_zsd_rem_vm(sc);
2565 	}
2566 	zone_rele(sc->vmm_zone);
2567 
2568 	vmm_drv_purge(sc);
2569 }
2570 
2571 static bool
2572 vmm_destroy_ready(vmm_softc_t *sc)
2573 {
2574 	ASSERT(MUTEX_HELD(&vmm_mtx));
2575 
2576 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2577 		VERIFY(list_is_empty(&sc->vmm_holds));
2578 		return (true);
2579 	}
2580 
2581 	return (false);
2582 }
2583 
2584 static void
2585 vmm_destroy_finish(vmm_softc_t *sc)
2586 {
2587 	ASSERT(MUTEX_HELD(&vmm_mtx));
2588 	ASSERT(vmm_destroy_ready(sc));
2589 
2590 	list_remove(&vmm_list, sc);
2591 	vmm_kstat_fini(sc);
2592 	vm_destroy(sc->vmm_vm);
2593 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2594 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2595 
2596 	const minor_t minor = sc->vmm_minor;
2597 	ddi_soft_state_free(vmm_statep, minor);
2598 	id_free(vmm_minors, minor);
2599 }
2600 
2601 /*
2602  * Initiate or attempt to finish destruction of a VMM instance.
2603  *
2604  * This is called from several contexts:
2605  * - An explicit destroy ioctl is made
2606  * - A vmm_drv consumer releases its hold (being the last on the instance)
2607  * - The vmm device is closed, and auto-destruct is enabled
2608  */
2609 static int
2610 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2611     bool *hma_release)
2612 {
2613 	ASSERT(MUTEX_HELD(&vmm_mtx));
2614 
2615 	*hma_release = false;
2616 
2617 	/*
2618 	 * When instance destruction begins, it is so marked such that any
2619 	 * further requests to operate the instance will fail.
2620 	 */
2621 	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2622 		vmm_destroy_begin(sc, opts);
2623 	}
2624 
2625 	if (vmm_destroy_ready(sc)) {
2626 
2627 		/*
2628 		 * Notify anyone waiting for the destruction to finish.  They
2629 		 * must be clear before we can safely tear down the softc.
2630 		 */
2631 		if (sc->vmm_destroy_waiters != 0) {
2632 			cv_broadcast(&sc->vmm_cv);
2633 			while (sc->vmm_destroy_waiters != 0) {
2634 				cv_wait(&sc->vmm_cv, &vmm_mtx);
2635 			}
2636 		}
2637 
2638 		/*
2639 		 * Finish destruction of instance.  After this point, the softc
2640 		 * is freed and cannot be accessed again.
2641 		 *
2642 		 * With destruction complete, the HMA hold can be released
2643 		 */
2644 		vmm_destroy_finish(sc);
2645 		*hma_release = true;
2646 		return (0);
2647 	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2648 		int err = 0;
2649 
2650 		sc->vmm_destroy_waiters++;
2651 		while (!vmm_destroy_ready(sc) && err == 0) {
2652 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2653 				err = EINTR;
2654 			}
2655 		}
2656 		sc->vmm_destroy_waiters--;
2657 
2658 		if (sc->vmm_destroy_waiters == 0) {
2659 			/*
2660 			 * If we were the last waiter, it could be that VM
2661 			 * destruction is waiting on _us_ to proceed with the
2662 			 * final clean-up.
2663 			 */
2664 			cv_signal(&sc->vmm_cv);
2665 		}
2666 		return (err);
2667 	} else {
2668 		/*
2669 		 * Since the instance is not ready for destruction, and the
2670 		 * caller did not ask to wait, consider it a success for now.
2671 		 */
2672 		return (0);
2673 	}
2674 }
2675 
2676 void
2677 vmm_zone_vm_destroy(vmm_softc_t *sc)
2678 {
2679 	bool hma_release = false;
2680 	int err;
2681 
2682 	mutex_enter(&vmm_mtx);
2683 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2684 	mutex_exit(&vmm_mtx);
2685 
2686 	VERIFY0(err);
2687 
2688 	if (hma_release) {
2689 		vmm_hma_release();
2690 	}
2691 }
2692 
2693 static int
2694 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2695 {
2696 	vmm_softc_t *sc;
2697 	bool hma_release = false;
2698 	int err;
2699 
2700 	if (crgetuid(cr) != 0) {
2701 		return (EPERM);
2702 	}
2703 
2704 	mutex_enter(&vmm_mtx);
2705 	sc = vmm_lookup(req->name);
2706 	if (sc == NULL) {
2707 		mutex_exit(&vmm_mtx);
2708 		return (ENOENT);
2709 	}
2710 	/*
2711 	 * We don't check this in vmm_lookup() since that function is also used
2712 	 * for validation during create and currently vmm names must be unique.
2713 	 */
2714 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2715 		mutex_exit(&vmm_mtx);
2716 		return (EPERM);
2717 	}
2718 
2719 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2720 	mutex_exit(&vmm_mtx);
2721 
2722 	if (hma_release) {
2723 		vmm_hma_release();
2724 	}
2725 
2726 	return (err);
2727 }
2728 
2729 #define	VCPU_NAME_BUFLEN	32
2730 
2731 static int
2732 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2733 {
2734 	zoneid_t zid = crgetzoneid(cr);
2735 	int instance = minor;
2736 	kstat_t *ksp;
2737 
2738 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2739 
2740 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2741 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2742 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2743 
2744 	if (ksp == NULL) {
2745 		return (-1);
2746 	}
2747 	sc->vmm_kstat_vm = ksp;
2748 
2749 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2750 		char namebuf[VCPU_NAME_BUFLEN];
2751 
2752 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2753 
2754 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2755 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2756 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2757 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2758 		    0, zid);
2759 		if (ksp == NULL) {
2760 			goto fail;
2761 		}
2762 
2763 		sc->vmm_kstat_vcpu[i] = ksp;
2764 	}
2765 
2766 	/*
2767 	 * If this instance is associated with a non-global zone, make its
2768 	 * kstats visible from the GZ.
2769 	 */
2770 	if (zid != GLOBAL_ZONEID) {
2771 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2772 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2773 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2774 		}
2775 	}
2776 
2777 	return (0);
2778 
2779 fail:
2780 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2781 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2782 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2783 			sc->vmm_kstat_vcpu[i] = NULL;
2784 		} else {
2785 			break;
2786 		}
2787 	}
2788 	kstat_delete(sc->vmm_kstat_vm);
2789 	sc->vmm_kstat_vm = NULL;
2790 	return (-1);
2791 }
2792 
2793 static void
2794 vmm_kstat_init(vmm_softc_t *sc)
2795 {
2796 	kstat_t *ksp;
2797 
2798 	ASSERT3P(sc->vmm_vm, !=, NULL);
2799 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2800 
2801 	ksp = sc->vmm_kstat_vm;
2802 	vmm_kstats_t *vk = ksp->ks_data;
2803 	ksp->ks_private = sc->vmm_vm;
2804 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2805 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2806 
2807 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2808 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2809 
2810 		ksp = sc->vmm_kstat_vcpu[i];
2811 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2812 
2813 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2814 		vvk->vvk_vcpu.value.ui32 = i;
2815 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2816 		    KSTAT_DATA_UINT64);
2817 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2818 		    KSTAT_DATA_UINT64);
2819 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2820 		    KSTAT_DATA_UINT64);
2821 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2822 		    KSTAT_DATA_UINT64);
2823 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2824 		    KSTAT_DATA_UINT64);
2825 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2826 		    KSTAT_DATA_UINT64);
2827 		ksp->ks_private = sc->vmm_vm;
2828 		ksp->ks_update = vmm_kstat_update_vcpu;
2829 	}
2830 
2831 	kstat_install(sc->vmm_kstat_vm);
2832 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2833 		kstat_install(sc->vmm_kstat_vcpu[i]);
2834 	}
2835 }
2836 
2837 static void
2838 vmm_kstat_fini(vmm_softc_t *sc)
2839 {
2840 	ASSERT(sc->vmm_kstat_vm != NULL);
2841 
2842 	kstat_delete(sc->vmm_kstat_vm);
2843 	sc->vmm_kstat_vm = NULL;
2844 
2845 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2846 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2847 
2848 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2849 		sc->vmm_kstat_vcpu[i] = NULL;
2850 	}
2851 }
2852 
2853 static int
2854 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2855 {
2856 	minor_t		minor;
2857 	vmm_softc_t	*sc;
2858 
2859 	/*
2860 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2861 	 * verified to be safe.
2862 	 */
2863 	if (curproc->p_model != DATAMODEL_LP64) {
2864 		return (EFBIG);
2865 	}
2866 
2867 	minor = getminor(*devp);
2868 	if (minor == VMM_CTL_MINOR) {
2869 		/*
2870 		 * Master control device must be opened exclusively.
2871 		 */
2872 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2873 			return (EINVAL);
2874 		}
2875 
2876 		return (0);
2877 	}
2878 
2879 	mutex_enter(&vmm_mtx);
2880 	sc = ddi_get_soft_state(vmm_statep, minor);
2881 	if (sc == NULL) {
2882 		mutex_exit(&vmm_mtx);
2883 		return (ENXIO);
2884 	}
2885 
2886 	sc->vmm_flags |= VMM_IS_OPEN;
2887 	mutex_exit(&vmm_mtx);
2888 
2889 	return (0);
2890 }
2891 
2892 static int
2893 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2894 {
2895 	const minor_t minor = getminor(dev);
2896 	vmm_softc_t *sc;
2897 	bool hma_release = false;
2898 
2899 	if (minor == VMM_CTL_MINOR) {
2900 		return (0);
2901 	}
2902 
2903 	mutex_enter(&vmm_mtx);
2904 	sc = ddi_get_soft_state(vmm_statep, minor);
2905 	if (sc == NULL) {
2906 		mutex_exit(&vmm_mtx);
2907 		return (ENXIO);
2908 	}
2909 
2910 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2911 	sc->vmm_flags &= ~VMM_IS_OPEN;
2912 
2913 	/*
2914 	 * If instance was marked for auto-destruction begin that now.  Instance
2915 	 * destruction may have been initated already, so try to make progress
2916 	 * in that case, since closure of the device is one of its requirements.
2917 	 */
2918 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2919 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2920 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2921 	}
2922 	mutex_exit(&vmm_mtx);
2923 
2924 	if (hma_release) {
2925 		vmm_hma_release();
2926 	}
2927 
2928 	return (0);
2929 }
2930 
2931 static int
2932 vmm_is_supported(intptr_t arg)
2933 {
2934 	int r;
2935 	const char *msg;
2936 
2937 	if (vmm_is_intel()) {
2938 		r = vmx_x86_supported(&msg);
2939 	} else if (vmm_is_svm()) {
2940 		/*
2941 		 * HMA already ensured that the features necessary for SVM
2942 		 * operation were present and online during vmm_attach().
2943 		 */
2944 		r = 0;
2945 	} else {
2946 		r = ENXIO;
2947 		msg = "Unsupported CPU vendor";
2948 	}
2949 
2950 	if (r != 0 && arg != (intptr_t)NULL) {
2951 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2952 			return (EFAULT);
2953 	}
2954 	return (r);
2955 }
2956 
2957 static int
2958 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2959 {
2960 	void *argp = (void *)arg;
2961 
2962 	switch (cmd) {
2963 	case VMM_CREATE_VM: {
2964 		struct vm_create_req req;
2965 
2966 		if ((md & FWRITE) == 0) {
2967 			return (EPERM);
2968 		}
2969 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2970 			return (EFAULT);
2971 		}
2972 		return (vmmdev_do_vm_create(&req, cr));
2973 	}
2974 	case VMM_DESTROY_VM: {
2975 		struct vm_destroy_req req;
2976 
2977 		if ((md & FWRITE) == 0) {
2978 			return (EPERM);
2979 		}
2980 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2981 			return (EFAULT);
2982 		}
2983 		return (vmmdev_do_vm_destroy(&req, cr));
2984 	}
2985 	case VMM_VM_SUPPORTED:
2986 		return (vmm_is_supported(arg));
2987 	case VMM_CHECK_IOMMU:
2988 		if (!vmm_check_iommu()) {
2989 			return (ENXIO);
2990 		}
2991 		return (0);
2992 	case VMM_RESV_QUERY:
2993 	case VMM_RESV_SET_TARGET:
2994 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2995 	default:
2996 		break;
2997 	}
2998 	/* No other actions are legal on ctl device */
2999 	return (ENOTTY);
3000 }
3001 
3002 static int
3003 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
3004     int *rvalp)
3005 {
3006 	vmm_softc_t	*sc;
3007 	minor_t		minor;
3008 
3009 	/*
3010 	 * Forbid running bhyve in a 32-bit process until it has been tested and
3011 	 * verified to be safe.
3012 	 */
3013 	if (curproc->p_model != DATAMODEL_LP64) {
3014 		return (EFBIG);
3015 	}
3016 
3017 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
3018 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3019 		return (ENOTSUP);
3020 	}
3021 
3022 	/*
3023 	 * Regardless of minor (vmmctl or instance), we respond to queries of
3024 	 * the interface version.
3025 	 */
3026 	if (cmd == VMM_INTERFACE_VERSION) {
3027 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
3028 		return (0);
3029 	}
3030 
3031 	minor = getminor(dev);
3032 
3033 	if (minor == VMM_CTL_MINOR) {
3034 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3035 	}
3036 
3037 	sc = ddi_get_soft_state(vmm_statep, minor);
3038 	ASSERT(sc != NULL);
3039 
3040 	/*
3041 	 * Turn away any ioctls against an instance when it is being destroyed.
3042 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
3043 	 */
3044 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3045 		if (cmd == VM_DESTROY_PENDING) {
3046 			*rvalp = 1;
3047 			return (0);
3048 		}
3049 		return (ENXIO);
3050 	}
3051 
3052 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3053 }
3054 
3055 static int
3056 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3057     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3058 {
3059 	vmm_softc_t *sc;
3060 	const minor_t minor = getminor(dev);
3061 	int err;
3062 
3063 	if (minor == VMM_CTL_MINOR) {
3064 		return (ENODEV);
3065 	}
3066 	if (off < 0 || (off + len) <= 0) {
3067 		return (EINVAL);
3068 	}
3069 	if ((prot & PROT_USER) == 0) {
3070 		return (EACCES);
3071 	}
3072 
3073 	sc = ddi_get_soft_state(vmm_statep, minor);
3074 	ASSERT(sc);
3075 
3076 	if (sc->vmm_flags & VMM_DESTROY)
3077 		return (ENXIO);
3078 
3079 	/* Grab read lock on the VM to prevent any changes to the memory map */
3080 	vmm_read_lock(sc);
3081 
3082 	if (off >= VM_DEVMEM_START) {
3083 		int segid;
3084 		off_t segoff;
3085 
3086 		/* Mapping a devmem "device" */
3087 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3088 			err = ENODEV;
3089 		} else {
3090 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3091 			    addrp, prot, maxprot, flags);
3092 		}
3093 	} else {
3094 		/* Mapping a part of the guest physical space */
3095 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3096 		    maxprot, flags);
3097 	}
3098 
3099 	vmm_read_unlock(sc);
3100 	return (err);
3101 }
3102 
3103 static sdev_plugin_validate_t
3104 vmm_sdev_validate(sdev_ctx_t ctx)
3105 {
3106 	const char *name = sdev_ctx_name(ctx);
3107 	vmm_softc_t *sc;
3108 	sdev_plugin_validate_t ret;
3109 	minor_t minor;
3110 
3111 	if (sdev_ctx_vtype(ctx) != VCHR)
3112 		return (SDEV_VTOR_INVALID);
3113 
3114 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3115 
3116 	mutex_enter(&vmm_mtx);
3117 	if ((sc = vmm_lookup(name)) == NULL)
3118 		ret = SDEV_VTOR_INVALID;
3119 	else if (sc->vmm_minor != minor)
3120 		ret = SDEV_VTOR_STALE;
3121 	else
3122 		ret = SDEV_VTOR_VALID;
3123 	mutex_exit(&vmm_mtx);
3124 
3125 	return (ret);
3126 }
3127 
3128 static int
3129 vmm_sdev_filldir(sdev_ctx_t ctx)
3130 {
3131 	vmm_softc_t *sc;
3132 	int ret;
3133 
3134 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3135 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3136 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3137 		return (EINVAL);
3138 	}
3139 
3140 	mutex_enter(&vmm_mtx);
3141 	ASSERT(vmmdev_dip != NULL);
3142 	for (sc = list_head(&vmm_list); sc != NULL;
3143 	    sc = list_next(&vmm_list, sc)) {
3144 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3145 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3146 			    S_IFCHR | 0600,
3147 			    makedevice(ddi_driver_major(vmmdev_dip),
3148 			    sc->vmm_minor));
3149 		} else {
3150 			continue;
3151 		}
3152 		if (ret != 0 && ret != EEXIST)
3153 			goto out;
3154 	}
3155 
3156 	ret = 0;
3157 
3158 out:
3159 	mutex_exit(&vmm_mtx);
3160 	return (ret);
3161 }
3162 
3163 /* ARGSUSED */
3164 static void
3165 vmm_sdev_inactive(sdev_ctx_t ctx)
3166 {
3167 }
3168 
3169 static sdev_plugin_ops_t vmm_sdev_ops = {
3170 	.spo_version = SDEV_PLUGIN_VERSION,
3171 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3172 	.spo_validate = vmm_sdev_validate,
3173 	.spo_filldir = vmm_sdev_filldir,
3174 	.spo_inactive = vmm_sdev_inactive
3175 };
3176 
3177 /* ARGSUSED */
3178 static int
3179 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3180 {
3181 	int error;
3182 
3183 	switch (cmd) {
3184 	case DDI_INFO_DEVT2DEVINFO:
3185 		*result = (void *)vmmdev_dip;
3186 		error = DDI_SUCCESS;
3187 		break;
3188 	case DDI_INFO_DEVT2INSTANCE:
3189 		*result = (void *)0;
3190 		error = DDI_SUCCESS;
3191 		break;
3192 	default:
3193 		error = DDI_FAILURE;
3194 		break;
3195 	}
3196 	return (error);
3197 }
3198 
3199 static int
3200 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3201 {
3202 	sdev_plugin_hdl_t sph;
3203 	hma_reg_t *reg = NULL;
3204 	boolean_t vmm_loaded = B_FALSE;
3205 
3206 	if (cmd != DDI_ATTACH) {
3207 		return (DDI_FAILURE);
3208 	}
3209 
3210 	mutex_enter(&vmmdev_mtx);
3211 	/* Ensure we are not already attached. */
3212 	if (vmmdev_dip != NULL) {
3213 		mutex_exit(&vmmdev_mtx);
3214 		return (DDI_FAILURE);
3215 	}
3216 
3217 	vmm_sol_glue_init();
3218 
3219 	/*
3220 	 * Perform temporary HMA registration to determine if the system
3221 	 * is capable.
3222 	 */
3223 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3224 		goto fail;
3225 	} else if (vmm_mod_load() != 0) {
3226 		goto fail;
3227 	}
3228 	vmm_loaded = B_TRUE;
3229 	hma_unregister(reg);
3230 	reg = NULL;
3231 
3232 	/* Create control node.  Other nodes will be created on demand. */
3233 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3234 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3235 		goto fail;
3236 	}
3237 
3238 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3239 	if (sph == (sdev_plugin_hdl_t)NULL) {
3240 		ddi_remove_minor_node(dip, NULL);
3241 		goto fail;
3242 	}
3243 
3244 	ddi_report_dev(dip);
3245 	vmmdev_sdev_hdl = sph;
3246 	vmmdev_dip = dip;
3247 	mutex_exit(&vmmdev_mtx);
3248 	return (DDI_SUCCESS);
3249 
3250 fail:
3251 	if (vmm_loaded) {
3252 		VERIFY0(vmm_mod_unload());
3253 	}
3254 	if (reg != NULL) {
3255 		hma_unregister(reg);
3256 	}
3257 	vmm_sol_glue_cleanup();
3258 	mutex_exit(&vmmdev_mtx);
3259 	return (DDI_FAILURE);
3260 }
3261 
3262 static int
3263 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3264 {
3265 	if (cmd != DDI_DETACH) {
3266 		return (DDI_FAILURE);
3267 	}
3268 
3269 	/*
3270 	 * Ensure that all resources have been cleaned up.
3271 	 *
3272 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3273 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3274 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3275 	 * devinfo, including our own, while holding vmmdev_mtx.
3276 	 */
3277 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3278 		return (DDI_FAILURE);
3279 
3280 	mutex_enter(&vmm_mtx);
3281 	if (!list_is_empty(&vmm_list)) {
3282 		mutex_exit(&vmm_mtx);
3283 		mutex_exit(&vmmdev_mtx);
3284 		return (DDI_FAILURE);
3285 	}
3286 	mutex_exit(&vmm_mtx);
3287 
3288 	if (!vmmr_is_empty()) {
3289 		mutex_exit(&vmmdev_mtx);
3290 		return (DDI_FAILURE);
3291 	}
3292 
3293 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3294 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3295 		mutex_exit(&vmmdev_mtx);
3296 		return (DDI_FAILURE);
3297 	}
3298 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3299 
3300 	/* Remove the control node. */
3301 	ddi_remove_minor_node(dip, "ctl");
3302 	vmmdev_dip = NULL;
3303 
3304 	VERIFY0(vmm_mod_unload());
3305 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3306 	vmm_sol_glue_cleanup();
3307 
3308 	mutex_exit(&vmmdev_mtx);
3309 
3310 	return (DDI_SUCCESS);
3311 }
3312 
3313 static struct cb_ops vmm_cb_ops = {
3314 	vmm_open,
3315 	vmm_close,
3316 	nodev,		/* strategy */
3317 	nodev,		/* print */
3318 	nodev,		/* dump */
3319 	nodev,		/* read */
3320 	nodev,		/* write */
3321 	vmm_ioctl,
3322 	nodev,		/* devmap */
3323 	nodev,		/* mmap */
3324 	vmm_segmap,
3325 	nochpoll,	/* poll */
3326 	ddi_prop_op,
3327 	NULL,
3328 	D_NEW | D_MP | D_DEVMAP
3329 };
3330 
3331 static struct dev_ops vmm_ops = {
3332 	DEVO_REV,
3333 	0,
3334 	vmm_info,
3335 	nulldev,	/* identify */
3336 	nulldev,	/* probe */
3337 	vmm_attach,
3338 	vmm_detach,
3339 	nodev,		/* reset */
3340 	&vmm_cb_ops,
3341 	(struct bus_ops *)NULL
3342 };
3343 
3344 static struct modldrv modldrv = {
3345 	&mod_driverops,
3346 	"bhyve vmm",
3347 	&vmm_ops
3348 };
3349 
3350 static struct modlinkage modlinkage = {
3351 	MODREV_1,
3352 	&modldrv,
3353 	NULL
3354 };
3355 
3356 int
3357 _init(void)
3358 {
3359 	int	error;
3360 
3361 	sysinit();
3362 
3363 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3364 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3365 	list_create(&vmm_list, sizeof (vmm_softc_t),
3366 	    offsetof(vmm_softc_t, vmm_node));
3367 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3368 
3369 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3370 	if (error) {
3371 		return (error);
3372 	}
3373 
3374 	error = vmmr_init();
3375 	if (error) {
3376 		ddi_soft_state_fini(&vmm_statep);
3377 		return (error);
3378 	}
3379 
3380 	vmm_zsd_init();
3381 
3382 	error = mod_install(&modlinkage);
3383 	if (error) {
3384 		ddi_soft_state_fini(&vmm_statep);
3385 		vmm_zsd_fini();
3386 		vmmr_fini();
3387 	}
3388 
3389 	return (error);
3390 }
3391 
3392 int
3393 _fini(void)
3394 {
3395 	int	error;
3396 
3397 	error = mod_remove(&modlinkage);
3398 	if (error) {
3399 		return (error);
3400 	}
3401 
3402 	vmm_zsd_fini();
3403 	vmmr_fini();
3404 
3405 	ddi_soft_state_fini(&vmm_statep);
3406 
3407 	return (0);
3408 }
3409 
3410 int
3411 _info(struct modinfo *modinfop)
3412 {
3413 	return (mod_info(&modlinkage, modinfop));
3414 }
3415