xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_sol_dev.c (revision 9a244c8ee0ee32d71c3e66c8a1c3e18a518d48c8)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2015 Pluribus Networks Inc.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
17  * Copyright 2023 Oxide Computer Company
18  */
19 
20 #include <sys/types.h>
21 #include <sys/conf.h>
22 #include <sys/cpuvar.h>
23 #include <sys/ioccom.h>
24 #include <sys/stat.h>
25 #include <sys/vmsystm.h>
26 #include <sys/ddi.h>
27 #include <sys/mkdev.h>
28 #include <sys/sunddi.h>
29 #include <sys/fs/dv_node.h>
30 #include <sys/cpuset.h>
31 #include <sys/id_space.h>
32 #include <sys/fs/sdev_plugin.h>
33 #include <sys/smt.h>
34 #include <sys/kstat.h>
35 
36 #include <sys/kernel.h>
37 #include <sys/hma.h>
38 #include <sys/x86_archext.h>
39 #include <x86/apicreg.h>
40 
41 #include <sys/vmm.h>
42 #include <sys/vmm_kernel.h>
43 #include <sys/vmm_instruction_emul.h>
44 #include <sys/vmm_dev.h>
45 #include <sys/vmm_impl.h>
46 #include <sys/vmm_drv.h>
47 #include <sys/vmm_vm.h>
48 #include <sys/vmm_reservoir.h>
49 
50 #include <vm/seg_dev.h>
51 
52 #include "io/ppt.h"
53 #include "io/vatpic.h"
54 #include "io/vioapic.h"
55 #include "io/vrtc.h"
56 #include "io/vhpet.h"
57 #include "io/vpmtmr.h"
58 #include "vmm_lapic.h"
59 #include "vmm_stat.h"
60 #include "vmm_util.h"
61 
/*
 * Locking details:
 *
 * Driver-wide data (vmmdev_*), including HMA and sdev registration, is
 * protected by vmmdev_mtx.  The list of vmm_softc_t instances and related data
 * (vmm_*) are protected by vmm_mtx.  Actions requiring both locks must acquire
 * vmmdev_mtx before vmm_mtx.  The sdev plugin functions must not attempt to
 * acquire vmmdev_mtx, as they could deadlock with plugin unregistration.
 */
71 
72 static kmutex_t		vmmdev_mtx;
73 static dev_info_t	*vmmdev_dip;
74 static hma_reg_t	*vmmdev_hma_reg;
75 static uint_t		vmmdev_hma_ref;
76 static sdev_plugin_hdl_t vmmdev_sdev_hdl;
77 
78 static kmutex_t		vmm_mtx;
79 static list_t		vmm_list;
80 static id_space_t	*vmm_minors;
81 static void		*vmm_statep;
82 
83 /* temporary safety switch */
84 int		vmm_allow_state_writes;
85 
86 static const char *vmmdev_hvm_name = "bhyve";
87 
88 /* For sdev plugin (/dev) */
89 #define	VMM_SDEV_ROOT "/dev/vmm"
90 
91 /* From uts/intel/io/vmm/intel/vmx.c */
92 extern int vmx_x86_supported(const char **);
93 
94 /* Holds and hooks from drivers external to vmm */
95 struct vmm_hold {
96 	list_node_t	vmh_node;
97 	vmm_softc_t	*vmh_sc;
98 	boolean_t	vmh_release_req;
99 	uint_t		vmh_ioport_hook_cnt;
100 };
101 
102 struct vmm_lease {
103 	list_node_t		vml_node;
104 	struct vm		*vml_vm;
105 	vm_client_t		*vml_vmclient;
106 	boolean_t		vml_expired;
107 	boolean_t		vml_break_deferred;
108 	boolean_t		(*vml_expire_func)(void *);
109 	void			*vml_expire_arg;
110 	struct vmm_hold		*vml_hold;
111 };
112 
/*
 * Option flags for vmm_destroy_locked().  Apart from VDO_DEFAULT, each option
 * occupies its own bit so that options may be OR-ed together.
 */
typedef enum vmm_destroy_opts {
	/* No optional behavior requested. */
	VDO_DEFAULT		= 0x0,

	/*
	 * Leave the zone-specific-data (ZSD) associated with the VM alone
	 * rather than cleaning it up during the destroy.  This is required
	 * when the VM is destroyed as part of zone destruction, since the ZSD
	 * is already being cleaned up in that case.
	 */
	VDO_NO_CLEAN_ZSD	= 0x1,

	/*
	 * Try to wait until destruction of the VM has completed.  Waiting is
	 * opt-in, as many normal conditions can leave destruction stalled
	 * pending other clean-up.
	 */
	VDO_ATTEMPT_WAIT	= 0x2,
} vmm_destroy_opts_t;
130 
131 static void vmm_hma_release(void);
132 static int vmm_destroy_locked(vmm_softc_t *, vmm_destroy_opts_t, bool *);
133 static int vmm_drv_block_hook(vmm_softc_t *, boolean_t);
134 static void vmm_lease_block(vmm_softc_t *);
135 static void vmm_lease_unblock(vmm_softc_t *);
136 static int vmm_kstat_alloc(vmm_softc_t *, minor_t, const cred_t *);
137 static void vmm_kstat_init(vmm_softc_t *);
138 static void vmm_kstat_fini(vmm_softc_t *);
139 
140 /*
141  * The 'devmem' hack:
142  *
143  * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments
144  * in the vm which appear with their own name related to the vm under /dev.
145  * Since this would be a hassle from an sdev perspective and would require a
146  * new cdev interface (or complicate the existing one), we choose to implement
147  * this in a different manner.  Direct access to the underlying vm memory
148  * segments is exposed by placing them in a range of offsets beyond the normal
149  * guest memory space.  Userspace can query the appropriate offset to mmap()
150  * for a given segment-id with the VM_DEVMEM_GETOFFSET ioctl.
151  */
152 
153 static vmm_devmem_entry_t *
154 vmmdev_devmem_find(vmm_softc_t *sc, int segid)
155 {
156 	vmm_devmem_entry_t *ent = NULL;
157 	list_t *dl = &sc->vmm_devmem_list;
158 
159 	for (ent = list_head(dl); ent != NULL; ent = list_next(dl, ent)) {
160 		if (ent->vde_segid == segid) {
161 			return (ent);
162 		}
163 	}
164 	return (NULL);
165 }
166 
167 static int
168 vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
169 {
170 	int error;
171 	bool sysmem;
172 
173 	error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem,
174 	    NULL);
175 	if (error || mseg->len == 0)
176 		return (error);
177 
178 	if (!sysmem) {
179 		vmm_devmem_entry_t *de;
180 
181 		de = vmmdev_devmem_find(sc, mseg->segid);
182 		if (de != NULL) {
183 			(void) strlcpy(mseg->name, de->vde_name,
184 			    sizeof (mseg->name));
185 		}
186 	} else {
187 		bzero(mseg->name, sizeof (mseg->name));
188 	}
189 
190 	return (error);
191 }
192 
193 static int
194 vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name)
195 {
196 	off_t map_offset;
197 	vmm_devmem_entry_t *entry;
198 
199 	if (list_is_empty(&sc->vmm_devmem_list)) {
200 		map_offset = VM_DEVMEM_START;
201 	} else {
202 		entry = list_tail(&sc->vmm_devmem_list);
203 		map_offset = entry->vde_off + entry->vde_len;
204 		if (map_offset < entry->vde_off) {
205 			/* Do not tolerate overflow */
206 			return (ERANGE);
207 		}
208 		/*
209 		 * XXXJOY: We could choose to search the list for duplicate
210 		 * names and toss an error.  Since we're using the offset
211 		 * method for now, it does not make much of a difference.
212 		 */
213 	}
214 
215 	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
216 	entry->vde_segid = mseg->segid;
217 	entry->vde_len = mseg->len;
218 	entry->vde_off = map_offset;
219 	(void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name));
220 	list_insert_tail(&sc->vmm_devmem_list, entry);
221 
222 	return (0);
223 }
224 
225 static boolean_t
226 vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp,
227     off_t *map_offp)
228 {
229 	list_t *dl = &sc->vmm_devmem_list;
230 	vmm_devmem_entry_t *de = NULL;
231 	const off_t map_end = off + len;
232 
233 	VERIFY(off >= VM_DEVMEM_START);
234 
235 	if (map_end < off) {
236 		/* No match on overflow */
237 		return (B_FALSE);
238 	}
239 
240 	for (de = list_head(dl); de != NULL; de = list_next(dl, de)) {
241 		const off_t item_end = de->vde_off + de->vde_len;
242 
243 		if (de->vde_off <= off && item_end >= map_end) {
244 			*segidp = de->vde_segid;
245 			*map_offp = off - de->vde_off;
246 			return (B_TRUE);
247 		}
248 	}
249 	return (B_FALSE);
250 }
251 
252 /*
253  * When an instance is being destroyed, the devmem list of named memory objects
254  * can be torn down, as no new mappings are allowed.
255  */
256 static void
257 vmmdev_devmem_purge(vmm_softc_t *sc)
258 {
259 	vmm_devmem_entry_t *entry;
260 
261 	while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) {
262 		kmem_free(entry, sizeof (*entry));
263 	}
264 }
265 
266 static int
267 vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg)
268 {
269 	int error;
270 	bool sysmem = true;
271 
272 	if (VM_MEMSEG_NAME(mseg)) {
273 		sysmem = false;
274 	}
275 	error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem);
276 
277 	if (error == 0) {
278 		/*
279 		 * Rather than create a whole fresh device from which userspace
280 		 * can mmap this segment, instead make it available at an
281 		 * offset above where the main guest memory resides.
282 		 */
283 		error = vmmdev_devmem_create(sc, mseg, mseg->name);
284 		if (error != 0) {
285 			vm_free_memseg(sc->vmm_vm, mseg->segid);
286 		}
287 	}
288 	return (error);
289 }
290 
291 /*
292  * Resource Locking and Exclusion
293  *
294  * Much of bhyve depends on key portions of VM state, such as the guest memory
295  * map, to remain unchanged while the guest is running.  As ported from
296  * FreeBSD, the initial strategy for this resource exclusion hinged on gating
297  * access to the instance vCPUs.  Threads acting on a single vCPU, like those
298  * performing the work of actually running the guest in VMX/SVM, would lock
299  * only that vCPU during ioctl() entry.  For ioctls which would change VM-wide
300  * state, all of the vCPUs would be first locked, ensuring that the
301  * operation(s) could complete without any other threads stumbling into
302  * intermediate states.
303  *
304  * This approach is largely effective for bhyve.  Common operations, such as
305  * running the vCPUs, steer clear of lock contention.  The model begins to
306  * break down for operations which do not occur in the context of a specific
307  * vCPU.  LAPIC MSI delivery, for example, may be initiated from a worker
308  * thread in the bhyve process.  In order to properly protect those vCPU-less
309  * operations from encountering invalid states, additional locking is required.
310  * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU.
311  * It does mean that class of operations will be serialized on locking the
312  * specific vCPU and that instances sized at VM_MAXCPU will potentially see
313  * undue contention on the VM_MAXCPU-1 vCPU.
314  *
315  * In order to address the shortcomings of this model, the concept of a
316  * read/write lock has been added to bhyve.  Operations which change
317  * fundamental aspects of a VM (such as the memory map) must acquire the write
318  * lock, which also implies locking all of the vCPUs and waiting for all read
319  * lock holders to release.  While it increases the cost and waiting time for
320  * those few operations, it allows most hot-path operations on the VM (which
321  * depend on its configuration remaining stable) to occur with minimal locking.
322  *
323  * Consumers of the Driver API (see below) are a special case when it comes to
324  * this locking, since they may hold a read lock via the drv_lease mechanism
325  * for an extended period of time.  Rather than forcing those consumers to
326  * continuously poll for a write lock attempt, the lease system forces them to
327  * provide a release callback to trigger their clean-up (and potential later
328  * reacquisition) of the read lock.
329  */
330 
331 static void
332 vcpu_lock_one(vmm_softc_t *sc, int vcpu)
333 {
334 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
335 
336 	/*
337 	 * Since this state transition is utilizing from_idle=true, it should
338 	 * not fail, but rather block until it can be successful.
339 	 */
340 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true));
341 }
342 
343 static void
344 vcpu_unlock_one(vmm_softc_t *sc, int vcpu)
345 {
346 	ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU);
347 
348 	VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN);
349 	VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false));
350 }
351 
352 static void
353 vmm_read_lock(vmm_softc_t *sc)
354 {
355 	rw_enter(&sc->vmm_rwlock, RW_READER);
356 }
357 
358 static void
359 vmm_read_unlock(vmm_softc_t *sc)
360 {
361 	rw_exit(&sc->vmm_rwlock);
362 }
363 
364 static void
365 vmm_write_lock(vmm_softc_t *sc)
366 {
367 	int maxcpus;
368 
369 	/* First lock all the vCPUs */
370 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
371 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
372 		vcpu_lock_one(sc, vcpu);
373 	}
374 
375 	/*
376 	 * Block vmm_drv leases from being acquired or held while the VM write
377 	 * lock is held.
378 	 */
379 	vmm_lease_block(sc);
380 
381 	rw_enter(&sc->vmm_rwlock, RW_WRITER);
382 	/*
383 	 * For now, the 'maxcpus' value for an instance is fixed at the
384 	 * compile-time constant of VM_MAXCPU at creation.  If this changes in
385 	 * the future, allowing for dynamic vCPU resource sizing, acquisition
386 	 * of the write lock will need to be wary of such changes.
387 	 */
388 	VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm));
389 }
390 
391 static void
392 vmm_write_unlock(vmm_softc_t *sc)
393 {
394 	int maxcpus;
395 
396 	/* Allow vmm_drv leases to be acquired once write lock is dropped */
397 	vmm_lease_unblock(sc);
398 
399 	/*
400 	 * The VM write lock _must_ be released from the same thread it was
401 	 * acquired in, unlike the read lock.
402 	 */
403 	VERIFY(rw_write_held(&sc->vmm_rwlock));
404 	rw_exit(&sc->vmm_rwlock);
405 
406 	/* Unlock all the vCPUs */
407 	maxcpus = vm_get_maxcpus(sc->vmm_vm);
408 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
409 		vcpu_unlock_one(sc, vcpu);
410 	}
411 }
412 
413 static int
414 vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
415     cred_t *credp, int *rvalp)
416 {
417 	int error = 0, vcpu = -1;
418 	void *datap = (void *)arg;
419 	enum vm_lock_type {
420 		LOCK_NONE = 0,
421 		LOCK_VCPU,
422 		LOCK_READ_HOLD,
423 		LOCK_WRITE_HOLD
424 	} lock_type = LOCK_NONE;
425 
426 	/* Acquire any exclusion resources needed for the operation. */
427 	switch (cmd) {
428 	case VM_RUN:
429 	case VM_GET_REGISTER:
430 	case VM_SET_REGISTER:
431 	case VM_GET_SEGMENT_DESCRIPTOR:
432 	case VM_SET_SEGMENT_DESCRIPTOR:
433 	case VM_GET_REGISTER_SET:
434 	case VM_SET_REGISTER_SET:
435 	case VM_INJECT_EXCEPTION:
436 	case VM_GET_CAPABILITY:
437 	case VM_SET_CAPABILITY:
438 	case VM_PPTDEV_MSI:
439 	case VM_PPTDEV_MSIX:
440 	case VM_SET_X2APIC_STATE:
441 	case VM_GLA2GPA:
442 	case VM_GLA2GPA_NOFAULT:
443 	case VM_ACTIVATE_CPU:
444 	case VM_SET_INTINFO:
445 	case VM_GET_INTINFO:
446 	case VM_RESTART_INSTRUCTION:
447 	case VM_SET_KERNEMU_DEV:
448 	case VM_GET_KERNEMU_DEV:
449 	case VM_RESET_CPU:
450 	case VM_GET_RUN_STATE:
451 	case VM_SET_RUN_STATE:
452 	case VM_GET_FPU:
453 	case VM_SET_FPU:
454 	case VM_GET_CPUID:
455 	case VM_SET_CPUID:
456 	case VM_LEGACY_CPUID:
457 		/*
458 		 * Copy in the ID of the vCPU chosen for this operation.
459 		 * Since a nefarious caller could update their struct between
460 		 * this locking and when the rest of the ioctl data is copied
461 		 * in, it is _critical_ that this local 'vcpu' variable be used
462 		 * rather than the in-struct one when performing the ioctl.
463 		 */
464 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
465 			return (EFAULT);
466 		}
467 		if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) {
468 			return (EINVAL);
469 		}
470 		vcpu_lock_one(sc, vcpu);
471 		lock_type = LOCK_VCPU;
472 		break;
473 
474 	case VM_REINIT:
475 	case VM_BIND_PPTDEV:
476 	case VM_UNBIND_PPTDEV:
477 	case VM_MAP_PPTDEV_MMIO:
478 	case VM_UNMAP_PPTDEV_MMIO:
479 	case VM_ALLOC_MEMSEG:
480 	case VM_MMAP_MEMSEG:
481 	case VM_MUNMAP_MEMSEG:
482 	case VM_WRLOCK_CYCLE:
483 	case VM_PMTMR_LOCATE:
484 	case VM_PAUSE:
485 	case VM_RESUME:
486 		vmm_write_lock(sc);
487 		lock_type = LOCK_WRITE_HOLD;
488 		break;
489 
490 	case VM_GET_MEMSEG:
491 	case VM_MMAP_GETNEXT:
492 	case VM_LAPIC_IRQ:
493 	case VM_INJECT_NMI:
494 	case VM_IOAPIC_ASSERT_IRQ:
495 	case VM_IOAPIC_DEASSERT_IRQ:
496 	case VM_IOAPIC_PULSE_IRQ:
497 	case VM_LAPIC_MSI:
498 	case VM_LAPIC_LOCAL_IRQ:
499 	case VM_GET_X2APIC_STATE:
500 	case VM_RTC_READ:
501 	case VM_RTC_WRITE:
502 	case VM_RTC_SETTIME:
503 	case VM_RTC_GETTIME:
504 	case VM_PPTDEV_DISABLE_MSIX:
505 	case VM_DEVMEM_GETOFFSET:
506 	case VM_TRACK_DIRTY_PAGES:
507 		vmm_read_lock(sc);
508 		lock_type = LOCK_READ_HOLD;
509 		break;
510 
511 	case VM_DATA_READ:
512 	case VM_DATA_WRITE:
513 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
514 			return (EFAULT);
515 		}
516 		if (vcpu == -1) {
517 			/* Access data for VM-wide devices */
518 			vmm_write_lock(sc);
519 			lock_type = LOCK_WRITE_HOLD;
520 		} else if (vcpu >= 0 && vcpu < vm_get_maxcpus(sc->vmm_vm)) {
521 			/* Access data associated with a specific vCPU */
522 			vcpu_lock_one(sc, vcpu);
523 			lock_type = LOCK_VCPU;
524 		} else {
525 			return (EINVAL);
526 		}
527 		break;
528 
529 	case VM_GET_GPA_PMAP:
530 	case VM_IOAPIC_PINCOUNT:
531 	case VM_SUSPEND:
532 	case VM_DESC_FPU_AREA:
533 	case VM_SET_AUTODESTRUCT:
534 	case VM_DESTROY_SELF:
535 	case VM_DESTROY_PENDING:
536 	default:
537 		break;
538 	}
539 
540 	/* Execute the primary logic for the ioctl. */
541 	switch (cmd) {
542 	case VM_RUN: {
543 		struct vm_entry entry;
544 
545 		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
546 			error = EFAULT;
547 			break;
548 		}
549 
550 		if (!(curthread->t_schedflag & TS_VCPU))
551 			smt_mark_as_vcpu();
552 
553 		error = vm_run(sc->vmm_vm, vcpu, &entry);
554 
555 		/*
556 		 * Unexpected states in vm_run() are expressed through positive
557 		 * errno-oriented return values.  VM states which expect further
558 		 * processing in userspace (necessary context via exitinfo) are
559 		 * expressed through negative return values.  For the time being
560 		 * a return value of 0 is not expected from vm_run().
561 		 */
562 		ASSERT(error != 0);
563 		if (error < 0) {
564 			const struct vm_exit *vme;
565 			void *outp = entry.exit_data;
566 
567 			error = 0;
568 			vme = vm_exitinfo(sc->vmm_vm, vcpu);
569 			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
570 				error = EFAULT;
571 			}
572 		}
573 		break;
574 	}
575 	case VM_SUSPEND: {
576 		struct vm_suspend vmsuspend;
577 
578 		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
579 			error = EFAULT;
580 			break;
581 		}
582 		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
583 		break;
584 	}
585 	case VM_REINIT: {
586 		struct vm_reinit reinit;
587 
588 		if (ddi_copyin(datap, &reinit, sizeof (reinit), md)) {
589 			error = EFAULT;
590 			break;
591 		}
592 		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
593 			/*
594 			 * The VM instance should be free of driver-attached
595 			 * hooks during the reinitialization process.
596 			 */
597 			break;
598 		}
599 		error = vm_reinit(sc->vmm_vm, reinit.flags);
600 		(void) vmm_drv_block_hook(sc, B_FALSE);
601 		break;
602 	}
603 	case VM_STAT_DESC: {
604 		struct vm_stat_desc statdesc;
605 
606 		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
607 			error = EFAULT;
608 			break;
609 		}
610 		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
611 		    sizeof (statdesc.desc));
612 		if (error == 0 &&
613 		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
614 			error = EFAULT;
615 			break;
616 		}
617 		break;
618 	}
619 	case VM_STATS_IOC: {
620 		struct vm_stats vmstats;
621 
622 		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
623 			error = EFAULT;
624 			break;
625 		}
626 		hrt2tv(gethrtime(), &vmstats.tv);
627 		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, vmstats.index,
628 		    nitems(vmstats.statbuf),
629 		    &vmstats.num_entries, vmstats.statbuf);
630 		if (error == 0 &&
631 		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
632 			error = EFAULT;
633 			break;
634 		}
635 		break;
636 	}
637 
638 	case VM_PPTDEV_MSI: {
639 		struct vm_pptdev_msi pptmsi;
640 
641 		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
642 			error = EFAULT;
643 			break;
644 		}
645 		error = ppt_setup_msi(sc->vmm_vm, pptmsi.vcpu, pptmsi.pptfd,
646 		    pptmsi.addr, pptmsi.msg, pptmsi.numvec);
647 		break;
648 	}
649 	case VM_PPTDEV_MSIX: {
650 		struct vm_pptdev_msix pptmsix;
651 
652 		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
653 			error = EFAULT;
654 			break;
655 		}
656 		error = ppt_setup_msix(sc->vmm_vm, pptmsix.vcpu, pptmsix.pptfd,
657 		    pptmsix.idx, pptmsix.addr, pptmsix.msg,
658 		    pptmsix.vector_control);
659 		break;
660 	}
661 	case VM_PPTDEV_DISABLE_MSIX: {
662 		struct vm_pptdev pptdev;
663 
664 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
665 			error = EFAULT;
666 			break;
667 		}
668 		error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
669 		break;
670 	}
671 	case VM_MAP_PPTDEV_MMIO: {
672 		struct vm_pptdev_mmio pptmmio;
673 
674 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
675 			error = EFAULT;
676 			break;
677 		}
678 		error = ppt_map_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
679 		    pptmmio.len, pptmmio.hpa);
680 		break;
681 	}
682 	case VM_UNMAP_PPTDEV_MMIO: {
683 		struct vm_pptdev_mmio pptmmio;
684 
685 		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
686 			error = EFAULT;
687 			break;
688 		}
689 		error = ppt_unmap_mmio(sc->vmm_vm, pptmmio.pptfd, pptmmio.gpa,
690 		    pptmmio.len);
691 		break;
692 	}
693 	case VM_BIND_PPTDEV: {
694 		struct vm_pptdev pptdev;
695 
696 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
697 			error = EFAULT;
698 			break;
699 		}
700 		error = vm_assign_pptdev(sc->vmm_vm, pptdev.pptfd);
701 		break;
702 	}
703 	case VM_UNBIND_PPTDEV: {
704 		struct vm_pptdev pptdev;
705 
706 		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
707 			error = EFAULT;
708 			break;
709 		}
710 		error = vm_unassign_pptdev(sc->vmm_vm, pptdev.pptfd);
711 		break;
712 	}
713 	case VM_GET_PPTDEV_LIMITS: {
714 		struct vm_pptdev_limits pptlimits;
715 
716 		if (ddi_copyin(datap, &pptlimits, sizeof (pptlimits), md)) {
717 			error = EFAULT;
718 			break;
719 		}
720 		error = ppt_get_limits(sc->vmm_vm, pptlimits.pptfd,
721 		    &pptlimits.msi_limit, &pptlimits.msix_limit);
722 		if (error == 0 &&
723 		    ddi_copyout(&pptlimits, datap, sizeof (pptlimits), md)) {
724 			error = EFAULT;
725 			break;
726 		}
727 		break;
728 	}
729 	case VM_INJECT_EXCEPTION: {
730 		struct vm_exception vmexc;
731 		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
732 			error = EFAULT;
733 			break;
734 		}
735 		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
736 		    vmexc.error_code_valid != 0, vmexc.error_code,
737 		    vmexc.restart_instruction != 0);
738 		break;
739 	}
740 	case VM_INJECT_NMI: {
741 		struct vm_nmi vmnmi;
742 
743 		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
744 			error = EFAULT;
745 			break;
746 		}
747 		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
748 		break;
749 	}
750 	case VM_LAPIC_IRQ: {
751 		struct vm_lapic_irq vmirq;
752 
753 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
754 			error = EFAULT;
755 			break;
756 		}
757 		error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector);
758 		break;
759 	}
760 	case VM_LAPIC_LOCAL_IRQ: {
761 		struct vm_lapic_irq vmirq;
762 
763 		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
764 			error = EFAULT;
765 			break;
766 		}
767 		error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid,
768 		    vmirq.vector);
769 		break;
770 	}
771 	case VM_LAPIC_MSI: {
772 		struct vm_lapic_msi vmmsi;
773 
774 		if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) {
775 			error = EFAULT;
776 			break;
777 		}
778 		error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg);
779 		break;
780 	}
781 
782 	case VM_IOAPIC_ASSERT_IRQ: {
783 		struct vm_ioapic_irq ioapic_irq;
784 
785 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
786 			error = EFAULT;
787 			break;
788 		}
789 		error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq);
790 		break;
791 	}
792 	case VM_IOAPIC_DEASSERT_IRQ: {
793 		struct vm_ioapic_irq ioapic_irq;
794 
795 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
796 			error = EFAULT;
797 			break;
798 		}
799 		error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq);
800 		break;
801 	}
802 	case VM_IOAPIC_PULSE_IRQ: {
803 		struct vm_ioapic_irq ioapic_irq;
804 
805 		if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) {
806 			error = EFAULT;
807 			break;
808 		}
809 		error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq);
810 		break;
811 	}
812 	case VM_IOAPIC_PINCOUNT: {
813 		int pincount;
814 
815 		pincount = vioapic_pincount(sc->vmm_vm);
816 		if (ddi_copyout(&pincount, datap, sizeof (int), md)) {
817 			error = EFAULT;
818 			break;
819 		}
820 		break;
821 	}
822 	case VM_DESC_FPU_AREA: {
823 		struct vm_fpu_desc desc;
824 		void *buf = NULL;
825 
826 		if (ddi_copyin(datap, &desc, sizeof (desc), md)) {
827 			error = EFAULT;
828 			break;
829 		}
830 		if (desc.vfd_num_entries > 64) {
831 			error = EINVAL;
832 			break;
833 		}
834 		const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) *
835 		    desc.vfd_num_entries;
836 		if (buf_sz != 0) {
837 			buf = kmem_zalloc(buf_sz, KM_SLEEP);
838 		}
839 
840 		/*
841 		 * For now, we are depending on vm_fpu_desc_entry and
842 		 * hma_xsave_state_desc_t having the same format.
843 		 */
844 		CTASSERT(sizeof (struct vm_fpu_desc_entry) ==
845 		    sizeof (hma_xsave_state_desc_t));
846 
847 		size_t req_size;
848 		const uint_t max_entries = hma_fpu_describe_xsave_state(
849 		    (hma_xsave_state_desc_t *)buf,
850 		    desc.vfd_num_entries,
851 		    &req_size);
852 
853 		desc.vfd_req_size = req_size;
854 		desc.vfd_num_entries = max_entries;
855 		if (buf_sz != 0) {
856 			if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) {
857 				error = EFAULT;
858 			}
859 			kmem_free(buf, buf_sz);
860 		}
861 
862 		if (error == 0) {
863 			if (ddi_copyout(&desc, datap, sizeof (desc), md)) {
864 				error = EFAULT;
865 			}
866 		}
867 		break;
868 	}
869 	case VM_SET_AUTODESTRUCT: {
870 		/*
871 		 * Since this has to do with controlling the lifetime of the
872 		 * greater vmm_softc_t, the flag is protected by vmm_mtx, rather
873 		 * than the vcpu-centric or rwlock exclusion mechanisms.
874 		 */
875 		mutex_enter(&vmm_mtx);
876 		if (arg != 0) {
877 			sc->vmm_flags |= VMM_AUTODESTROY;
878 		} else {
879 			sc->vmm_flags &= ~VMM_AUTODESTROY;
880 		}
881 		mutex_exit(&vmm_mtx);
882 		break;
883 	}
884 	case VM_DESTROY_SELF: {
885 		bool hma_release = false;
886 
887 		/*
888 		 * Just like VMM_DESTROY_VM, but on the instance file descriptor
889 		 * itself, rather than having to perform a racy name lookup as
890 		 * part of the destroy process.
891 		 *
892 		 * Since vmm_destroy_locked() performs vCPU lock acquisition in
893 		 * order to kick the vCPUs out of guest context as part of any
894 		 * destruction, we do not need to worry about it ourself using
895 		 * the `lock_type` logic here.
896 		 */
897 		mutex_enter(&vmm_mtx);
898 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
899 		mutex_exit(&vmm_mtx);
900 		if (hma_release) {
901 			vmm_hma_release();
902 		}
903 		break;
904 	}
905 	case VM_DESTROY_PENDING: {
906 		/*
907 		 * If we have made it this far, then destruction of the instance
908 		 * has not been initiated.
909 		 */
910 		*rvalp = 0;
911 		break;
912 	}
913 
914 	case VM_ISA_ASSERT_IRQ: {
915 		struct vm_isa_irq isa_irq;
916 
917 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
918 			error = EFAULT;
919 			break;
920 		}
921 		error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq);
922 		if (error == 0 && isa_irq.ioapic_irq != -1) {
923 			error = vioapic_assert_irq(sc->vmm_vm,
924 			    isa_irq.ioapic_irq);
925 		}
926 		break;
927 	}
928 	case VM_ISA_DEASSERT_IRQ: {
929 		struct vm_isa_irq isa_irq;
930 
931 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
932 			error = EFAULT;
933 			break;
934 		}
935 		error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq);
936 		if (error == 0 && isa_irq.ioapic_irq != -1) {
937 			error = vioapic_deassert_irq(sc->vmm_vm,
938 			    isa_irq.ioapic_irq);
939 		}
940 		break;
941 	}
942 	case VM_ISA_PULSE_IRQ: {
943 		struct vm_isa_irq isa_irq;
944 
945 		if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) {
946 			error = EFAULT;
947 			break;
948 		}
949 		error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq);
950 		if (error == 0 && isa_irq.ioapic_irq != -1) {
951 			error = vioapic_pulse_irq(sc->vmm_vm,
952 			    isa_irq.ioapic_irq);
953 		}
954 		break;
955 	}
956 	case VM_ISA_SET_IRQ_TRIGGER: {
957 		struct vm_isa_irq_trigger isa_irq_trigger;
958 
959 		if (ddi_copyin(datap, &isa_irq_trigger,
960 		    sizeof (isa_irq_trigger), md)) {
961 			error = EFAULT;
962 			break;
963 		}
964 		error = vatpic_set_irq_trigger(sc->vmm_vm,
965 		    isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger);
966 		break;
967 	}
968 
969 	case VM_MMAP_GETNEXT: {
970 		struct vm_memmap mm;
971 
972 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
973 			error = EFAULT;
974 			break;
975 		}
976 		error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid,
977 		    &mm.segoff, &mm.len, &mm.prot, &mm.flags);
978 		if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) {
979 			error = EFAULT;
980 			break;
981 		}
982 		break;
983 	}
984 	case VM_MMAP_MEMSEG: {
985 		struct vm_memmap mm;
986 
987 		if (ddi_copyin(datap, &mm, sizeof (mm), md)) {
988 			error = EFAULT;
989 			break;
990 		}
991 		error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff,
992 		    mm.len, mm.prot, mm.flags);
993 		break;
994 	}
995 	case VM_MUNMAP_MEMSEG: {
996 		struct vm_munmap mu;
997 
998 		if (ddi_copyin(datap, &mu, sizeof (mu), md)) {
999 			error = EFAULT;
1000 			break;
1001 		}
1002 		error = vm_munmap_memseg(sc->vmm_vm, mu.gpa, mu.len);
1003 		break;
1004 	}
1005 	case VM_ALLOC_MEMSEG: {
1006 		struct vm_memseg vmseg;
1007 
1008 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1009 			error = EFAULT;
1010 			break;
1011 		}
1012 		error = vmmdev_alloc_memseg(sc, &vmseg);
1013 		break;
1014 	}
1015 	case VM_GET_MEMSEG: {
1016 		struct vm_memseg vmseg;
1017 
1018 		if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) {
1019 			error = EFAULT;
1020 			break;
1021 		}
1022 		error = vmmdev_get_memseg(sc, &vmseg);
1023 		if (error == 0 &&
1024 		    ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) {
1025 			error = EFAULT;
1026 			break;
1027 		}
1028 		break;
1029 	}
1030 	case VM_GET_REGISTER: {
1031 		struct vm_register vmreg;
1032 
1033 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1034 			error = EFAULT;
1035 			break;
1036 		}
1037 		error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum,
1038 		    &vmreg.regval);
1039 		if (error == 0 &&
1040 		    ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) {
1041 			error = EFAULT;
1042 			break;
1043 		}
1044 		break;
1045 	}
1046 	case VM_SET_REGISTER: {
1047 		struct vm_register vmreg;
1048 
1049 		if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) {
1050 			error = EFAULT;
1051 			break;
1052 		}
1053 		error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum,
1054 		    vmreg.regval);
1055 		break;
1056 	}
1057 	case VM_SET_SEGMENT_DESCRIPTOR: {
1058 		struct vm_seg_desc vmsegd;
1059 
1060 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1061 			error = EFAULT;
1062 			break;
1063 		}
1064 		error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1065 		    &vmsegd.desc);
1066 		break;
1067 	}
1068 	case VM_GET_SEGMENT_DESCRIPTOR: {
1069 		struct vm_seg_desc vmsegd;
1070 
1071 		if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) {
1072 			error = EFAULT;
1073 			break;
1074 		}
1075 		error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum,
1076 		    &vmsegd.desc);
1077 		if (error == 0 &&
1078 		    ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) {
1079 			error = EFAULT;
1080 			break;
1081 		}
1082 		break;
1083 	}
1084 	case VM_GET_REGISTER_SET: {
1085 		struct vm_register_set vrs;
1086 		int regnums[VM_REG_LAST];
1087 		uint64_t regvals[VM_REG_LAST];
1088 
1089 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1090 			error = EFAULT;
1091 			break;
1092 		}
1093 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1094 			error = EINVAL;
1095 			break;
1096 		}
1097 		if (ddi_copyin(vrs.regnums, regnums,
1098 		    sizeof (int) * vrs.count, md)) {
1099 			error = EFAULT;
1100 			break;
1101 		}
1102 
1103 		error = 0;
1104 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1105 			if (regnums[i] < 0) {
1106 				error = EINVAL;
1107 				break;
1108 			}
1109 			error = vm_get_register(sc->vmm_vm, vcpu, regnums[i],
1110 			    &regvals[i]);
1111 		}
1112 		if (error == 0 && ddi_copyout(regvals, vrs.regvals,
1113 		    sizeof (uint64_t) * vrs.count, md)) {
1114 			error = EFAULT;
1115 		}
1116 		break;
1117 	}
1118 	case VM_SET_REGISTER_SET: {
1119 		struct vm_register_set vrs;
1120 		int regnums[VM_REG_LAST];
1121 		uint64_t regvals[VM_REG_LAST];
1122 
1123 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1124 			error = EFAULT;
1125 			break;
1126 		}
1127 		if (vrs.count > VM_REG_LAST || vrs.count == 0) {
1128 			error = EINVAL;
1129 			break;
1130 		}
1131 		if (ddi_copyin(vrs.regnums, regnums,
1132 		    sizeof (int) * vrs.count, md)) {
1133 			error = EFAULT;
1134 			break;
1135 		}
1136 		if (ddi_copyin(vrs.regvals, regvals,
1137 		    sizeof (uint64_t) * vrs.count, md)) {
1138 			error = EFAULT;
1139 			break;
1140 		}
1141 
1142 		error = 0;
1143 		for (uint_t i = 0; i < vrs.count && error == 0; i++) {
1144 			/*
1145 			 * Setting registers in a set is not atomic, since a
1146 			 * failure in the middle of the set will cause a
1147 			 * bail-out and inconsistent register state.  Callers
1148 			 * should be wary of this.
1149 			 */
1150 			if (regnums[i] < 0) {
1151 				error = EINVAL;
1152 				break;
1153 			}
1154 			error = vm_set_register(sc->vmm_vm, vcpu, regnums[i],
1155 			    regvals[i]);
1156 		}
1157 		break;
1158 	}
1159 	case VM_RESET_CPU: {
1160 		struct vm_vcpu_reset vvr;
1161 
1162 		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
1163 			error = EFAULT;
1164 			break;
1165 		}
1166 		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
1167 			error = EINVAL;
1168 		}
1169 
1170 		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
1171 		break;
1172 	}
1173 	case VM_GET_RUN_STATE: {
1174 		struct vm_run_state vrs;
1175 
1176 		bzero(&vrs, sizeof (vrs));
1177 		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
1178 		    &vrs.sipi_vector);
1179 		if (error == 0) {
1180 			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
1181 				error = EFAULT;
1182 				break;
1183 			}
1184 		}
1185 		break;
1186 	}
1187 	case VM_SET_RUN_STATE: {
1188 		struct vm_run_state vrs;
1189 
1190 		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
1191 			error = EFAULT;
1192 			break;
1193 		}
1194 		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
1195 		    vrs.sipi_vector);
1196 		break;
1197 	}
1198 	case VM_GET_FPU: {
1199 		struct vm_fpu_state req;
1200 		const size_t max_len = (PAGESIZE * 2);
1201 		void *kbuf;
1202 
1203 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1204 			error = EFAULT;
1205 			break;
1206 		}
1207 		if (req.len > max_len || req.len == 0) {
1208 			error = EINVAL;
1209 			break;
1210 		}
1211 		kbuf = kmem_zalloc(req.len, KM_SLEEP);
1212 		error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1213 		if (error == 0) {
1214 			if (ddi_copyout(kbuf, req.buf, req.len, md)) {
1215 				error = EFAULT;
1216 			}
1217 		}
1218 		kmem_free(kbuf, req.len);
1219 		break;
1220 	}
1221 	case VM_SET_FPU: {
1222 		struct vm_fpu_state req;
1223 		const size_t max_len = (PAGESIZE * 2);
1224 		void *kbuf;
1225 
1226 		if (ddi_copyin(datap, &req, sizeof (req), md)) {
1227 			error = EFAULT;
1228 			break;
1229 		}
1230 		if (req.len > max_len || req.len == 0) {
1231 			error = EINVAL;
1232 			break;
1233 		}
1234 		kbuf = kmem_alloc(req.len, KM_SLEEP);
1235 		if (ddi_copyin(req.buf, kbuf, req.len, md)) {
1236 			error = EFAULT;
1237 		} else {
1238 			error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len);
1239 		}
1240 		kmem_free(kbuf, req.len);
1241 		break;
1242 	}
1243 	case VM_GET_CPUID: {
1244 		struct vm_vcpu_cpuid_config cfg;
1245 		struct vcpu_cpuid_entry *entries = NULL;
1246 
1247 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1248 			error = EFAULT;
1249 			break;
1250 		}
1251 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1252 			error = EINVAL;
1253 			break;
1254 		}
1255 
1256 		const size_t entries_size =
1257 		    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1258 		if (entries_size != 0) {
1259 			entries = kmem_zalloc(entries_size, KM_SLEEP);
1260 		}
1261 
1262 		vcpu_cpuid_config_t vm_cfg = {
1263 			.vcc_nent = cfg.vvcc_nent,
1264 			.vcc_entries = entries,
1265 		};
1266 		error = vm_get_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1267 
1268 		/*
1269 		 * Only attempt to copy out the resultant entries if we were
1270 		 * able to query them from the instance.  The flags and number
1271 		 * of entries are emitted regardless.
1272 		 */
1273 		cfg.vvcc_flags = vm_cfg.vcc_flags;
1274 		cfg.vvcc_nent = vm_cfg.vcc_nent;
1275 		if (entries != NULL) {
1276 			if (error == 0 && ddi_copyout(entries, cfg.vvcc_entries,
1277 			    entries_size, md) != 0) {
1278 				error = EFAULT;
1279 			}
1280 
1281 			kmem_free(entries, entries_size);
1282 		}
1283 
1284 		if (ddi_copyout(&cfg, datap, sizeof (cfg), md) != 0) {
1285 			error = EFAULT;
1286 		}
1287 		break;
1288 	}
1289 	case VM_SET_CPUID: {
1290 		struct vm_vcpu_cpuid_config cfg;
1291 		struct vcpu_cpuid_entry *entries = NULL;
1292 		size_t entries_size = 0;
1293 
1294 		if (ddi_copyin(datap, &cfg, sizeof (cfg), md)) {
1295 			error = EFAULT;
1296 			break;
1297 		}
1298 		if (cfg.vvcc_nent > VMM_MAX_CPUID_ENTRIES) {
1299 			error = EFBIG;
1300 			break;
1301 		}
1302 		if ((cfg.vvcc_flags & VCC_FLAG_LEGACY_HANDLING) != 0) {
1303 			/*
1304 			 * If we are being instructed to use "legacy" handling,
1305 			 * then no entries should be provided, since the static
1306 			 * in-kernel masking will be used.
1307 			 */
1308 			if (cfg.vvcc_nent != 0) {
1309 				error = EINVAL;
1310 				break;
1311 			}
1312 		} else if (cfg.vvcc_nent != 0) {
1313 			entries_size =
1314 			    cfg.vvcc_nent * sizeof (struct vcpu_cpuid_entry);
1315 			entries = kmem_alloc(entries_size, KM_SLEEP);
1316 
1317 			if (ddi_copyin(cfg.vvcc_entries, entries, entries_size,
1318 			    md) != 0) {
1319 				error = EFAULT;
1320 				kmem_free(entries, entries_size);
1321 				break;
1322 			}
1323 		}
1324 
1325 		vcpu_cpuid_config_t vm_cfg = {
1326 			.vcc_flags = cfg.vvcc_flags,
1327 			.vcc_nent = cfg.vvcc_nent,
1328 			.vcc_entries = entries,
1329 		};
1330 		error = vm_set_cpuid(sc->vmm_vm, vcpu, &vm_cfg);
1331 
1332 		if (entries != NULL) {
1333 			kmem_free(entries, entries_size);
1334 		}
1335 		break;
1336 	}
1337 	case VM_LEGACY_CPUID: {
1338 		struct vm_legacy_cpuid vlc;
1339 		if (ddi_copyin(datap, &vlc, sizeof (vlc), md)) {
1340 			error = EFAULT;
1341 			break;
1342 		}
1343 		vlc.vlc_vcpuid = vcpu;
1344 
1345 		legacy_emulate_cpuid(sc->vmm_vm, vcpu, &vlc.vlc_eax,
1346 		    &vlc.vlc_ebx, &vlc.vlc_ecx, &vlc.vlc_edx);
1347 
1348 		if (ddi_copyout(&vlc, datap, sizeof (vlc), md)) {
1349 			error = EFAULT;
1350 			break;
1351 		}
1352 		break;
1353 	}
1354 
1355 	case VM_SET_KERNEMU_DEV:
1356 	case VM_GET_KERNEMU_DEV: {
1357 		struct vm_readwrite_kernemu_device kemu;
1358 		size_t size = 0;
1359 
1360 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
1361 			error = EFAULT;
1362 			break;
1363 		}
1364 
1365 		if (kemu.access_width > 3) {
1366 			error = EINVAL;
1367 			break;
1368 		}
1369 		size = (1 << kemu.access_width);
1370 		ASSERT(size >= 1 && size <= 8);
1371 
1372 		if (cmd == VM_SET_KERNEMU_DEV) {
1373 			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
1374 			    kemu.gpa, kemu.value, size);
1375 		} else {
1376 			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
1377 			    kemu.gpa, &kemu.value, size);
1378 		}
1379 
1380 		if (error == 0) {
1381 			if (ddi_copyout(&kemu, datap, sizeof (kemu), md)) {
1382 				error = EFAULT;
1383 				break;
1384 			}
1385 		}
1386 		break;
1387 	}
1388 
1389 	case VM_GET_CAPABILITY: {
1390 		struct vm_capability vmcap;
1391 
1392 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1393 			error = EFAULT;
1394 			break;
1395 		}
1396 		error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype,
1397 		    &vmcap.capval);
1398 		if (error == 0 &&
1399 		    ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) {
1400 			error = EFAULT;
1401 			break;
1402 		}
1403 		break;
1404 	}
1405 	case VM_SET_CAPABILITY: {
1406 		struct vm_capability vmcap;
1407 
1408 		if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) {
1409 			error = EFAULT;
1410 			break;
1411 		}
1412 		error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype,
1413 		    vmcap.capval);
1414 		break;
1415 	}
1416 	case VM_SET_X2APIC_STATE: {
1417 		struct vm_x2apic x2apic;
1418 
1419 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1420 			error = EFAULT;
1421 			break;
1422 		}
1423 		error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state);
1424 		break;
1425 	}
1426 	case VM_GET_X2APIC_STATE: {
1427 		struct vm_x2apic x2apic;
1428 
1429 		if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) {
1430 			error = EFAULT;
1431 			break;
1432 		}
1433 		error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid,
1434 		    &x2apic.state);
1435 		if (error == 0 &&
1436 		    ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) {
1437 			error = EFAULT;
1438 			break;
1439 		}
1440 		break;
1441 	}
1442 	case VM_GET_GPA_PMAP: {
1443 		/*
1444 		 * Until there is a necessity to leak EPT/RVI PTE values to
1445 		 * userspace, this will remain unimplemented
1446 		 */
1447 		error = EINVAL;
1448 		break;
1449 	}
1450 	case VM_GET_HPET_CAPABILITIES: {
1451 		struct vm_hpet_cap hpetcap;
1452 
1453 		error = vhpet_getcap(&hpetcap);
1454 		if (error == 0 &&
1455 		    ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) {
1456 			error = EFAULT;
1457 			break;
1458 		}
1459 		break;
1460 	}
1461 	case VM_GLA2GPA: {
1462 		struct vm_gla2gpa gg;
1463 
1464 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1465 			error = EFAULT;
1466 			break;
1467 		}
1468 		gg.vcpuid = vcpu;
1469 		error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla,
1470 		    gg.prot, &gg.gpa, &gg.fault);
1471 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1472 			error = EFAULT;
1473 			break;
1474 		}
1475 		break;
1476 	}
1477 	case VM_GLA2GPA_NOFAULT: {
1478 		struct vm_gla2gpa gg;
1479 
1480 		if (ddi_copyin(datap, &gg, sizeof (gg), md)) {
1481 			error = EFAULT;
1482 			break;
1483 		}
1484 		gg.vcpuid = vcpu;
1485 		error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging,
1486 		    gg.gla, gg.prot, &gg.gpa, &gg.fault);
1487 		if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) {
1488 			error = EFAULT;
1489 			break;
1490 		}
1491 		break;
1492 	}
1493 
1494 	case VM_ACTIVATE_CPU:
1495 		error = vm_activate_cpu(sc->vmm_vm, vcpu);
1496 		break;
1497 
1498 	case VM_SUSPEND_CPU:
1499 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1500 			error = EFAULT;
1501 		} else {
1502 			error = vm_suspend_cpu(sc->vmm_vm, vcpu);
1503 		}
1504 		break;
1505 
1506 	case VM_RESUME_CPU:
1507 		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
1508 			error = EFAULT;
1509 		} else {
1510 			error = vm_resume_cpu(sc->vmm_vm, vcpu);
1511 		}
1512 		break;
1513 
1514 	case VM_GET_CPUS: {
1515 		struct vm_cpuset vm_cpuset;
1516 		cpuset_t tempset;
1517 		void *srcp = &tempset;
1518 		int size;
1519 
1520 		if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) {
1521 			error = EFAULT;
1522 			break;
1523 		}
1524 
1525 		/* Be more generous about sizing since our cpuset_t is large. */
1526 		size = vm_cpuset.cpusetsize;
1527 		if (size <= 0 || size > sizeof (cpuset_t)) {
1528 			error = ERANGE;
1529 		}
1530 		/*
1531 		 * If they want a ulong_t or less, make sure they receive the
1532 		 * low bits with all the useful information.
1533 		 */
1534 		if (size <= sizeof (tempset.cpub[0])) {
1535 			srcp = &tempset.cpub[0];
1536 		}
1537 
1538 		if (vm_cpuset.which == VM_ACTIVE_CPUS) {
1539 			tempset = vm_active_cpus(sc->vmm_vm);
1540 		} else if (vm_cpuset.which == VM_SUSPENDED_CPUS) {
1541 			tempset = vm_suspended_cpus(sc->vmm_vm);
1542 		} else if (vm_cpuset.which == VM_DEBUG_CPUS) {
1543 			tempset = vm_debug_cpus(sc->vmm_vm);
1544 		} else {
1545 			error = EINVAL;
1546 		}
1547 
1548 		ASSERT(size > 0 && size <= sizeof (tempset));
1549 		if (error == 0 &&
1550 		    ddi_copyout(srcp, vm_cpuset.cpus, size, md)) {
1551 			error = EFAULT;
1552 			break;
1553 		}
1554 		break;
1555 	}
1556 	case VM_SET_INTINFO: {
1557 		struct vm_intinfo vmii;
1558 
1559 		if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) {
1560 			error = EFAULT;
1561 			break;
1562 		}
1563 		error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1);
1564 		break;
1565 	}
1566 	case VM_GET_INTINFO: {
1567 		struct vm_intinfo vmii;
1568 
1569 		vmii.vcpuid = vcpu;
1570 		error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1,
1571 		    &vmii.info2);
1572 		if (error == 0 &&
1573 		    ddi_copyout(&vmii, datap, sizeof (vmii), md)) {
1574 			error = EFAULT;
1575 			break;
1576 		}
1577 		break;
1578 	}
1579 	case VM_RTC_WRITE: {
1580 		struct vm_rtc_data rtcdata;
1581 
1582 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1583 			error = EFAULT;
1584 			break;
1585 		}
1586 		error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset,
1587 		    rtcdata.value);
1588 		break;
1589 	}
1590 	case VM_RTC_READ: {
1591 		struct vm_rtc_data rtcdata;
1592 
1593 		if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) {
1594 			error = EFAULT;
1595 			break;
1596 		}
1597 		error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset,
1598 		    &rtcdata.value);
1599 		if (error == 0 &&
1600 		    ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) {
1601 			error = EFAULT;
1602 			break;
1603 		}
1604 		break;
1605 	}
1606 	case VM_RTC_SETTIME: {
1607 		struct vm_rtc_time rtctime;
1608 
1609 		if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) {
1610 			error = EFAULT;
1611 			break;
1612 		}
1613 		error = vrtc_set_time(sc->vmm_vm, rtctime.secs);
1614 		break;
1615 	}
1616 	case VM_RTC_GETTIME: {
1617 		struct vm_rtc_time rtctime;
1618 
1619 		rtctime.secs = vrtc_get_time(sc->vmm_vm);
1620 		if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) {
1621 			error = EFAULT;
1622 			break;
1623 		}
1624 		break;
1625 	}
1626 
1627 	case VM_PMTMR_LOCATE: {
1628 		uint16_t port = arg;
1629 		error = vpmtmr_set_location(sc->vmm_vm, port);
1630 		break;
1631 	}
1632 
1633 	case VM_RESTART_INSTRUCTION:
1634 		error = vm_restart_instruction(sc->vmm_vm, vcpu);
1635 		break;
1636 
1637 	case VM_SET_TOPOLOGY: {
1638 		struct vm_cpu_topology topo;
1639 
1640 		if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) {
1641 			error = EFAULT;
1642 			break;
1643 		}
1644 		error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores,
1645 		    topo.threads, topo.maxcpus);
1646 		break;
1647 	}
1648 	case VM_GET_TOPOLOGY: {
1649 		struct vm_cpu_topology topo;
1650 
1651 		vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores,
1652 		    &topo.threads, &topo.maxcpus);
1653 		if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) {
1654 			error = EFAULT;
1655 			break;
1656 		}
1657 		break;
1658 	}
1659 	case VM_DEVMEM_GETOFFSET: {
1660 		struct vm_devmem_offset vdo;
1661 		vmm_devmem_entry_t *de;
1662 
1663 		if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) {
1664 			error = EFAULT;
1665 			break;
1666 		}
1667 
1668 		de = vmmdev_devmem_find(sc, vdo.segid);
1669 		if (de != NULL) {
1670 			vdo.offset = de->vde_off;
1671 			if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) {
1672 				error = EFAULT;
1673 			}
1674 		} else {
1675 			error = ENOENT;
1676 		}
1677 		break;
1678 	}
1679 	case VM_TRACK_DIRTY_PAGES: {
1680 		const size_t max_track_region_len = 8 * PAGESIZE * 8 * PAGESIZE;
1681 		struct vmm_dirty_tracker tracker;
1682 		uint8_t *bitmap;
1683 		size_t len;
1684 
1685 		if (ddi_copyin(datap, &tracker, sizeof (tracker), md) != 0) {
1686 			error = EFAULT;
1687 			break;
1688 		}
1689 		if ((tracker.vdt_start_gpa & PAGEOFFSET) != 0) {
1690 			error = EINVAL;
1691 			break;
1692 		}
1693 		if (tracker.vdt_len == 0) {
1694 			break;
1695 		}
1696 		if ((tracker.vdt_len & PAGEOFFSET) != 0) {
1697 			error = EINVAL;
1698 			break;
1699 		}
1700 		if (tracker.vdt_len > max_track_region_len) {
1701 			error = EINVAL;
1702 			break;
1703 		}
1704 		len = roundup(tracker.vdt_len / PAGESIZE, 8) / 8;
1705 		bitmap = kmem_zalloc(len, KM_SLEEP);
1706 		error = vm_track_dirty_pages(sc->vmm_vm, tracker.vdt_start_gpa,
1707 		    tracker.vdt_len, bitmap);
1708 		if (error == 0 &&
1709 		    ddi_copyout(bitmap, tracker.vdt_pfns, len, md) != 0) {
1710 			error = EFAULT;
1711 		}
1712 		kmem_free(bitmap, len);
1713 
1714 		break;
1715 	}
1716 	case VM_WRLOCK_CYCLE: {
1717 		/*
1718 		 * Present a test mechanism to acquire/release the write lock
1719 		 * on the VM without any other effects.
1720 		 */
1721 		break;
1722 	}
1723 	case VM_DATA_READ: {
1724 		struct vm_data_xfer vdx;
1725 
1726 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1727 			error = EFAULT;
1728 			break;
1729 		}
1730 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1731 			error = EINVAL;
1732 			break;
1733 		}
1734 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1735 			error = EFBIG;
1736 			break;
1737 		}
1738 
1739 		const size_t len = vdx.vdx_len;
1740 		void *buf = NULL;
1741 		if (len != 0) {
1742 			const void *udata = vdx.vdx_data;
1743 
1744 			buf = kmem_alloc(len, KM_SLEEP);
1745 			if ((vdx.vdx_flags & VDX_FLAG_READ_COPYIN) == 0) {
1746 				bzero(buf, len);
1747 			} else if (ddi_copyin(udata, buf, len, md) != 0) {
1748 				kmem_free(buf, len);
1749 				error = EFAULT;
1750 				break;
1751 			}
1752 		}
1753 
1754 		vdx.vdx_result_len = 0;
1755 		vmm_data_req_t req = {
1756 			.vdr_class = vdx.vdx_class,
1757 			.vdr_version = vdx.vdx_version,
1758 			.vdr_flags = vdx.vdx_flags,
1759 			.vdr_len = len,
1760 			.vdr_data = buf,
1761 			.vdr_result_len = &vdx.vdx_result_len,
1762 		};
1763 		error = vmm_data_read(sc->vmm_vm, vdx.vdx_vcpuid, &req);
1764 
1765 		if (error == 0 && buf != NULL) {
1766 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1767 				error = EFAULT;
1768 			}
1769 		}
1770 
1771 		/*
1772 		 * Copy out the transfer request so that the value of
1773 		 * vdx_result_len can be made available, regardless of any
1774 		 * error(s) which may have occurred.
1775 		 */
1776 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1777 			error = (error != 0) ? error : EFAULT;
1778 		}
1779 
1780 		if (buf != NULL) {
1781 			kmem_free(buf, len);
1782 		}
1783 		break;
1784 	}
1785 	case VM_DATA_WRITE: {
1786 		struct vm_data_xfer vdx;
1787 
1788 		if (ddi_copyin(datap, &vdx, sizeof (vdx), md) != 0) {
1789 			error = EFAULT;
1790 			break;
1791 		}
1792 		if ((vdx.vdx_flags & ~VDX_FLAGS_VALID) != 0) {
1793 			error = EINVAL;
1794 			break;
1795 		}
1796 		if (vdx.vdx_len > VM_DATA_XFER_LIMIT) {
1797 			error = EFBIG;
1798 			break;
1799 		}
1800 
1801 		const size_t len = vdx.vdx_len;
1802 		void *buf = NULL;
1803 		if (len != 0) {
1804 			buf = kmem_alloc(len, KM_SLEEP);
1805 			if (ddi_copyin(vdx.vdx_data, buf, len, md) != 0) {
1806 				kmem_free(buf, len);
1807 				error = EFAULT;
1808 				break;
1809 			}
1810 		}
1811 
1812 		vdx.vdx_result_len = 0;
1813 		vmm_data_req_t req = {
1814 			.vdr_class = vdx.vdx_class,
1815 			.vdr_version = vdx.vdx_version,
1816 			.vdr_flags = vdx.vdx_flags,
1817 			.vdr_len = len,
1818 			.vdr_data = buf,
1819 			.vdr_result_len = &vdx.vdx_result_len,
1820 		};
1821 		if (vmm_allow_state_writes == 0) {
1822 			/* XXX: Play it safe for now */
1823 			error = EPERM;
1824 		} else {
1825 			error = vmm_data_write(sc->vmm_vm, vdx.vdx_vcpuid,
1826 			    &req);
1827 		}
1828 
1829 		if (error == 0 && buf != NULL &&
1830 		    (vdx.vdx_flags & VDX_FLAG_WRITE_COPYOUT) != 0) {
1831 			if (ddi_copyout(buf, vdx.vdx_data, len, md) != 0) {
1832 				error = EFAULT;
1833 			}
1834 		}
1835 
1836 		/*
1837 		 * Copy out the transfer request so that the value of
1838 		 * vdx_result_len can be made available, regardless of any
1839 		 * error(s) which may have occurred.
1840 		 */
1841 		if (ddi_copyout(&vdx, datap, sizeof (vdx), md) != 0) {
1842 			error = (error != 0) ? error : EFAULT;
1843 		}
1844 
1845 		if (buf != NULL) {
1846 			kmem_free(buf, len);
1847 		}
1848 		break;
1849 	}
1850 
1851 	case VM_PAUSE: {
1852 		error = vm_pause_instance(sc->vmm_vm);
1853 		break;
1854 	}
1855 	case VM_RESUME: {
1856 		error = vm_resume_instance(sc->vmm_vm);
1857 		break;
1858 	}
1859 
1860 	default:
1861 		error = ENOTTY;
1862 		break;
1863 	}
1864 
1865 	/* Release exclusion resources */
1866 	switch (lock_type) {
1867 	case LOCK_NONE:
1868 		break;
1869 	case LOCK_VCPU:
1870 		vcpu_unlock_one(sc, vcpu);
1871 		break;
1872 	case LOCK_READ_HOLD:
1873 		vmm_read_unlock(sc);
1874 		break;
1875 	case LOCK_WRITE_HOLD:
1876 		vmm_write_unlock(sc);
1877 		break;
1878 	default:
1879 		panic("unexpected lock type");
1880 		break;
1881 	}
1882 
1883 	return (error);
1884 }
1885 
1886 static vmm_softc_t *
1887 vmm_lookup(const char *name)
1888 {
1889 	list_t *vml = &vmm_list;
1890 	vmm_softc_t *sc;
1891 
1892 	ASSERT(MUTEX_HELD(&vmm_mtx));
1893 
1894 	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
1895 		if (strcmp(sc->vmm_name, name) == 0) {
1896 			break;
1897 		}
1898 	}
1899 
1900 	return (sc);
1901 }
1902 
1903 /*
1904  * Acquire an HMA registration if not already held.
1905  */
1906 static boolean_t
1907 vmm_hma_acquire(void)
1908 {
1909 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1910 
1911 	mutex_enter(&vmmdev_mtx);
1912 
1913 	if (vmmdev_hma_reg == NULL) {
1914 		VERIFY3U(vmmdev_hma_ref, ==, 0);
1915 		vmmdev_hma_reg = hma_register(vmmdev_hvm_name);
1916 		if (vmmdev_hma_reg == NULL) {
1917 			cmn_err(CE_WARN, "%s HMA registration failed.",
1918 			    vmmdev_hvm_name);
1919 			mutex_exit(&vmmdev_mtx);
1920 			return (B_FALSE);
1921 		}
1922 	}
1923 
1924 	vmmdev_hma_ref++;
1925 
1926 	mutex_exit(&vmmdev_mtx);
1927 
1928 	return (B_TRUE);
1929 }
1930 
1931 /*
1932  * Release the HMA registration if held and there are no remaining VMs.
1933  */
1934 static void
1935 vmm_hma_release(void)
1936 {
1937 	ASSERT(MUTEX_NOT_HELD(&vmm_mtx));
1938 
1939 	mutex_enter(&vmmdev_mtx);
1940 
1941 	VERIFY3U(vmmdev_hma_ref, !=, 0);
1942 
1943 	vmmdev_hma_ref--;
1944 
1945 	if (vmmdev_hma_ref == 0) {
1946 		VERIFY(vmmdev_hma_reg != NULL);
1947 		hma_unregister(vmmdev_hma_reg);
1948 		vmmdev_hma_reg = NULL;
1949 	}
1950 	mutex_exit(&vmmdev_mtx);
1951 }
1952 
1953 static int
1954 vmmdev_do_vm_create(const struct vm_create_req *req, cred_t *cr)
1955 {
1956 	vmm_softc_t	*sc = NULL;
1957 	minor_t		minor;
1958 	int		error = ENOMEM;
1959 	size_t		len;
1960 	const char	*name = req->name;
1961 
1962 	len = strnlen(name, VM_MAX_NAMELEN);
1963 	if (len == 0) {
1964 		return (EINVAL);
1965 	}
1966 	if (len >= VM_MAX_NAMELEN) {
1967 		return (ENAMETOOLONG);
1968 	}
1969 	if (strchr(name, '/') != NULL) {
1970 		return (EINVAL);
1971 	}
1972 
1973 	if (!vmm_hma_acquire())
1974 		return (ENXIO);
1975 
1976 	mutex_enter(&vmm_mtx);
1977 
1978 	/* Look for duplicate names */
1979 	if (vmm_lookup(name) != NULL) {
1980 		mutex_exit(&vmm_mtx);
1981 		vmm_hma_release();
1982 		return (EEXIST);
1983 	}
1984 
1985 	/* Allow only one instance per non-global zone. */
1986 	if (!INGLOBALZONE(curproc)) {
1987 		for (sc = list_head(&vmm_list); sc != NULL;
1988 		    sc = list_next(&vmm_list, sc)) {
1989 			if (sc->vmm_zone == curzone) {
1990 				mutex_exit(&vmm_mtx);
1991 				vmm_hma_release();
1992 				return (EINVAL);
1993 			}
1994 		}
1995 	}
1996 
1997 	minor = id_alloc(vmm_minors);
1998 	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
1999 		goto fail;
2000 	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2001 		ddi_soft_state_free(vmm_statep, minor);
2002 		goto fail;
2003 	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
2004 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
2005 		goto fail;
2006 	}
2007 
2008 	if (vmm_kstat_alloc(sc, minor, cr) != 0) {
2009 		goto fail;
2010 	}
2011 
2012 	error = vm_create(req->flags, &sc->vmm_vm);
2013 	if (error == 0) {
2014 		/* Complete VM intialization and report success. */
2015 		(void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
2016 		sc->vmm_minor = minor;
2017 		list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
2018 		    offsetof(vmm_devmem_entry_t, vde_node));
2019 
2020 		list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
2021 		    offsetof(vmm_hold_t, vmh_node));
2022 		cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
2023 
2024 		mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
2025 		list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
2026 		    offsetof(vmm_lease_t, vml_node));
2027 		cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
2028 		rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
2029 
2030 		sc->vmm_zone = crgetzone(cr);
2031 		zone_hold(sc->vmm_zone);
2032 		vmm_zsd_add_vm(sc);
2033 		vmm_kstat_init(sc);
2034 
2035 		list_insert_tail(&vmm_list, sc);
2036 		mutex_exit(&vmm_mtx);
2037 		return (0);
2038 	}
2039 
2040 	vmm_kstat_fini(sc);
2041 	ddi_remove_minor_node(vmmdev_dip, name);
2042 fail:
2043 	id_free(vmm_minors, minor);
2044 	if (sc != NULL) {
2045 		ddi_soft_state_free(vmm_statep, minor);
2046 	}
2047 	mutex_exit(&vmm_mtx);
2048 	vmm_hma_release();
2049 
2050 	return (error);
2051 }
2052 
2053 /*
2054  * Bhyve 'Driver' Interface
2055  *
2056  * While many devices are emulated in the bhyve userspace process, there are
2057  * others with performance constraints which require that they run mostly or
2058  * entirely in-kernel.  For those not integrated directly into bhyve, an API is
2059  * needed so they can query/manipulate the portions of VM state needed to
2060  * fulfill their purpose.
2061  *
2062  * This includes:
2063  * - Translating guest-physical addresses to host-virtual pointers
2064  * - Injecting MSIs
2065  * - Hooking IO port addresses
2066  *
2067  * The vmm_drv interface exists to provide that functionality to its consumers.
2068  * (At this time, 'viona' is the only user)
2069  */
2070 int
2071 vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp)
2072 {
2073 	vnode_t *vp = fp->f_vnode;
2074 	const dev_t dev = vp->v_rdev;
2075 	vmm_softc_t *sc;
2076 	vmm_hold_t *hold;
2077 	int err = 0;
2078 
2079 	if (vp->v_type != VCHR) {
2080 		return (ENXIO);
2081 	}
2082 	const major_t major = getmajor(dev);
2083 	const minor_t minor = getminor(dev);
2084 
2085 	mutex_enter(&vmmdev_mtx);
2086 	if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) {
2087 		mutex_exit(&vmmdev_mtx);
2088 		return (ENOENT);
2089 	}
2090 	mutex_enter(&vmm_mtx);
2091 	mutex_exit(&vmmdev_mtx);
2092 
2093 	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
2094 		err = ENOENT;
2095 		goto out;
2096 	}
2097 	/* XXXJOY: check cred permissions against instance */
2098 
2099 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2100 		err = EBUSY;
2101 		goto out;
2102 	}
2103 
2104 	hold = kmem_zalloc(sizeof (*hold), KM_SLEEP);
2105 	hold->vmh_sc = sc;
2106 	hold->vmh_release_req = B_FALSE;
2107 
2108 	list_insert_tail(&sc->vmm_holds, hold);
2109 	sc->vmm_flags |= VMM_HELD;
2110 	*holdp = hold;
2111 
2112 out:
2113 	mutex_exit(&vmm_mtx);
2114 	return (err);
2115 }
2116 
2117 void
2118 vmm_drv_rele(vmm_hold_t *hold)
2119 {
2120 	vmm_softc_t *sc;
2121 	bool hma_release = false;
2122 
2123 	ASSERT(hold != NULL);
2124 	ASSERT(hold->vmh_sc != NULL);
2125 	VERIFY(hold->vmh_ioport_hook_cnt == 0);
2126 
2127 	mutex_enter(&vmm_mtx);
2128 	sc = hold->vmh_sc;
2129 	list_remove(&sc->vmm_holds, hold);
2130 	kmem_free(hold, sizeof (*hold));
2131 
2132 	if (list_is_empty(&sc->vmm_holds)) {
2133 		sc->vmm_flags &= ~VMM_HELD;
2134 
2135 		/*
2136 		 * Since outstanding holds would prevent instance destruction
2137 		 * from completing, attempt to finish it now if it was already
2138 		 * set in motion.
2139 		 */
2140 		if ((sc->vmm_flags & VMM_DESTROY) != 0) {
2141 			VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT,
2142 			    &hma_release));
2143 		}
2144 	}
2145 	mutex_exit(&vmm_mtx);
2146 
2147 	if (hma_release) {
2148 		vmm_hma_release();
2149 	}
2150 }
2151 
2152 boolean_t
2153 vmm_drv_release_reqd(vmm_hold_t *hold)
2154 {
2155 	ASSERT(hold != NULL);
2156 
2157 	return (hold->vmh_release_req);
2158 }
2159 
2160 vmm_lease_t *
2161 vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg)
2162 {
2163 	vmm_softc_t *sc = hold->vmh_sc;
2164 	vmm_lease_t *lease;
2165 
2166 	ASSERT3P(expiref, !=, NULL);
2167 
2168 	if (hold->vmh_release_req) {
2169 		return (NULL);
2170 	}
2171 
2172 	lease = kmem_alloc(sizeof (*lease), KM_SLEEP);
2173 	list_link_init(&lease->vml_node);
2174 	lease->vml_expire_func = expiref;
2175 	lease->vml_expire_arg = arg;
2176 	lease->vml_expired = B_FALSE;
2177 	lease->vml_break_deferred = B_FALSE;
2178 	lease->vml_hold = hold;
2179 	/* cache the VM pointer for one less pointer chase */
2180 	lease->vml_vm = sc->vmm_vm;
2181 	lease->vml_vmclient = vmspace_client_alloc(vm_get_vmspace(sc->vmm_vm));
2182 
2183 	mutex_enter(&sc->vmm_lease_lock);
2184 	while (sc->vmm_lease_blocker != 0) {
2185 		cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2186 	}
2187 	list_insert_tail(&sc->vmm_lease_list, lease);
2188 	vmm_read_lock(sc);
2189 	mutex_exit(&sc->vmm_lease_lock);
2190 
2191 	return (lease);
2192 }
2193 
2194 static void
2195 vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease)
2196 {
2197 	ASSERT(MUTEX_HELD(&sc->vmm_lease_lock));
2198 
2199 	list_remove(&sc->vmm_lease_list, lease);
2200 	vmm_read_unlock(sc);
2201 	vmc_destroy(lease->vml_vmclient);
2202 	kmem_free(lease, sizeof (*lease));
2203 }
2204 
2205 static void
2206 vmm_lease_block(vmm_softc_t *sc)
2207 {
2208 	mutex_enter(&sc->vmm_lease_lock);
2209 	VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX);
2210 	sc->vmm_lease_blocker++;
2211 	if (sc->vmm_lease_blocker == 1) {
2212 		list_t *list = &sc->vmm_lease_list;
2213 		vmm_lease_t *lease = list_head(list);
2214 
2215 		while (lease != NULL) {
2216 			void *arg = lease->vml_expire_arg;
2217 			boolean_t (*expiref)(void *) = lease->vml_expire_func;
2218 			boolean_t sync_break = B_FALSE;
2219 
2220 			/*
2221 			 * Since the lease expiration notification may
2222 			 * need to take locks which would deadlock with
2223 			 * vmm_lease_lock, drop it across the call.
2224 			 *
2225 			 * We are the only one allowed to manipulate
2226 			 * vmm_lease_list right now, so it is safe to
2227 			 * continue iterating through it after
2228 			 * reacquiring the lock.
2229 			 */
2230 			lease->vml_expired = B_TRUE;
2231 			mutex_exit(&sc->vmm_lease_lock);
2232 			sync_break = expiref(arg);
2233 			mutex_enter(&sc->vmm_lease_lock);
2234 
2235 			if (sync_break) {
2236 				vmm_lease_t *next;
2237 
2238 				/*
2239 				 * These leases which are synchronously broken
2240 				 * result in vmm_read_unlock() calls from a
2241 				 * different thread than the corresponding
2242 				 * vmm_read_lock().  This is acceptable, given
2243 				 * that the rwlock underpinning the whole
2244 				 * mechanism tolerates the behavior.  This
2245 				 * flexibility is _only_ afforded to VM read
2246 				 * lock (RW_READER) holders.
2247 				 */
2248 				next = list_next(list, lease);
2249 				vmm_lease_break_locked(sc, lease);
2250 				lease = next;
2251 			} else {
2252 				lease = list_next(list, lease);
2253 			}
2254 		}
2255 
2256 		/* Process leases which were not broken synchronously. */
2257 		while (!list_is_empty(list)) {
2258 			/*
2259 			 * Although the nested loops are quadratic, the number
2260 			 * of leases is small.
2261 			 */
2262 			lease = list_head(list);
2263 			while (lease != NULL) {
2264 				vmm_lease_t *next = list_next(list, lease);
2265 				if (lease->vml_break_deferred) {
2266 					vmm_lease_break_locked(sc, lease);
2267 				}
2268 				lease = next;
2269 			}
2270 			if (list_is_empty(list)) {
2271 				break;
2272 			}
2273 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2274 		}
2275 		/* Wake anyone else waiting for the lease list to be empty  */
2276 		cv_broadcast(&sc->vmm_lease_cv);
2277 	} else {
2278 		list_t *list = &sc->vmm_lease_list;
2279 
2280 		/*
2281 		 * Some other thread beat us to the duty of lease cleanup.
2282 		 * Wait until that is complete.
2283 		 */
2284 		while (!list_is_empty(list)) {
2285 			cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock);
2286 		}
2287 	}
2288 	mutex_exit(&sc->vmm_lease_lock);
2289 }
2290 
2291 static void
2292 vmm_lease_unblock(vmm_softc_t *sc)
2293 {
2294 	mutex_enter(&sc->vmm_lease_lock);
2295 	VERIFY3U(sc->vmm_lease_blocker, !=, 0);
2296 	sc->vmm_lease_blocker--;
2297 	if (sc->vmm_lease_blocker == 0) {
2298 		cv_broadcast(&sc->vmm_lease_cv);
2299 	}
2300 	mutex_exit(&sc->vmm_lease_lock);
2301 }
2302 
2303 void
2304 vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease)
2305 {
2306 	vmm_softc_t *sc = hold->vmh_sc;
2307 
2308 	VERIFY3P(hold, ==, lease->vml_hold);
2309 	VERIFY(!lease->vml_break_deferred);
2310 
2311 	mutex_enter(&sc->vmm_lease_lock);
2312 	if (sc->vmm_lease_blocker == 0) {
2313 		vmm_lease_break_locked(sc, lease);
2314 	} else {
2315 		/*
2316 		 * Defer the lease-breaking to whichever thread is currently
2317 		 * cleaning up all leases as part of a vmm_lease_block() call.
2318 		 */
2319 		lease->vml_break_deferred = B_TRUE;
2320 		cv_broadcast(&sc->vmm_lease_cv);
2321 	}
2322 	mutex_exit(&sc->vmm_lease_lock);
2323 }
2324 
2325 boolean_t
2326 vmm_drv_lease_expired(vmm_lease_t *lease)
2327 {
2328 	return (lease->vml_expired);
2329 }
2330 
2331 vmm_page_t *
2332 vmm_drv_page_hold(vmm_lease_t *lease, uintptr_t gpa, int prot)
2333 {
2334 	ASSERT(lease != NULL);
2335 	ASSERT0(gpa & PAGEOFFSET);
2336 
2337 	return ((vmm_page_t *)vmc_hold(lease->vml_vmclient, gpa, prot));
2338 }
2339 
2340 
2341 /* Ensure that flags mirrored by vmm_drv interface properly match up */
2342 CTASSERT(VMPF_DEFER_DIRTY == VPF_DEFER_DIRTY);
2343 
2344 vmm_page_t *
2345 vmm_drv_page_hold_ext(vmm_lease_t *lease, uintptr_t gpa, int prot, int flags)
2346 {
2347 	ASSERT(lease != NULL);
2348 	ASSERT0(gpa & PAGEOFFSET);
2349 
2350 	vmm_page_t *page =
2351 	    (vmm_page_t *)vmc_hold_ext(lease->vml_vmclient, gpa, prot, flags);
2352 	return (page);
2353 }
2354 
2355 void
2356 vmm_drv_page_release(vmm_page_t *vmmp)
2357 {
2358 	(void) vmp_release((vm_page_t *)vmmp);
2359 }
2360 
2361 void
2362 vmm_drv_page_release_chain(vmm_page_t *vmmp)
2363 {
2364 	(void) vmp_release_chain((vm_page_t *)vmmp);
2365 }
2366 
2367 const void *
2368 vmm_drv_page_readable(const vmm_page_t *vmmp)
2369 {
2370 	return (vmp_get_readable((const vm_page_t *)vmmp));
2371 }
2372 
2373 void *
2374 vmm_drv_page_writable(const vmm_page_t *vmmp)
2375 {
2376 	return (vmp_get_writable((const vm_page_t *)vmmp));
2377 }
2378 
2379 void
2380 vmm_drv_page_mark_dirty(vmm_page_t *vmmp)
2381 {
2382 	return (vmp_mark_dirty((vm_page_t *)vmmp));
2383 }
2384 
2385 void
2386 vmm_drv_page_chain(vmm_page_t *vmmp, vmm_page_t *to_chain)
2387 {
2388 	vmp_chain((vm_page_t *)vmmp, (vm_page_t *)to_chain);
2389 }
2390 
2391 vmm_page_t *
2392 vmm_drv_page_next(const vmm_page_t *vmmp)
2393 {
2394 	return ((vmm_page_t *)vmp_next((vm_page_t *)vmmp));
2395 }
2396 
2397 int
2398 vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg)
2399 {
2400 	ASSERT(lease != NULL);
2401 
2402 	return (lapic_intr_msi(lease->vml_vm, addr, msg));
2403 }
2404 
2405 int
2406 vmm_drv_ioport_hook(vmm_hold_t *hold, uint16_t ioport, vmm_drv_iop_cb_t func,
2407     void *arg, void **cookie)
2408 {
2409 	vmm_softc_t *sc;
2410 	int err;
2411 
2412 	ASSERT(hold != NULL);
2413 	ASSERT(cookie != NULL);
2414 
2415 	sc = hold->vmh_sc;
2416 	mutex_enter(&vmm_mtx);
2417 	/* Confirm that hook installation is not blocked */
2418 	if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) {
2419 		mutex_exit(&vmm_mtx);
2420 		return (EBUSY);
2421 	}
2422 	/*
2423 	 * Optimistically record an installed hook which will prevent a block
2424 	 * from being asserted while the mutex is dropped.
2425 	 */
2426 	hold->vmh_ioport_hook_cnt++;
2427 	mutex_exit(&vmm_mtx);
2428 
2429 	vmm_write_lock(sc);
2430 	err = vm_ioport_hook(sc->vmm_vm, ioport, (ioport_handler_t)func,
2431 	    arg, cookie);
2432 	vmm_write_unlock(sc);
2433 
2434 	if (err != 0) {
2435 		mutex_enter(&vmm_mtx);
2436 		/* Walk back optimism about the hook installation */
2437 		hold->vmh_ioport_hook_cnt--;
2438 		mutex_exit(&vmm_mtx);
2439 	}
2440 	return (err);
2441 }
2442 
2443 void
2444 vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie)
2445 {
2446 	vmm_softc_t *sc;
2447 
2448 	ASSERT(hold != NULL);
2449 	ASSERT(cookie != NULL);
2450 	ASSERT(hold->vmh_ioport_hook_cnt != 0);
2451 
2452 	sc = hold->vmh_sc;
2453 	vmm_write_lock(sc);
2454 	vm_ioport_unhook(sc->vmm_vm, cookie);
2455 	vmm_write_unlock(sc);
2456 
2457 	mutex_enter(&vmm_mtx);
2458 	hold->vmh_ioport_hook_cnt--;
2459 	mutex_exit(&vmm_mtx);
2460 }
2461 
2462 static void
2463 vmm_drv_purge(vmm_softc_t *sc)
2464 {
2465 	ASSERT(MUTEX_HELD(&vmm_mtx));
2466 
2467 	if ((sc->vmm_flags & VMM_HELD) != 0) {
2468 		vmm_hold_t *hold;
2469 
2470 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2471 		    hold = list_next(&sc->vmm_holds, hold)) {
2472 			hold->vmh_release_req = B_TRUE;
2473 		}
2474 
2475 		/*
2476 		 * Require that all leases on the instance be broken, now that
2477 		 * all associated holds have been marked as needing release.
2478 		 *
2479 		 * Dropping vmm_mtx is not strictly necessary, but if any of the
2480 		 * lessees are slow to respond, it would be nice to leave it
2481 		 * available for other parties.
2482 		 */
2483 		mutex_exit(&vmm_mtx);
2484 		vmm_lease_block(sc);
2485 		vmm_lease_unblock(sc);
2486 		mutex_enter(&vmm_mtx);
2487 	}
2488 }
2489 
2490 static int
2491 vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block)
2492 {
2493 	int err = 0;
2494 
2495 	mutex_enter(&vmm_mtx);
2496 	if (!enable_block) {
2497 		VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0);
2498 
2499 		sc->vmm_flags &= ~VMM_BLOCK_HOOK;
2500 		goto done;
2501 	}
2502 
2503 	/* If any holds have hooks installed, the block is a failure */
2504 	if (!list_is_empty(&sc->vmm_holds)) {
2505 		vmm_hold_t *hold;
2506 
2507 		for (hold = list_head(&sc->vmm_holds); hold != NULL;
2508 		    hold = list_next(&sc->vmm_holds, hold)) {
2509 			if (hold->vmh_ioport_hook_cnt != 0) {
2510 				err = EBUSY;
2511 				goto done;
2512 			}
2513 		}
2514 	}
2515 	sc->vmm_flags |= VMM_BLOCK_HOOK;
2516 
2517 done:
2518 	mutex_exit(&vmm_mtx);
2519 	return (err);
2520 }
2521 
2522 
2523 static void
2524 vmm_destroy_begin(vmm_softc_t *sc, vmm_destroy_opts_t opts)
2525 {
2526 	ASSERT(MUTEX_HELD(&vmm_mtx));
2527 	ASSERT0(sc->vmm_flags & VMM_DESTROY);
2528 
2529 	sc->vmm_flags |= VMM_DESTROY;
2530 
2531 	/*
2532 	 * Lock and unlock all of the vCPUs to ensure that they are kicked out
2533 	 * of guest context, being unable to return now that the instance is
2534 	 * marked for destruction.
2535 	 */
2536 	const int maxcpus = vm_get_maxcpus(sc->vmm_vm);
2537 	for (int vcpu = 0; vcpu < maxcpus; vcpu++) {
2538 		vcpu_lock_one(sc, vcpu);
2539 		vcpu_unlock_one(sc, vcpu);
2540 	}
2541 
2542 	vmmdev_devmem_purge(sc);
2543 	if ((opts & VDO_NO_CLEAN_ZSD) == 0) {
2544 		/*
2545 		 * The ZSD should be cleaned up now, unless destruction of the
2546 		 * instance was initated by destruction of the containing zone,
2547 		 * in which case the ZSD has already been removed.
2548 		 */
2549 		vmm_zsd_rem_vm(sc);
2550 	}
2551 	zone_rele(sc->vmm_zone);
2552 
2553 	vmm_drv_purge(sc);
2554 }
2555 
2556 static bool
2557 vmm_destroy_ready(vmm_softc_t *sc)
2558 {
2559 	ASSERT(MUTEX_HELD(&vmm_mtx));
2560 
2561 	if ((sc->vmm_flags & (VMM_HELD | VMM_IS_OPEN)) == 0) {
2562 		VERIFY(list_is_empty(&sc->vmm_holds));
2563 		return (true);
2564 	}
2565 
2566 	return (false);
2567 }
2568 
2569 static void
2570 vmm_destroy_finish(vmm_softc_t *sc)
2571 {
2572 	ASSERT(MUTEX_HELD(&vmm_mtx));
2573 	ASSERT(vmm_destroy_ready(sc));
2574 
2575 	list_remove(&vmm_list, sc);
2576 	vmm_kstat_fini(sc);
2577 	vm_destroy(sc->vmm_vm);
2578 	ddi_remove_minor_node(vmmdev_dip, sc->vmm_name);
2579 	(void) devfs_clean(ddi_get_parent(vmmdev_dip), NULL, DV_CLEAN_FORCE);
2580 
2581 	const minor_t minor = sc->vmm_minor;
2582 	ddi_soft_state_free(vmm_statep, minor);
2583 	id_free(vmm_minors, minor);
2584 }
2585 
2586 /*
2587  * Initiate or attempt to finish destruction of a VMM instance.
2588  *
2589  * This is called from several contexts:
2590  * - An explicit destroy ioctl is made
2591  * - A vmm_drv consumer releases its hold (being the last on the instance)
2592  * - The vmm device is closed, and auto-destruct is enabled
2593  */
2594 static int
2595 vmm_destroy_locked(vmm_softc_t *sc, vmm_destroy_opts_t opts,
2596     bool *hma_release)
2597 {
2598 	ASSERT(MUTEX_HELD(&vmm_mtx));
2599 
2600 	*hma_release = false;
2601 
2602 	/*
2603 	 * When instance destruction begins, it is so marked such that any
2604 	 * further requests to operate the instance will fail.
2605 	 */
2606 	if ((sc->vmm_flags & VMM_DESTROY) == 0) {
2607 		vmm_destroy_begin(sc, opts);
2608 	}
2609 
2610 	if (vmm_destroy_ready(sc)) {
2611 
2612 		/*
2613 		 * Notify anyone waiting for the destruction to finish.  They
2614 		 * must be clear before we can safely tear down the softc.
2615 		 */
2616 		if (sc->vmm_destroy_waiters != 0) {
2617 			cv_broadcast(&sc->vmm_cv);
2618 			while (sc->vmm_destroy_waiters != 0) {
2619 				cv_wait(&sc->vmm_cv, &vmm_mtx);
2620 			}
2621 		}
2622 
2623 		/*
2624 		 * Finish destruction of instance.  After this point, the softc
2625 		 * is freed and cannot be accessed again.
2626 		 *
2627 		 * With destruction complete, the HMA hold can be released
2628 		 */
2629 		vmm_destroy_finish(sc);
2630 		*hma_release = true;
2631 		return (0);
2632 	} else if ((opts & VDO_ATTEMPT_WAIT) != 0) {
2633 		int err = 0;
2634 
2635 		sc->vmm_destroy_waiters++;
2636 		while (!vmm_destroy_ready(sc) && err == 0) {
2637 			if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) {
2638 				err = EINTR;
2639 			}
2640 		}
2641 		sc->vmm_destroy_waiters--;
2642 
2643 		if (sc->vmm_destroy_waiters == 0) {
2644 			/*
2645 			 * If we were the last waiter, it could be that VM
2646 			 * destruction is waiting on _us_ to proceed with the
2647 			 * final clean-up.
2648 			 */
2649 			cv_signal(&sc->vmm_cv);
2650 		}
2651 		return (err);
2652 	} else {
2653 		/*
2654 		 * Since the instance is not ready for destruction, and the
2655 		 * caller did not ask to wait, consider it a success for now.
2656 		 */
2657 		return (0);
2658 	}
2659 }
2660 
2661 void
2662 vmm_zone_vm_destroy(vmm_softc_t *sc)
2663 {
2664 	bool hma_release = false;
2665 	int err;
2666 
2667 	mutex_enter(&vmm_mtx);
2668 	err = vmm_destroy_locked(sc, VDO_NO_CLEAN_ZSD, &hma_release);
2669 	mutex_exit(&vmm_mtx);
2670 
2671 	VERIFY0(err);
2672 
2673 	if (hma_release) {
2674 		vmm_hma_release();
2675 	}
2676 }
2677 
2678 static int
2679 vmmdev_do_vm_destroy(const struct vm_destroy_req *req, cred_t *cr)
2680 {
2681 	vmm_softc_t *sc;
2682 	bool hma_release = false;
2683 	int err;
2684 
2685 	if (crgetuid(cr) != 0) {
2686 		return (EPERM);
2687 	}
2688 
2689 	mutex_enter(&vmm_mtx);
2690 	sc = vmm_lookup(req->name);
2691 	if (sc == NULL) {
2692 		mutex_exit(&vmm_mtx);
2693 		return (ENOENT);
2694 	}
2695 	/*
2696 	 * We don't check this in vmm_lookup() since that function is also used
2697 	 * for validation during create and currently vmm names must be unique.
2698 	 */
2699 	if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) {
2700 		mutex_exit(&vmm_mtx);
2701 		return (EPERM);
2702 	}
2703 
2704 	err = vmm_destroy_locked(sc, VDO_ATTEMPT_WAIT, &hma_release);
2705 	mutex_exit(&vmm_mtx);
2706 
2707 	if (hma_release) {
2708 		vmm_hma_release();
2709 	}
2710 
2711 	return (err);
2712 }
2713 
2714 #define	VCPU_NAME_BUFLEN	32
2715 
2716 static int
2717 vmm_kstat_alloc(vmm_softc_t *sc, minor_t minor, const cred_t *cr)
2718 {
2719 	zoneid_t zid = crgetzoneid(cr);
2720 	int instance = minor;
2721 	kstat_t *ksp;
2722 
2723 	ASSERT3P(sc->vmm_kstat_vm, ==, NULL);
2724 
2725 	ksp = kstat_create_zone(VMM_MODULE_NAME, instance, "vm",
2726 	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2727 	    sizeof (vmm_kstats_t) / sizeof (kstat_named_t), 0, zid);
2728 
2729 	if (ksp == NULL) {
2730 		return (-1);
2731 	}
2732 	sc->vmm_kstat_vm = ksp;
2733 
2734 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2735 		char namebuf[VCPU_NAME_BUFLEN];
2736 
2737 		ASSERT3P(sc->vmm_kstat_vcpu[i], ==, NULL);
2738 
2739 		(void) snprintf(namebuf, VCPU_NAME_BUFLEN, "vcpu%u", i);
2740 		ksp = kstat_create_zone(VMM_MODULE_NAME, instance, namebuf,
2741 		    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
2742 		    sizeof (vmm_vcpu_kstats_t) / sizeof (kstat_named_t),
2743 		    0, zid);
2744 		if (ksp == NULL) {
2745 			goto fail;
2746 		}
2747 
2748 		sc->vmm_kstat_vcpu[i] = ksp;
2749 	}
2750 
2751 	/*
2752 	 * If this instance is associated with a non-global zone, make its
2753 	 * kstats visible from the GZ.
2754 	 */
2755 	if (zid != GLOBAL_ZONEID) {
2756 		kstat_zone_add(sc->vmm_kstat_vm, GLOBAL_ZONEID);
2757 		for (uint_t i = 0; i < VM_MAXCPU; i++) {
2758 			kstat_zone_add(sc->vmm_kstat_vcpu[i], GLOBAL_ZONEID);
2759 		}
2760 	}
2761 
2762 	return (0);
2763 
2764 fail:
2765 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2766 		if (sc->vmm_kstat_vcpu[i] != NULL) {
2767 			kstat_delete(sc->vmm_kstat_vcpu[i]);
2768 			sc->vmm_kstat_vcpu[i] = NULL;
2769 		} else {
2770 			break;
2771 		}
2772 	}
2773 	kstat_delete(sc->vmm_kstat_vm);
2774 	sc->vmm_kstat_vm = NULL;
2775 	return (-1);
2776 }
2777 
2778 static void
2779 vmm_kstat_init(vmm_softc_t *sc)
2780 {
2781 	kstat_t *ksp;
2782 
2783 	ASSERT3P(sc->vmm_vm, !=, NULL);
2784 	ASSERT3P(sc->vmm_kstat_vm, !=, NULL);
2785 
2786 	ksp = sc->vmm_kstat_vm;
2787 	vmm_kstats_t *vk = ksp->ks_data;
2788 	ksp->ks_private = sc->vmm_vm;
2789 	kstat_named_init(&vk->vk_name, "vm_name", KSTAT_DATA_STRING);
2790 	kstat_named_setstr(&vk->vk_name, sc->vmm_name);
2791 
2792 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2793 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2794 
2795 		ksp = sc->vmm_kstat_vcpu[i];
2796 		vmm_vcpu_kstats_t *vvk = ksp->ks_data;
2797 
2798 		kstat_named_init(&vvk->vvk_vcpu, "vcpu", KSTAT_DATA_UINT32);
2799 		vvk->vvk_vcpu.value.ui32 = i;
2800 		kstat_named_init(&vvk->vvk_time_init, "time_init",
2801 		    KSTAT_DATA_UINT64);
2802 		kstat_named_init(&vvk->vvk_time_run, "time_run",
2803 		    KSTAT_DATA_UINT64);
2804 		kstat_named_init(&vvk->vvk_time_idle, "time_idle",
2805 		    KSTAT_DATA_UINT64);
2806 		kstat_named_init(&vvk->vvk_time_emu_kern, "time_emu_kern",
2807 		    KSTAT_DATA_UINT64);
2808 		kstat_named_init(&vvk->vvk_time_emu_user, "time_emu_user",
2809 		    KSTAT_DATA_UINT64);
2810 		kstat_named_init(&vvk->vvk_time_sched, "time_sched",
2811 		    KSTAT_DATA_UINT64);
2812 		ksp->ks_private = sc->vmm_vm;
2813 		ksp->ks_update = vmm_kstat_update_vcpu;
2814 	}
2815 
2816 	kstat_install(sc->vmm_kstat_vm);
2817 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2818 		kstat_install(sc->vmm_kstat_vcpu[i]);
2819 	}
2820 }
2821 
2822 static void
2823 vmm_kstat_fini(vmm_softc_t *sc)
2824 {
2825 	ASSERT(sc->vmm_kstat_vm != NULL);
2826 
2827 	kstat_delete(sc->vmm_kstat_vm);
2828 	sc->vmm_kstat_vm = NULL;
2829 
2830 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
2831 		ASSERT3P(sc->vmm_kstat_vcpu[i], !=, NULL);
2832 
2833 		kstat_delete(sc->vmm_kstat_vcpu[i]);
2834 		sc->vmm_kstat_vcpu[i] = NULL;
2835 	}
2836 }
2837 
2838 static int
2839 vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2840 {
2841 	minor_t		minor;
2842 	vmm_softc_t	*sc;
2843 
2844 	/*
2845 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2846 	 * verified to be safe.
2847 	 */
2848 	if (curproc->p_model != DATAMODEL_LP64) {
2849 		return (EFBIG);
2850 	}
2851 
2852 	minor = getminor(*devp);
2853 	if (minor == VMM_CTL_MINOR) {
2854 		/*
2855 		 * Master control device must be opened exclusively.
2856 		 */
2857 		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
2858 			return (EINVAL);
2859 		}
2860 
2861 		return (0);
2862 	}
2863 
2864 	mutex_enter(&vmm_mtx);
2865 	sc = ddi_get_soft_state(vmm_statep, minor);
2866 	if (sc == NULL) {
2867 		mutex_exit(&vmm_mtx);
2868 		return (ENXIO);
2869 	}
2870 
2871 	sc->vmm_flags |= VMM_IS_OPEN;
2872 	mutex_exit(&vmm_mtx);
2873 
2874 	return (0);
2875 }
2876 
2877 static int
2878 vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
2879 {
2880 	const minor_t minor = getminor(dev);
2881 	vmm_softc_t *sc;
2882 	bool hma_release = false;
2883 
2884 	if (minor == VMM_CTL_MINOR) {
2885 		return (0);
2886 	}
2887 
2888 	mutex_enter(&vmm_mtx);
2889 	sc = ddi_get_soft_state(vmm_statep, minor);
2890 	if (sc == NULL) {
2891 		mutex_exit(&vmm_mtx);
2892 		return (ENXIO);
2893 	}
2894 
2895 	VERIFY3U(sc->vmm_flags & VMM_IS_OPEN, !=, 0);
2896 	sc->vmm_flags &= ~VMM_IS_OPEN;
2897 
2898 	/*
2899 	 * If instance was marked for auto-destruction begin that now.  Instance
2900 	 * destruction may have been initated already, so try to make progress
2901 	 * in that case, since closure of the device is one of its requirements.
2902 	 */
2903 	if ((sc->vmm_flags & VMM_DESTROY) != 0 ||
2904 	    (sc->vmm_flags & VMM_AUTODESTROY) != 0) {
2905 		VERIFY0(vmm_destroy_locked(sc, VDO_DEFAULT, &hma_release));
2906 	}
2907 	mutex_exit(&vmm_mtx);
2908 
2909 	if (hma_release) {
2910 		vmm_hma_release();
2911 	}
2912 
2913 	return (0);
2914 }
2915 
2916 static int
2917 vmm_is_supported(intptr_t arg)
2918 {
2919 	int r;
2920 	const char *msg;
2921 
2922 	if (vmm_is_intel()) {
2923 		r = vmx_x86_supported(&msg);
2924 	} else if (vmm_is_svm()) {
2925 		/*
2926 		 * HMA already ensured that the features necessary for SVM
2927 		 * operation were present and online during vmm_attach().
2928 		 */
2929 		r = 0;
2930 	} else {
2931 		r = ENXIO;
2932 		msg = "Unsupported CPU vendor";
2933 	}
2934 
2935 	if (r != 0 && arg != (intptr_t)NULL) {
2936 		if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0)
2937 			return (EFAULT);
2938 	}
2939 	return (r);
2940 }
2941 
2942 static int
2943 vmm_ctl_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
2944 {
2945 	void *argp = (void *)arg;
2946 
2947 	switch (cmd) {
2948 	case VMM_CREATE_VM: {
2949 		struct vm_create_req req;
2950 
2951 		if ((md & FWRITE) == 0) {
2952 			return (EPERM);
2953 		}
2954 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2955 			return (EFAULT);
2956 		}
2957 		return (vmmdev_do_vm_create(&req, cr));
2958 	}
2959 	case VMM_DESTROY_VM: {
2960 		struct vm_destroy_req req;
2961 
2962 		if ((md & FWRITE) == 0) {
2963 			return (EPERM);
2964 		}
2965 		if (ddi_copyin(argp, &req, sizeof (req), md) != 0) {
2966 			return (EFAULT);
2967 		}
2968 		return (vmmdev_do_vm_destroy(&req, cr));
2969 	}
2970 	case VMM_VM_SUPPORTED:
2971 		return (vmm_is_supported(arg));
2972 	case VMM_CHECK_IOMMU:
2973 		if (!vmm_check_iommu()) {
2974 			return (ENXIO);
2975 		}
2976 		return (0);
2977 	case VMM_RESV_QUERY:
2978 	case VMM_RESV_SET_TARGET:
2979 		return (vmmr_ioctl(cmd, arg, md, cr, rvalp));
2980 	default:
2981 		break;
2982 	}
2983 	/* No other actions are legal on ctl device */
2984 	return (ENOTTY);
2985 }
2986 
2987 static int
2988 vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2989     int *rvalp)
2990 {
2991 	vmm_softc_t	*sc;
2992 	minor_t		minor;
2993 
2994 	/*
2995 	 * Forbid running bhyve in a 32-bit process until it has been tested and
2996 	 * verified to be safe.
2997 	 */
2998 	if (curproc->p_model != DATAMODEL_LP64) {
2999 		return (EFBIG);
3000 	}
3001 
3002 	/* The structs in bhyve ioctls assume a 64-bit datamodel */
3003 	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
3004 		return (ENOTSUP);
3005 	}
3006 
3007 	/*
3008 	 * Regardless of minor (vmmctl or instance), we respond to queries of
3009 	 * the interface version.
3010 	 */
3011 	if (cmd == VMM_INTERFACE_VERSION) {
3012 		*rvalp = VMM_CURRENT_INTERFACE_VERSION;
3013 		return (0);
3014 	}
3015 
3016 	minor = getminor(dev);
3017 
3018 	if (minor == VMM_CTL_MINOR) {
3019 		return (vmm_ctl_ioctl(cmd, arg, mode, credp, rvalp));
3020 	}
3021 
3022 	sc = ddi_get_soft_state(vmm_statep, minor);
3023 	ASSERT(sc != NULL);
3024 
3025 	/*
3026 	 * Turn away any ioctls against an instance when it is being destroyed.
3027 	 * (Except for the ioctl inquiring about that destroy-in-progress.)
3028 	 */
3029 	if ((sc->vmm_flags & VMM_DESTROY) != 0) {
3030 		if (cmd == VM_DESTROY_PENDING) {
3031 			*rvalp = 1;
3032 			return (0);
3033 		}
3034 		return (ENXIO);
3035 	}
3036 
3037 	return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
3038 }
3039 
3040 static int
3041 vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
3042     unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp)
3043 {
3044 	vmm_softc_t *sc;
3045 	const minor_t minor = getminor(dev);
3046 	int err;
3047 
3048 	if (minor == VMM_CTL_MINOR) {
3049 		return (ENODEV);
3050 	}
3051 	if (off < 0 || (off + len) <= 0) {
3052 		return (EINVAL);
3053 	}
3054 	if ((prot & PROT_USER) == 0) {
3055 		return (EACCES);
3056 	}
3057 
3058 	sc = ddi_get_soft_state(vmm_statep, minor);
3059 	ASSERT(sc);
3060 
3061 	if (sc->vmm_flags & VMM_DESTROY)
3062 		return (ENXIO);
3063 
3064 	/* Grab read lock on the VM to prevent any changes to the memory map */
3065 	vmm_read_lock(sc);
3066 
3067 	if (off >= VM_DEVMEM_START) {
3068 		int segid;
3069 		off_t segoff;
3070 
3071 		/* Mapping a devmem "device" */
3072 		if (!vmmdev_devmem_segid(sc, off, len, &segid, &segoff)) {
3073 			err = ENODEV;
3074 		} else {
3075 			err = vm_segmap_obj(sc->vmm_vm, segid, segoff, len, as,
3076 			    addrp, prot, maxprot, flags);
3077 		}
3078 	} else {
3079 		/* Mapping a part of the guest physical space */
3080 		err = vm_segmap_space(sc->vmm_vm, off, as, addrp, len, prot,
3081 		    maxprot, flags);
3082 	}
3083 
3084 	vmm_read_unlock(sc);
3085 	return (err);
3086 }
3087 
3088 static sdev_plugin_validate_t
3089 vmm_sdev_validate(sdev_ctx_t ctx)
3090 {
3091 	const char *name = sdev_ctx_name(ctx);
3092 	vmm_softc_t *sc;
3093 	sdev_plugin_validate_t ret;
3094 	minor_t minor;
3095 
3096 	if (sdev_ctx_vtype(ctx) != VCHR)
3097 		return (SDEV_VTOR_INVALID);
3098 
3099 	VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0);
3100 
3101 	mutex_enter(&vmm_mtx);
3102 	if ((sc = vmm_lookup(name)) == NULL)
3103 		ret = SDEV_VTOR_INVALID;
3104 	else if (sc->vmm_minor != minor)
3105 		ret = SDEV_VTOR_STALE;
3106 	else
3107 		ret = SDEV_VTOR_VALID;
3108 	mutex_exit(&vmm_mtx);
3109 
3110 	return (ret);
3111 }
3112 
3113 static int
3114 vmm_sdev_filldir(sdev_ctx_t ctx)
3115 {
3116 	vmm_softc_t *sc;
3117 	int ret;
3118 
3119 	if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) {
3120 		cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__,
3121 		    sdev_ctx_path(ctx), VMM_SDEV_ROOT);
3122 		return (EINVAL);
3123 	}
3124 
3125 	mutex_enter(&vmm_mtx);
3126 	ASSERT(vmmdev_dip != NULL);
3127 	for (sc = list_head(&vmm_list); sc != NULL;
3128 	    sc = list_next(&vmm_list, sc)) {
3129 		if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) {
3130 			ret = sdev_plugin_mknod(ctx, sc->vmm_name,
3131 			    S_IFCHR | 0600,
3132 			    makedevice(ddi_driver_major(vmmdev_dip),
3133 			    sc->vmm_minor));
3134 		} else {
3135 			continue;
3136 		}
3137 		if (ret != 0 && ret != EEXIST)
3138 			goto out;
3139 	}
3140 
3141 	ret = 0;
3142 
3143 out:
3144 	mutex_exit(&vmm_mtx);
3145 	return (ret);
3146 }
3147 
3148 /* ARGSUSED */
3149 static void
3150 vmm_sdev_inactive(sdev_ctx_t ctx)
3151 {
3152 }
3153 
3154 static sdev_plugin_ops_t vmm_sdev_ops = {
3155 	.spo_version = SDEV_PLUGIN_VERSION,
3156 	.spo_flags = SDEV_PLUGIN_SUBDIR,
3157 	.spo_validate = vmm_sdev_validate,
3158 	.spo_filldir = vmm_sdev_filldir,
3159 	.spo_inactive = vmm_sdev_inactive
3160 };
3161 
3162 /* ARGSUSED */
3163 static int
3164 vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
3165 {
3166 	int error;
3167 
3168 	switch (cmd) {
3169 	case DDI_INFO_DEVT2DEVINFO:
3170 		*result = (void *)vmmdev_dip;
3171 		error = DDI_SUCCESS;
3172 		break;
3173 	case DDI_INFO_DEVT2INSTANCE:
3174 		*result = (void *)0;
3175 		error = DDI_SUCCESS;
3176 		break;
3177 	default:
3178 		error = DDI_FAILURE;
3179 		break;
3180 	}
3181 	return (error);
3182 }
3183 
3184 static int
3185 vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3186 {
3187 	sdev_plugin_hdl_t sph;
3188 	hma_reg_t *reg = NULL;
3189 	boolean_t vmm_loaded = B_FALSE;
3190 
3191 	if (cmd != DDI_ATTACH) {
3192 		return (DDI_FAILURE);
3193 	}
3194 
3195 	mutex_enter(&vmmdev_mtx);
3196 	/* Ensure we are not already attached. */
3197 	if (vmmdev_dip != NULL) {
3198 		mutex_exit(&vmmdev_mtx);
3199 		return (DDI_FAILURE);
3200 	}
3201 
3202 	vmm_sol_glue_init();
3203 
3204 	/*
3205 	 * Perform temporary HMA registration to determine if the system
3206 	 * is capable.
3207 	 */
3208 	if ((reg = hma_register(vmmdev_hvm_name)) == NULL) {
3209 		goto fail;
3210 	} else if (vmm_mod_load() != 0) {
3211 		goto fail;
3212 	}
3213 	vmm_loaded = B_TRUE;
3214 	hma_unregister(reg);
3215 	reg = NULL;
3216 
3217 	/* Create control node.  Other nodes will be created on demand. */
3218 	if (ddi_create_minor_node(dip, "ctl", S_IFCHR,
3219 	    VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
3220 		goto fail;
3221 	}
3222 
3223 	sph = sdev_plugin_register(VMM_MODULE_NAME, &vmm_sdev_ops, NULL);
3224 	if (sph == (sdev_plugin_hdl_t)NULL) {
3225 		ddi_remove_minor_node(dip, NULL);
3226 		goto fail;
3227 	}
3228 
3229 	ddi_report_dev(dip);
3230 	vmmdev_sdev_hdl = sph;
3231 	vmmdev_dip = dip;
3232 	mutex_exit(&vmmdev_mtx);
3233 	return (DDI_SUCCESS);
3234 
3235 fail:
3236 	if (vmm_loaded) {
3237 		VERIFY0(vmm_mod_unload());
3238 	}
3239 	if (reg != NULL) {
3240 		hma_unregister(reg);
3241 	}
3242 	vmm_sol_glue_cleanup();
3243 	mutex_exit(&vmmdev_mtx);
3244 	return (DDI_FAILURE);
3245 }
3246 
3247 static int
3248 vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3249 {
3250 	if (cmd != DDI_DETACH) {
3251 		return (DDI_FAILURE);
3252 	}
3253 
3254 	/*
3255 	 * Ensure that all resources have been cleaned up.
3256 	 *
3257 	 * To prevent a deadlock with iommu_cleanup() we'll fail the detach if
3258 	 * vmmdev_mtx is already held. We can't wait for vmmdev_mtx with our
3259 	 * devinfo locked as iommu_cleanup() tries to recursively lock each
3260 	 * devinfo, including our own, while holding vmmdev_mtx.
3261 	 */
3262 	if (mutex_tryenter(&vmmdev_mtx) == 0)
3263 		return (DDI_FAILURE);
3264 
3265 	mutex_enter(&vmm_mtx);
3266 	if (!list_is_empty(&vmm_list)) {
3267 		mutex_exit(&vmm_mtx);
3268 		mutex_exit(&vmmdev_mtx);
3269 		return (DDI_FAILURE);
3270 	}
3271 	mutex_exit(&vmm_mtx);
3272 
3273 	if (!vmmr_is_empty()) {
3274 		mutex_exit(&vmmdev_mtx);
3275 		return (DDI_FAILURE);
3276 	}
3277 
3278 	VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL);
3279 	if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) {
3280 		mutex_exit(&vmmdev_mtx);
3281 		return (DDI_FAILURE);
3282 	}
3283 	vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL;
3284 
3285 	/* Remove the control node. */
3286 	ddi_remove_minor_node(dip, "ctl");
3287 	vmmdev_dip = NULL;
3288 
3289 	VERIFY0(vmm_mod_unload());
3290 	VERIFY3U(vmmdev_hma_reg, ==, NULL);
3291 	vmm_sol_glue_cleanup();
3292 
3293 	mutex_exit(&vmmdev_mtx);
3294 
3295 	return (DDI_SUCCESS);
3296 }
3297 
3298 static struct cb_ops vmm_cb_ops = {
3299 	vmm_open,
3300 	vmm_close,
3301 	nodev,		/* strategy */
3302 	nodev,		/* print */
3303 	nodev,		/* dump */
3304 	nodev,		/* read */
3305 	nodev,		/* write */
3306 	vmm_ioctl,
3307 	nodev,		/* devmap */
3308 	nodev,		/* mmap */
3309 	vmm_segmap,
3310 	nochpoll,	/* poll */
3311 	ddi_prop_op,
3312 	NULL,
3313 	D_NEW | D_MP | D_DEVMAP
3314 };
3315 
3316 static struct dev_ops vmm_ops = {
3317 	DEVO_REV,
3318 	0,
3319 	vmm_info,
3320 	nulldev,	/* identify */
3321 	nulldev,	/* probe */
3322 	vmm_attach,
3323 	vmm_detach,
3324 	nodev,		/* reset */
3325 	&vmm_cb_ops,
3326 	(struct bus_ops *)NULL
3327 };
3328 
3329 static struct modldrv modldrv = {
3330 	&mod_driverops,
3331 	"bhyve vmm",
3332 	&vmm_ops
3333 };
3334 
3335 static struct modlinkage modlinkage = {
3336 	MODREV_1,
3337 	&modldrv,
3338 	NULL
3339 };
3340 
3341 int
3342 _init(void)
3343 {
3344 	int	error;
3345 
3346 	sysinit();
3347 
3348 	mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
3349 	mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL);
3350 	list_create(&vmm_list, sizeof (vmm_softc_t),
3351 	    offsetof(vmm_softc_t, vmm_node));
3352 	vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32);
3353 
3354 	error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0);
3355 	if (error) {
3356 		return (error);
3357 	}
3358 
3359 	error = vmmr_init();
3360 	if (error) {
3361 		ddi_soft_state_fini(&vmm_statep);
3362 		return (error);
3363 	}
3364 
3365 	vmm_zsd_init();
3366 
3367 	error = mod_install(&modlinkage);
3368 	if (error) {
3369 		ddi_soft_state_fini(&vmm_statep);
3370 		vmm_zsd_fini();
3371 		vmmr_fini();
3372 	}
3373 
3374 	return (error);
3375 }
3376 
3377 int
3378 _fini(void)
3379 {
3380 	int	error;
3381 
3382 	error = mod_remove(&modlinkage);
3383 	if (error) {
3384 		return (error);
3385 	}
3386 
3387 	vmm_zsd_fini();
3388 	vmmr_fini();
3389 
3390 	ddi_soft_state_fini(&vmm_statep);
3391 
3392 	return (0);
3393 }
3394 
3395 int
3396 _info(struct modinfo *modinfop)
3397 {
3398 	return (mod_info(&modlinkage, modinfop));
3399 }
3400